# setup
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
from collections import Counter
from matplotlib.ticker import MaxNLocator
from patsy import dmatrices
from sklearn import linear_model
from statistics import mode
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.outliers_influence import variance_inflation_factor
# Load only the two amount columns from the cleaned transactions file.
col_list = ["Credits", "Debit"]
df = pd.read_csv(
    "FCM_642_Raw_Data_10_000_Transactions_clean.csv",
    usecols=col_list,
)
print(df.head(10))

# Replace missing amounts with 0 in both columns at once.
df[col_list] = df[col_list].fillna(0)

# Summary statistics (count, mean, std, quartiles) per column.
# NOTE: the zero-filled entries are included in these statistics.
print(df.describe())
Credits Debit
0 75100.0 NaN
1 NaN 127106.0
2 72300.0 NaN
3 121100.0 NaN
4 339300.0 NaN
5 96500.0 NaN
6 23900.0 NaN
7 NaN 181594.0
8 44100.0 NaN
9 49500.0 NaN
Credits Debit
count 10000.000000 10000.000000
mean 40581.300000 64676.300660
std 74308.512526 84180.214463
min 0.000000 0.000000
25% 0.000000 0.000000
50% 0.000000 38007.190000
75% 61100.000000 98686.000000
max 389300.000000 438360.000000
# Box plot of the Credits and Debit columns on a 15x8 inch figure.
plt.figure(figsize=(15, 8))
# Draw onto the axes of the figure that was just created.
df.boxplot(ax=plt.gca())
<AxesSubplot:>
# Reshape to long form: one row per value, labelled by its source column,
# then keep only strictly positive amounts (this drops the zero entries).
df2 = pd.melt(df, var_name="groups", value_name="vals")
df2 = df2.loc[df2["vals"].gt(0)]
print(df2)

# Violin plot of the remaining values, one violin per source column.
plt.figure(figsize=(15, 8))
ax = sns.violinplot(data=df2, x="groups", y="vals")
        groups      vals
0      Credits   75100.0
2      Credits   72300.0
3      Credits  121100.0
4      Credits  339300.0
5      Credits   96500.0
...        ...       ...
19816    Debit  117110.0
19817    Debit   26166.0
19818    Debit   38040.0
19819    Debit   90062.0
19820    Debit   78302.0

[10000 rows x 2 columns]