# setup
import math
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf
import statsmodels.tools as tools
from collections import Counter
from matplotlib.ticker import MaxNLocator
from patsy import dmatrices
from sklearn.cluster import KMeans
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics
from sklearn.metrics import r2_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
from sklearn.svm import SVR
from statistics import mode
from statsmodels.multivariate.manova import MANOVA
from statsmodels.stats.outliers_influence import variance_inflation_factor
pd.options.mode.chained_assignment = None
# read data for Reed and religiosity datasets at state level
reed_col_list = ["stateId","stateName","statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022"
,"transAdultPop2016","transAdultPercent2016","transAdultPop2022","transAdultPercent2022"
,"religionImportantPew2014","worshipWeeklyPew2014","prayDailyPew2014","certainAboutGodPew2014"
,"overallReligiosityPew2014","veryReligiousStatista2017","moderatelyReligiousStatista2017"
,"nonreligiousStatista2017","relLibScore2022","relLibVote2022","relLibVax2022"
,"relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022"
]
reed_index_list = ["stateId","stateName","antiTransLegislationRiskIndex32023"
,"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022"
]
trans_pop_list = ["stateId","stateName","statePopulation2020","statePopulation2023","transAdultPop2016"
,"transAdultPercent2016","transAdultPop2022","transAdultPercent2022"
]
religiosity_2014_list = ["stateId","stateName","religionImportantPew2014","worshipWeeklyPew2014"
,"prayDailyPew2014","certainAboutGodPew2014","overallReligiosityPew2014"
]
religiosity_2017_list = ["stateId","stateName","veryReligiousStatista2017","moderatelyReligiousStatista2017"
,"nonreligiousStatista2017"
]
religiosity_2022_list = ["stateId","stateName","relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022"
,"relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022"
]
reedFulldf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=reed_col_list)
reedFulldf = reedFulldf[reedFulldf["stateId"] != 11]
reedIndexdf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=reed_index_list)
reedIndexdf = reedIndexdf[reedIndexdf["stateId"] != 11]
transStatePopdf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=trans_pop_list)
transStatePopdf = transStatePopdf[transStatePopdf["stateId"] != 11]
religious2014df = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=religiosity_2014_list)
religious2014df = religious2014df[religious2014df["stateId"] != 11]
religious2017df = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=religiosity_2017_list)
religious2017df = religious2017df[religious2017df["stateId"] != 11]
religious2022df = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\stateDatasetReed.csv", usecols=religiosity_2022_list)
religious2022df = religious2022df[religious2022df["stateId"] != 11]
#print(reedFulldf.head())
#create function to get count of unique values in column and get percentages
def countCol(df, dfCol):
tempdf = df[dfCol]
namecount = dfCol + "count"
namepercent = dfCol + "percent"
tempdf[namecount] = df[dfCol].value_counts()
tempdf[namepercent] = df[dfCol].value_counts(normalize=True)*100
return tempdf
# use describe to get mean and standard deviations of dataframe data
def describeDF(df, dfCol):
print(df.describe())
# get mode and variance using built in stats library
for colName in dfCol:
if(df[colName].dtypes != object):
print("Mode of ",colName,": ", mode(df[colName]))
print("Variance of ",colName,": ", np.var(df[colName], ddof=1))
print()
def combinedf(df, dfCol, dfName):
retdf = pd.DataFrame()
for colName in dfCol:
namecount = colName + "count"
namepercent = colName + "percent"
dfNameColCount = dfName + namecount
dfNameColPercent = dfName + namepercent
tempdf = countCol(df,colName)
retdf[dfNameColCount] = tempdf[namecount]
retdf[dfNameColPercent] = tempdf[namepercent]
return retdf
#function to allow grouping gender identity on 3 values
def basicGenMarker(asab, gender):
if(asab == 1 and gender == 1):
return "Cisgender Man"
elif (asab == 2 and gender == 2):
return "Cisgender Woman"
else:
return "Transgender"
#function to print covariance map based on given columns
def printCovariance(df,dfCol,colLabels,title):
cols = dfCol
stdsc = StandardScaler()
X_std = stdsc.fit_transform(df[cols].iloc[:,range(0,len(dfCol))].values)
cov_mat = np.cov(X_std.T)
plt.figure(figsize=(7,7))
sns.set(font_scale=1)
hm = sns.heatmap(cov_mat, cbar = True, annot = True, square = True, fmt = ".2f", cmap = "vlag",
annot_kws={"size":12}, yticklabels = colLabels, xticklabels = colLabels, cbar_kws={"shrink": 0.5})
#plt.title(title)
plt.tight_layout()
plt.show()
#function to build classifier matrix after testing NB model
# Build Naive Bayes Classifer to sort and classify data
# split datasets into training and test sets
def printNBClassifierOutcome(X,y,trainSize,trainState,matrixTitle):
size = trainSize
state = trainState
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-size, random_state=state)
# scale input data for training if necessary for better predictions
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)
# initiale, train and test the BNB
bnb = BernoulliNB()
bnb.fit(X_train, y_train)
pred = bnb.predict(X_test)
#check accuracy
bnb_accuracy = metrics.accuracy_score(pred, y_test)
printConfusionMatrix(y_test, pred,matrixTitle)
#function to build classifier matrix after testing SVM model
#build SVM
# split datasets into training and test sets
def printSVMClassifierOutcome(X,y,trainSize,trainState,matrixTitle):
size = trainSize
state = trainState
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-size, random_state=state)
# scale input data for training if necessary for better predictions
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)
# build and train model
clf = SVC(kernel = "linear")
clf.fit(X_train, y_train)
pred = clf.predict(X_test)
#check accuracy
clf_accuracy = metrics.accuracy_score(pred, y_test)
printConfusionMatrix(y_test, pred,matrixTitle)
def printConfusionMatrix(y_test, pred,matrixTitle):
#confusion matrix
cfm = metrics.confusion_matrix(y_test, pred)
fig, ax = plt.subplots(figsize=(6,6))
ax.matshow(cfm, cmap = plt.cm.Purples, alpha=0.3)
for i in range(cfm.shape[0]):
for j in range(cfm.shape[1]):
ax.text(x=j, y=i,s=cfm[i, j], va='center',
ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize = 16)
plt.ylabel('Actuals', fontsize = 16)
plt.title(matrixTitle, fontsize = 14)
plt.show()
print(metrics.classification_report(y_test, pred, zero_division = 0))
nEstimators = 500
decPrecision = 4
maxDepth = 3
def printRFRClassifierOutcome(X,y,trainSize,trainState, featTitle):
size = trainSize
state = trainState
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=1-size, random_state=state)
sc_X = StandardScaler()
sc_y = StandardScaler()
sc_X_train = sc_X.fit_transform(X_train)
sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
sc_y_train = sc_y_train
rf_regressor = RandomForestRegressor(n_estimators = nEstimators, random_state = state)
rf_regressor.fit(X_train, y_train)
rf_y_pred = rf_regressor.predict(X_test)
np.set_printoptions(precision=decPrecision)
print("Random Forest (" + str(nEstimators) + " Tree) Regression Accuracy: " + str(round(r2_score(y_test, rf_y_pred), decPrecision)))
#rf_regressor.feature_names_in_
featureDf = pd.DataFrame({"Features" : rf_regressor.feature_names_in_, "Importance" : rf_regressor.feature_importances_})
featureDf = featureDf.sort_values(by=["Importance"], ascending=False)
print(featureDf)
#plot bar chart of importance
f, ax = plt.subplots(figsize=(20,12))
sns.barplot(x=featureDf["Features"], y=featureDf["Importance"], palette="flare")
plt.xlabel('Features', fontsize = 16)
plt.ylabel('Importance', fontsize = 16)
plt.title(featTitle, fontsize=16)
plt.xticks(rotation=45)
#for val in plt.containers:
#plt.bar_label(val)
plt.show()
#rfc = RandomForestClassifier(n_estimators=nEstimators, max_depth=maxDepth, random_state=state)
#rfc.fit(X_train, y_train)
#features = X.columns.values
#classes = ['Cisgender man', 'Cisgender Woman', 'Transgender']
#for estimator in rfc.estimators_:
#print(estimator)
#plt.figure(figsize=(20,10))
#tree.plot_tree(estimator, feature_names=features, class_names=classes, fontsize=10, filled=True, rounded=True)
#plt.show()
maxIter=1000000000
def LogRegressionOutcome(X,y,trainSize,trainState,featTitle):
size = trainSize
state = trainState
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-size, random_state = state)
sc_X = StandardScaler()
sc_y = StandardScaler()
sc_X_train = sc_X.fit_transform(X_train)
sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
sc_y_train = sc_y_train
log_regression = LogisticRegression(solver="newton-cg", random_state=state, penalty="l2", C=0.01, max_iter=maxIter).fit(X_train,y_train)
print("Logistic Regression Accuracy: " + str(round(log_regression.score(X_test,y_test), decPrecision)))
x_train_const = tools.add_constant(X_train)
coefArray = []
for ind in range(log_regression.coef_.shape[0]):
if ind == 0:
featFor = "Cisgender Men"
elif ind == 1:
featFor = "Cisgender Women"
else:
featFor = "Transgender"
fullFeatTitle = "Logistic Regression " + featFor + " Feature Important for " + featTitle
for x in log_regression.coef_[ind]:
#print(np.exp(x)/(1 + np.exp(x)))
coefArray.append(np.exp(x)/(1 + np.exp(x)))
featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["Features", "Coefficients"])
featureDf = featureDf.sort_values(by=["Coefficients"], ascending=False)
print(featureDf)
#plot bar chart of importance
f, ax = plt.subplots(figsize=(20,12))
sns.barplot(x=featureDf["Features"], y=featureDf["Coefficients"], palette="flare")
plt.xlabel('Features', fontsize = 16)
plt.ylabel('Coefficients', fontsize = 16)
plt.title(fullFeatTitle, fontsize=16)
plt.xticks(rotation=45)
#for val in plt.containers:
#plt.bar_label(val)
plt.show()
describeDF(reedIndexdf, reed_index_list)
stateId antiTransLegislationRiskIndex32023 \
count 50.000000 50.00000
mean 29.320000 2.08000
std 15.782243 1.60153
min 1.000000 0.00000
25% 17.250000 1.00000
50% 29.500000 2.00000
75% 41.750000 4.00000
max 56.000000 4.00000
antiTransLegislationRiskIndex122022 \
count 50.000000
mean 1.860000
std 1.340271
min 0.000000
25% 1.000000
50% 2.000000
75% 3.000000
max 4.000000
antiTransLegislationRiskIndex112022
count 50.00000
mean 1.82000
std 1.33539
min 0.00000
25% 1.00000
50% 2.00000
75% 3.00000
max 4.00000
Mode of stateId : 1
Variance of stateId : 249.0791836734694
Mode of antiTransLegislationRiskIndex32023 : 4
Variance of antiTransLegislationRiskIndex32023 : 2.564897959183673
Mode of antiTransLegislationRiskIndex122022 : 3
Variance of antiTransLegislationRiskIndex122022 : 1.7963265306122445
Mode of antiTransLegislationRiskIndex112022 : 1
Variance of antiTransLegislationRiskIndex112022 : 1.7832653061224486
reed_cov_list = ["stateId","antiTransLegislationRiskIndex32023"
,"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022"
]
reed_label = ["stateId","RiskIndex32023"
,"RiskIndex122022","RiskIndex112022"]
printCovariance(reedIndexdf, reed_cov_list, reed_label, "Anti-Transgender Legislation Risk Index Covariance Matrix")
describeDF(transStatePopdf, trans_pop_list)
stateId statePopulation2020 statePopulation2023 transAdultPop2016 \
count 50.000000 5.000000e+01 5.000000e+01 50.00000
mean 29.320000 6.615242e+06 8.960485e+06 27654.00000
std 15.782243 7.436124e+06 1.907631e+07 36854.01958
min 1.000000 5.768510e+05 5.808170e+05 1400.00000
25% 17.250000 1.869706e+06 1.940934e+06 6375.00000
50% 29.500000 4.581796e+06 4.625424e+06 19450.00000
75% 41.750000 7.566836e+06 7.844464e+06 31037.50000
max 56.000000 3.953822e+07 1.309280e+08 218400.00000
transAdultPercent2016 transAdultPop2022 transAdultPercent2022
count 50.000000 50.000000 50.000000
mean 0.530400 26638.000000 0.531800
std 0.121722 29080.703259 0.126889
min 0.300000 2100.000000 0.200000
25% 0.432500 7025.000000 0.442500
50% 0.535000 16950.000000 0.525000
75% 0.610000 33225.000000 0.600000
max 0.780000 150100.000000 0.870000
Mode of stateId : 1
Variance of stateId : 249.0791836734694
Mode of statePopulation2020 : 5024279
Variance of statePopulation2020 : 55295936980950.49
Mode of statePopulation2023 : 5097641
Variance of statePopulation2023 : 363905671623562.75
Mode of transAdultPop2016 : 2700
Variance of transAdultPop2016 : 1358218759.1836734
Mode of transAdultPercent2016 : 0.43
Variance of transAdultPercent2016 : 0.014816163265306125
Mode of transAdultPop2022 : 6300
Variance of transAdultPop2022 : 845687302.0408163
Mode of transAdultPercent2022 : 0.6
Variance of transAdultPercent2022 : 0.016100775510204078
pop_cov_list = ["stateId","statePopulation2020","statePopulation2023","transAdultPop2016","transAdultPop2022"]
pop_label = ["stateId","TotalPop2020","TotalPop2023","TransPop2016","TransPop2022"]
printCovariance(transStatePopdf, pop_cov_list, pop_label, "Population by State Covariance Matrix")
describeDF(religious2014df, religiosity_2014_list)
stateId religionImportantPew2014 worshipWeeklyPew2014 \
count 50.000000 50.000000 50.000000
mean 29.320000 0.527000 0.359400
std 15.782243 0.107499 0.075035
min 1.000000 0.320000 0.210000
25% 17.250000 0.452500 0.310000
50% 29.500000 0.510000 0.355000
75% 41.750000 0.597500 0.390000
max 56.000000 0.770000 0.530000
prayDailyPew2014 certainAboutGodPew2014 overallReligiosityPew2014
count 50.000000 50.000000 50.000000
mean 0.541400 0.633600 0.547000
std 0.094286 0.095271 0.107423
min 0.330000 0.400000 0.330000
25% 0.490000 0.575000 0.482500
50% 0.530000 0.630000 0.540000
75% 0.607500 0.690000 0.625000
max 0.750000 0.820000 0.770000
Mode of stateId : 1
Variance of stateId : 249.0791836734694
Mode of religionImportantPew2014 : 0.44
Variance of religionImportantPew2014 : 0.011556122448979588
Mode of worshipWeeklyPew2014 : 0.34
Variance of worshipWeeklyPew2014 : 0.005630244897959184
Mode of prayDailyPew2014 : 0.51
Variance of prayDailyPew2014 : 0.008889836734693879
Mode of certainAboutGodPew2014 : 0.61
Variance of certainAboutGodPew2014 : 0.009076571428571429
Mode of overallReligiosityPew2014 : 0.54
Variance of overallReligiosityPew2014 : 0.011539795918367344
religiosity_cov_2014_list = ["stateId","religionImportantPew2014","worshipWeeklyPew2014"
,"prayDailyPew2014","certainAboutGodPew2014","overallReligiosityPew2014"]
rel_2014_label = ["stateId","VeryImportant","WorshipWeekly","PrayDaily","CertainAboutGod","Overall"]
printCovariance(religious2014df, religiosity_cov_2014_list, rel_2014_label, "Pew 2014 Religiosity Covariance Matrix")
describeDF(religious2017df, religiosity_2017_list)
stateId veryReligiousStatista2017 moderatelyReligiousStatista2017 \
count 50.000000 50.000000 50.000000
mean 29.320000 0.371600 0.287200
std 15.782243 0.090449 0.030442
min 1.000000 0.160000 0.160000
25% 17.250000 0.310000 0.270000
50% 29.500000 0.365000 0.295000
75% 41.750000 0.437500 0.300000
max 56.000000 0.590000 0.330000
nonreligiousStatista2017
count 50.000000
mean 0.342000
std 0.099857
min 0.120000
25% 0.290000
50% 0.340000
75% 0.397500
max 0.590000
Mode of stateId : 1
Variance of stateId : 249.0791836734694
Mode of veryReligiousStatista2017 : 0.28
Variance of veryReligiousStatista2017 : 0.008181061224489796
Mode of moderatelyReligiousStatista2017 : 0.3
Variance of moderatelyReligiousStatista2017 : 0.0009266938775510202
Mode of nonreligiousStatista2017 : 0.33
Variance of nonreligiousStatista2017 : 0.00997142857142857
religiosity_cov_2017_list = ["stateId","veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"]
rel_2017_label = ["stateId","Very","Moderate","Nonreligious"]
printCovariance(religious2017df, religiosity_cov_2017_list, rel_2017_label, "Gallup 2017 Religiosity Covariance Matrix")
describeDF(religious2022df, religiosity_2022_list)
stateId relLibScore2022 relLibVote2022 relLibVax2022 \
count 50.000000 50.000000 50.000000 50.000000
mean 29.320000 0.393948 0.800000 0.900000
std 15.782243 0.133298 0.404061 0.303046
min 1.000000 0.155800 0.000000 0.000000
25% 17.250000 0.314950 1.000000 1.000000
50% 29.500000 0.371200 1.000000 1.000000
75% 41.750000 0.467550 1.000000 1.000000
max 56.000000 0.818200 1.000000 1.000000
relLibHealth2022 relLibHealthMandate2022 relLibMarriage2022 \
count 50.000000 50.000000 50.000000
mean 6.760000 0.640000 1.160000
std 4.023198 0.484873 1.489555
min 0.000000 0.000000 0.000000
25% 4.250000 0.000000 0.000000
50% 5.500000 1.000000 0.000000
75% 9.000000 1.000000 3.000000
max 20.000000 1.000000 5.000000
relLibRfra2022
count 50.000000
mean 0.480000
std 0.504672
min 0.000000
25% 0.000000
50% 0.000000
75% 1.000000
max 1.000000
Mode of stateId : 1
Variance of stateId : 249.0791836734694
Mode of relLibScore2022 : 0.3377
Variance of relLibScore2022 : 0.01776847438367347
Mode of relLibVote2022 : 1.0
Variance of relLibVote2022 : 0.16326530612244897
Mode of relLibVax2022 : 1.0
Variance of relLibVax2022 : 0.09183673469387756
Mode of relLibHealth2022 : 5.0
Variance of relLibHealth2022 : 16.186122448979592
Mode of relLibHealthMandate2022 : 1.0
Variance of relLibHealthMandate2022 : 0.2351020408163265
Mode of relLibMarriage2022 : 0.0
Variance of relLibMarriage2022 : 2.218775510204082
Mode of relLibRfra2022 : 0.0
Variance of relLibRfra2022 : 0.25469387755102035
religiosity_cov_2022_list = ["stateId","relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022"
,"relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022"]
rel_2022_label = ["stateId","Score","Vote","Vax","Health","HealthMandate","Marriage","Rfra"]
printCovariance(religious2022df, religiosity_cov_2022_list, rel_2022_label, "Religious Liberty 2022 Covariance Matrix")
# read self-identification Census Pulse Survey data
pulse_col_list = ["SCRAM","WEEK","EST_ST","TBIRTH_YEAR","EEDUC","AEDUC","EGENID_BIRTH","AGENID_BIRTH"
,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME","ENDDATE","EDUCATION","ASSIGNEDGENDER"
,"CHOSENGENDER","SEXUALORIENTATION","INCOMEMIN"]
pulse_num_col_list = ["WEEK","EST_ST","TBIRTH_YEAR","EEDUC","AEDUC","EGENID_BIRTH","AGENID_BIRTH"
,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME"]
pulsedf = pd.read_csv("P:\\UticaMSDS\\DSC680-ResearchPracticum\\dataSets\\pulseModFull.csv", usecols=pulse_col_list)
print("Full count of data pulse data: ", (pulsedf["GENID_DESCRIBE"] > -100).sum())
print()
countDistrictofColumbia = (pulsedf["EST_ST"] == 11).sum()
print("Count of participants in District of Columbia: ", countDistrictofColumbia)
print()
countMissingGender = (pulsedf["GENID_DESCRIBE"] < 0).sum()
print("Count of missing or unreported gender identity: ", countMissingGender)
print()
countMissingSexuality = (pulsedf["SEXUAL_ORIENTATION"] < 0).sum()
print("Count of missing or unreported sexuality: ", countMissingSexuality)
print()
countMissingIncome = (pulsedf["INCOME"] < 0).sum()
print("Count of missing or unreported minimum income: ", countMissingIncome)
print()
#remove DC residents
pulsedf = pulsedf[pulsedf["EST_ST"] != 11]
#remove missing gender identity based on under 10% of total for better results
pulsedf = pulsedf[pulsedf["GENID_DESCRIBE"].isin([1,2,3,4])]
#remove missing sexuality based on under 10% of total for better results
pulsedf = pulsedf[pulsedf["SEXUAL_ORIENTATION"].isin([1,2,3,4,5])]
#remove missing income based on null values for better results
pulsedf = pulsedf[pulsedf["INCOME"].isin([1,2,3,4,5,6,7,8])]
pulsedfCount = len(pulsedf.index)
print("Count after row removal: ", pulsedfCount)
Full count of data pulse data: 1341164 Count of participants in District of Columbia: 17702 Count of missing or unreported gender identity: 17691 Count of missing or unreported sexuality: 24617 Count of missing or unreported minimum income: 263337 Count after row removal: 1048575
describeDF(pulsedf,pulse_num_col_list)
WEEK EST_ST TBIRTH_YEAR EEDUC AEDUC \
count 1.048575e+06 1.048575e+06 1.048575e+06 1.048575e+06 1.048575e+06
mean 4.391728e+01 2.854206e+01 1.968571e+03 5.367118e+00 1.994822e+00
std 6.120523e+00 1.640470e+01 1.575978e+01 1.436177e+00 7.176839e-02
min 3.400000e+01 1.000000e+00 1.933000e+03 1.000000e+00 1.000000e+00
25% 3.900000e+01 1.300000e+01 1.956000e+03 4.000000e+00 2.000000e+00
50% 4.300000e+01 2.800000e+01 1.968000e+03 6.000000e+00 2.000000e+00
75% 4.900000e+01 4.200000e+01 1.981000e+03 7.000000e+00 2.000000e+00
max 5.400000e+01 5.600000e+01 2.005000e+03 7.000000e+00 2.000000e+00
EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION \
count 1.048575e+06 1.048575e+06 1.048575e+06 1.048575e+06
mean 1.581580e+00 1.997595e+00 1.609426e+00 2.069715e+00
std 4.933001e-01 4.898353e-02 5.507229e-01 4.835032e-01
min 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
25% 1.000000e+00 2.000000e+00 1.000000e+00 2.000000e+00
50% 2.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00
75% 2.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00
max 2.000000e+00 2.000000e+00 4.000000e+00 5.000000e+00
INCOME
count 1.048575e+06
mean 4.620572e+00
std 2.128103e+00
min 1.000000e+00
25% 3.000000e+00
50% 5.000000e+00
75% 6.000000e+00
max 8.000000e+00
Mode of WEEK : 43
Variance of WEEK : 37.460803320626816
Mode of EST_ST : 6
Variance of EST_ST : 269.1142209085299
Mode of TBIRTH_YEAR : 1955
Variance of TBIRTH_YEAR : 248.3706702974734
Mode of EEDUC : 6
Variance of EEDUC : 2.0626035639994904
Mode of AEDUC : 2
Variance of AEDUC : 0.005150701178258357
Mode of EGENID_BIRTH : 2
Variance of EGENID_BIRTH : 0.2433449743390272
Mode of AGENID_BIRTH : 2
Variance of AGENID_BIRTH : 0.0023993863704273163
Mode of GENID_DESCRIBE : 2
Variance of GENID_DESCRIBE : 0.3032957446696943
Mode of SEXUAL_ORIENTATION : 2
Variance of SEXUAL_ORIENTATION : 0.23377538647520063
Mode of INCOME : 6
Variance of INCOME : 4.528821091548032
pulse_cov_list = ["WEEK","EST_ST","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME"]
pulse_label_list = ["WEEK","EST_ST","BIRTH_YEAR","EDUC","SEX_AT_BIRTH","GENDERID","SEXUALITY","INCOME"]
printCovariance(pulsedf, pulse_cov_list, pulse_label_list, "USCB Pulse Survey Covariance Matrix")
# further analysis of pulse data
pulseIncomedf = pd.DataFrame()
pulseIncomedf["INCOMEMIN"] = pulsedf["INCOMEMIN"].astype(float)
print(pulseIncomedf.describe())
print("Mode of INCOMEMIN: ", mode(pulseIncomedf["INCOMEMIN"]))
print("Variance of INCOMEMIN: ", np.var(pulseIncomedf["INCOMEMIN"], ddof=1))
del pulseIncomedf
INCOMEMIN count 1.048575e+06 mean 7.935849e+04 std 5.884945e+04 min 0.000000e+00 25% 3.500000e+04 50% 7.500000e+04 75% 1.000000e+05 max 2.000000e+05 Mode of INCOMEMIN: 100000.0 Variance of INCOMEMIN: 3463257810.0526085
# look at income data based on gender
#clean data
pulseIncomeStatsdf = pulsedf
#remove missing values from table for income and force to number
pulseIncomeStatsdf = pulseIncomeStatsdf[pulseIncomeStatsdf["INCOMEMIN"].astype(str).str.isdigit()]
pulseIncomeStatsdf["INCOMEMIN"] = pd.to_numeric(pulseIncomeStatsdf["INCOMEMIN"], errors='coerce')
#remove unreported or missing chosen gender
pulseIncomeStatsdf = pulseIncomeStatsdf[pulseIncomeStatsdf["GENID_DESCRIBE"] > 0]
#print(pulseIncomeStatsdf.head())
pulseIncomeStatsdf = pulseIncomeStatsdf[pulseIncomeStatsdf["SEXUAL_ORIENTATION"] > 0]
pulse_income_col = ["WEEK","EST_ST","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOMEMIN"]
describeDF(pulseIncomeStatsdf, pulse_income_col)
WEEK EST_ST TBIRTH_YEAR EEDUC AEDUC \
count 1.048575e+06 1.048575e+06 1.048575e+06 1.048575e+06 1.048575e+06
mean 4.391728e+01 2.854206e+01 1.968571e+03 5.367118e+00 1.994822e+00
std 6.120523e+00 1.640470e+01 1.575978e+01 1.436177e+00 7.176839e-02
min 3.400000e+01 1.000000e+00 1.933000e+03 1.000000e+00 1.000000e+00
25% 3.900000e+01 1.300000e+01 1.956000e+03 4.000000e+00 2.000000e+00
50% 4.300000e+01 2.800000e+01 1.968000e+03 6.000000e+00 2.000000e+00
75% 4.900000e+01 4.200000e+01 1.981000e+03 7.000000e+00 2.000000e+00
max 5.400000e+01 5.600000e+01 2.005000e+03 7.000000e+00 2.000000e+00
EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION \
count 1.048575e+06 1.048575e+06 1.048575e+06 1.048575e+06
mean 1.581580e+00 1.997595e+00 1.609426e+00 2.069715e+00
std 4.933001e-01 4.898353e-02 5.507229e-01 4.835032e-01
min 1.000000e+00 1.000000e+00 1.000000e+00 1.000000e+00
25% 1.000000e+00 2.000000e+00 1.000000e+00 2.000000e+00
50% 2.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00
75% 2.000000e+00 2.000000e+00 2.000000e+00 2.000000e+00
max 2.000000e+00 2.000000e+00 4.000000e+00 5.000000e+00
INCOME INCOMEMIN
count 1.048575e+06 1.048575e+06
mean 4.620572e+00 7.935849e+04
std 2.128103e+00 5.884945e+04
min 1.000000e+00 0.000000e+00
25% 3.000000e+00 3.500000e+04
50% 5.000000e+00 7.500000e+04
75% 6.000000e+00 1.000000e+05
max 8.000000e+00 2.000000e+05
Mode of WEEK : 43
Variance of WEEK : 37.460803320626816
Mode of EST_ST : 6
Variance of EST_ST : 269.1142209085299
Mode of TBIRTH_YEAR : 1955
Variance of TBIRTH_YEAR : 248.3706702974734
Mode of EEDUC : 6
Variance of EEDUC : 2.0626035639994904
Mode of EGENID_BIRTH : 2
Variance of EGENID_BIRTH : 0.2433449743390272
Mode of GENID_DESCRIBE : 2
Variance of GENID_DESCRIBE : 0.3032957446696943
Mode of SEXUAL_ORIENTATION : 2
Variance of SEXUAL_ORIENTATION : 0.23377538647520063
Mode of INCOMEMIN : 100000
Variance of INCOMEMIN : 3463257810.0526085
#build violin plot
plt.figure(figsize=(12,8))
sns.violinplot(x=pulseIncomeStatsdf["EGENID_BIRTH"],y=pulseIncomeStatsdf["INCOMEMIN"],palette="bright")
ax = plt.gca()
leg = ax.get_legend()
ax.set_xticklabels(["Assigned Male at Birth","Assigned Female at Birth"])
plt.ylabel("Income in Dollars")
plt.xlabel("")
plt.title("Minimum Yearly Reported Income")
plt.show()
#build violin plot
incomeGeniddf = pulseIncomeStatsdf
incomeGeniddf["CUR_GENID"] = incomeGeniddf.apply(lambda x: str(x["EGENID_BIRTH"]) + str(x["GENID_DESCRIBE"]), axis=1)
#print(incomeGeniddf)
plt.figure(figsize=(15,8))
sns.violinplot(x=incomeGeniddf["CUR_GENID"],y=incomeGeniddf["INCOMEMIN"],palette="bright")
ax = plt.gca()
ax.set_xticklabels(["Cisgender Women","Cisgender Men", "Nonbinary AFAB", "Nonbinary AMAB"
, "Transgender AFAB", "Transgender AMAB", "Transgender FTM", "Transgender MTF"])
plt.ylabel("Income in Dollars")
plt.xlabel("")
plt.title("Minimum Yearly Reported Income by Gender Identity and Sex Assigned at Birth")
plt.show()
#income comparison based on assigned gender at birth
pulseIncomeAMABdf = pulseIncomeStatsdf
pulseIncomeAMABdf = pulseIncomeAMABdf[pulseIncomeAMABdf["EGENID_BIRTH"] == 1]
describeDF(pulseIncomeAMABdf, pulse_income_col)
pulseIncomeAFABdf = pulseIncomeStatsdf
pulseIncomeAFABdf = pulseIncomeAFABdf[pulseIncomeAFABdf["EGENID_BIRTH"] == 2]
describeDF(pulseIncomeAFABdf, pulse_income_col)
WEEK EST_ST TBIRTH_YEAR EEDUC \
count 438745.000000 438745.000000 438745.000000 438745.000000
mean 44.064304 28.533891 1967.503153 5.435818
std 6.133248 16.498385 16.139392 1.424280
min 34.000000 1.000000 1933.000000 1.000000
25% 39.000000 13.000000 1954.000000 4.000000
50% 44.000000 28.000000 1966.000000 6.000000
75% 50.000000 44.000000 1981.000000 7.000000
max 54.000000 56.000000 2005.000000 7.000000
AEDUC EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE \
count 438745.000000 438745.0 438745.000000 438745.000000
mean 1.994215 1.0 1.997568 1.034546
std 0.075837 0.0 0.049255 0.300578
min 1.000000 1.0 1.000000 1.000000
25% 2.000000 1.0 2.000000 1.000000
50% 2.000000 1.0 2.000000 1.000000
75% 2.000000 1.0 2.000000 1.000000
max 2.000000 1.0 2.000000 4.000000
SEXUAL_ORIENTATION INCOME INCOMEMIN
count 438745.000000 438745.000000 438745.000000
mean 2.032830 4.966391 88654.389224
std 0.475219 2.088031 60425.293955
min 1.000000 1.000000 0.000000
25% 2.000000 4.000000 50000.000000
50% 2.000000 5.000000 75000.000000
75% 2.000000 6.000000 100000.000000
max 5.000000 8.000000 200000.000000
Mode of WEEK : 43
Variance of WEEK : 37.616725915359496
Mode of EST_ST : 6
Variance of EST_ST : 272.19670766553105
Mode of TBIRTH_YEAR : 1955
Variance of TBIRTH_YEAR : 260.47997895673205
Mode of EEDUC : 6
Variance of EEDUC : 2.0285745651188547
Mode of EGENID_BIRTH : 1
Variance of EGENID_BIRTH : 0.0
Mode of GENID_DESCRIBE : 1
Variance of GENID_DESCRIBE : 0.0903474059981086
Mode of SEXUAL_ORIENTATION : 2
Variance of SEXUAL_ORIENTATION : 0.22583355356615814
Mode of INCOMEMIN : 100000
Variance of INCOMEMIN : 3651216149.53128
WEEK EST_ST TBIRTH_YEAR EEDUC \
count 609830.000000 609830.000000 609830.000000 609830.000000
mean 43.811497 28.547931 1969.338557 5.317692
std 6.109169 16.336979 15.435331 1.442654
min 34.000000 1.000000 1933.000000 1.000000
25% 39.000000 13.000000 1957.000000 4.000000
50% 43.000000 28.000000 1969.000000 6.000000
75% 49.000000 42.000000 1982.000000 7.000000
max 54.000000 56.000000 2005.000000 7.000000
AEDUC EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE \
count 609830.000000 609830.0 609830.000000 609830.000000
mean 1.995259 2.0 1.997614 2.023026
std 0.068689 0.0 0.048788 0.218328
min 1.000000 2.0 1.000000 1.000000
25% 2.000000 2.0 2.000000 2.000000
50% 2.000000 2.0 2.000000 2.000000
75% 2.000000 2.0 2.000000 2.000000
max 2.000000 2.0 2.000000 4.000000
SEXUAL_ORIENTATION INCOME INCOMEMIN
count 609830.000000 609830.000000 609830.000000
mean 2.096251 4.371771 72670.506535
std 0.487654 2.121895 56755.052819
min 1.000000 1.000000 0.000000
25% 2.000000 3.000000 35000.000000
50% 2.000000 4.000000 50000.000000
75% 2.000000 6.000000 100000.000000
max 5.000000 8.000000 200000.000000
Mode of WEEK : 43
Variance of WEEK : 37.32194368751015
Mode of EST_ST : 6
Variance of EST_ST : 266.89687185333827
Mode of TBIRTH_YEAR : 1955
Variance of TBIRTH_YEAR : 238.24944848073915
Mode of EEDUC : 6
Variance of EEDUC : 2.081250683388053
Mode of EGENID_BIRTH : 2
Variance of EGENID_BIRTH : 0.0
Mode of GENID_DESCRIBE : 2
Variance of GENID_DESCRIBE : 0.047666915897604446
Mode of SEXUAL_ORIENTATION : 2
Variance of SEXUAL_ORIENTATION : 0.23780655025122638
Mode of INCOMEMIN : 50000
Variance of INCOMEMIN : 3221136020.4754634
# build datasets based on chosen gender
col_for_counts = ["WEEK"]
pulseStateReduceddf = pulseIncomeStatsdf
#print("Total set week counts:")
pulseStateCountdf = combinedf(pulseStateReduceddf,col_for_counts,"All")
#print(pulseStateCountdf)
pulseStateTotalsdf = pd.DataFrame()
pulseStateTotalsdf["AllWEEKcount"] = pulseStateCountdf["AllWEEKcount"].astype(int)
del pulseStateCountdf
pulseCisMendf = pulseIncomeStatsdf
pulseCisMendf = pulseCisMendf[pulseCisMendf["EGENID_BIRTH"] == 1]
pulseCisMendf = pulseCisMendf[pulseCisMendf["GENID_DESCRIBE"] == 1]
describeDF(pulseCisMendf, pulse_income_col)
#print("Cis men week counts:")
pulseCisMenReducedDf = combinedf(pulseCisMendf,col_for_counts,"CisMen")
pulseStateTotalsdf["CisMenWEEKcount"] = pulseCisMenReducedDf["CisMenWEEKcount"].astype(int)
#print(pulseStateTotalsdf["CisMenWEEKcount"])
WEEK EST_ST TBIRTH_YEAR EEDUC \
count 432414.000000 432414.000000 432414.000000 432414.000000
mean 44.064441 28.527571 1967.402887 5.441038
std 6.133263 16.496388 16.082041 1.421493
min 34.000000 1.000000 1933.000000 1.000000
25% 39.000000 13.000000 1954.000000 4.000000
50% 44.000000 28.000000 1966.000000 6.000000
75% 50.000000 44.000000 1981.000000 7.000000
max 54.000000 56.000000 2005.000000 7.000000
AEDUC EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE \
count 432414.000000 432414.0 432414.000000 432414.0
mean 1.994237 1.0 1.998985 1.0
std 0.075695 0.0 0.031847 0.0
min 1.000000 1.0 1.000000 1.0
25% 2.000000 1.0 2.000000 1.0
50% 2.000000 1.0 2.000000 1.0
75% 2.000000 1.0 2.000000 1.0
max 2.000000 1.0 2.000000 1.0
SEXUAL_ORIENTATION INCOME INCOMEMIN
count 432414.000000 432414.000000 432414.000000
mean 2.019733 4.980276 88989.105348
std 0.440445 2.082161 60375.536150
min 1.000000 1.000000 0.000000
25% 2.000000 4.000000 50000.000000
50% 2.000000 5.000000 75000.000000
75% 2.000000 7.000000 150000.000000
max 5.000000 8.000000 200000.000000
Mode of WEEK : 43
Variance of WEEK : 37.6169203141804
Mode of EST_ST : 6
Variance of EST_ST : 272.1308119804907
Mode of TBIRTH_YEAR : 1955
Variance of TBIRTH_YEAR : 258.63203797251515
Mode of EEDUC : 6
Variance of EEDUC : 2.0206427755641716
Mode of EGENID_BIRTH : 1
Variance of EGENID_BIRTH : 0.0
Mode of GENID_DESCRIBE : 1
Variance of GENID_DESCRIBE : 0.0
Mode of SEXUAL_ORIENTATION : 2
Variance of SEXUAL_ORIENTATION : 0.19399188938993783
Mode of INCOMEMIN : 100000
Variance of INCOMEMIN : 3645205365.415228
pulseCisWomendf = pulseIncomeStatsdf
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["EGENID_BIRTH"] == 2]
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["GENID_DESCRIBE"] == 2]
describeDF(pulseCisWomendf, pulse_income_col)
#print("Cis women week counts:")
pulseCisWomenReducedDf = combinedf(pulseCisWomendf,col_for_counts,"CisWomen")
pulseStateTotalsdf["CisWomenWEEKcount"] = pulseCisWomenReducedDf["CisWomenWEEKcount"].astype(int)
#print(pulseStateTotalsdf["CisWomenWEEKcount"])
WEEK EST_ST TBIRTH_YEAR EEDUC \
count 599899.000000 599899.000000 599899.000000 599899.000000
mean 43.804849 28.541668 1969.179439 5.321482
std 6.108812 16.333975 15.348571 1.440654
min 34.000000 1.000000 1933.000000 1.000000
25% 39.000000 13.000000 1957.000000 4.000000
50% 43.000000 28.000000 1969.000000 6.000000
75% 49.000000 42.000000 1982.000000 7.000000
max 54.000000 56.000000 2005.000000 7.000000
AEDUC EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE \
count 599899.000000 599899.0 599899.000000 599899.0
mean 1.995281 2.0 1.998725 2.0
std 0.068534 0.0 0.035687 0.0
min 1.000000 2.0 1.000000 2.0
25% 2.000000 2.0 2.000000 2.0
50% 2.000000 2.0 2.000000 2.0
75% 2.000000 2.0 2.000000 2.0
max 2.000000 2.0 2.000000 2.0
SEXUAL_ORIENTATION INCOME INCOMEMIN
count 599899.000000 599899.000000 599899.000000
mean 2.085014 4.384903 72975.084139
std 0.461465 2.119228 56763.424571
min 1.000000 1.000000 0.000000
25% 2.000000 3.000000 35000.000000
50% 2.000000 4.000000 50000.000000
75% 2.000000 6.000000 100000.000000
max 5.000000 8.000000 200000.000000
Mode of WEEK : 43
Variance of WEEK : 37.317581415112876
Mode of EST_ST : 6
Variance of EST_ST : 266.79872778314314
Mode of TBIRTH_YEAR : 1955
Variance of TBIRTH_YEAR : 235.5786372658161
Mode of EEDUC : 6
Variance of EEDUC : 2.07548426437156
Mode of EGENID_BIRTH : 2
Variance of EGENID_BIRTH : 0.0
Mode of GENID_DESCRIBE : 2
Variance of GENID_DESCRIBE : 0.0
Mode of SEXUAL_ORIENTATION : 2
Variance of SEXUAL_ORIENTATION : 0.2129499850843681
Mode of INCOMEMIN : 50000
Variance of INCOMEMIN : 3222086368.9871078
#print("Cisgender week counts:")
cisdf = pd.DataFrame()
cisdf["CisgenderWEEKcount"] = pulseStateTotalsdf.loc[:,["CisMenWEEKcount","CisWomenWEEKcount"]].sum(axis=1)
pulseStateTotalsdf["CisgenderWEEKcount"] = cisdf["CisgenderWEEKcount"].astype(int)
#print(pulseStateTotalsdf)
pulseTranswomendf = pulseIncomeStatsdf
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["EGENID_BIRTH"] == 1]
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["GENID_DESCRIBE"].isin([2,3])]
describeDF(pulseTranswomendf, pulse_income_col)
#print("Trans women week counts:")
pulseTranswomenReduceddf = combinedf(pulseTranswomendf,col_for_counts,"TransWomen")
#print(pulseTranswomenReduceddf)
pulseStateTotalsdf["TransWomenWEEKcount"] = pulseTranswomenReduceddf["TransWomenWEEKcount"].astype(int)
WEEK EST_ST TBIRTH_YEAR EEDUC AEDUC \
count 2654.000000 2654.000000 2654.000000 2654.000000 2654.000000
mean 44.383572 29.207611 1976.794650 4.939337 1.993595
std 6.134856 16.413115 18.514133 1.537229 0.079792
min 34.000000 1.000000 1933.000000 1.000000 1.000000
25% 40.000000 16.000000 1962.000000 4.000000 2.000000
50% 44.000000 29.000000 1982.000000 5.000000 2.000000
75% 50.000000 42.000000 1993.000000 6.000000 2.000000
max 54.000000 56.000000 2005.000000 7.000000 2.000000
EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION \
count 2654.0 2654.000000 2654.000000 2654.00000
mean 1.0 1.794650 2.554635 2.64318
std 0.0 0.404034 0.497100 1.16700
min 1.0 1.000000 2.000000 1.00000
25% 1.0 2.000000 2.000000 2.00000
50% 1.0 2.000000 3.000000 3.00000
75% 1.0 2.000000 3.000000 4.00000
max 1.0 2.000000 3.000000 5.00000
INCOME INCOMEMIN
count 2654.000000 2654.000000
mean 3.669932 56865.109269
std 2.190414 55358.774687
min 1.000000 0.000000
25% 2.000000 25000.000000
50% 4.000000 50000.000000
75% 5.000000 75000.000000
max 8.000000 200000.000000
Mode of WEEK : 52
Variance of WEEK : 37.6364582501901
Mode of EST_ST : 6
Variance of EST_ST : 269.39035290414995
Mode of TBIRTH_YEAR : 1992
Variance of TBIRTH_YEAR : 342.7731187425988
Mode of EEDUC : 4
Variance of EEDUC : 2.363073212535268
Mode of EGENID_BIRTH : 1
Variance of EGENID_BIRTH : 0.0
Mode of GENID_DESCRIBE : 3
Variance of GENID_DESCRIBE : 0.24710817771523663
Mode of SEXUAL_ORIENTATION : 3
Variance of SEXUAL_ORIENTATION : 1.3618890161739803
Mode of INCOMEMIN : 0
Variance of INCOMEMIN : 3064593934.8353977
pulseTransmendf = pulseIncomeStatsdf
pulseTransmendf = pulseTransmendf[pulseTransmendf["EGENID_BIRTH"] == 2]
pulseTransmendf = pulseTransmendf[pulseTransmendf["GENID_DESCRIBE"].isin([1,3])]
describeDF(pulseTransmendf, pulse_income_col)
#print("Trans men week counts:")
pulseTransmenReduceddf = combinedf(pulseTransmendf,col_for_counts,"TransMen")
#print(pulseTransmenReduceddf)
pulseStateTotalsdf["TransMenWEEKcount"] = pulseTransmenReduceddf["TransMenWEEKcount"].astype(int)
WEEK EST_ST TBIRTH_YEAR EEDUC AEDUC \
count 3444.000000 3444.000000 3444.000000 3444.000000 3444.000000
mean 44.622242 29.090012 1982.988095 5.067364 1.994483
std 6.068434 16.591153 17.827721 1.547615 0.074081
min 34.000000 1.000000 1933.000000 1.000000 1.000000
25% 40.000000 15.000000 1974.000000 4.000000 2.000000
50% 45.000000 29.000000 1990.000000 6.000000 2.000000
75% 50.000000 42.000000 1996.000000 6.000000 2.000000
max 54.000000 56.000000 2005.000000 7.000000 2.000000
EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION \
count 3444.0 3444.000000 3444.000000 3444.000000
mean 2.0 1.828688 2.310105 2.852497
std 0.0 0.376836 0.950841 1.133170
min 2.0 1.000000 1.000000 1.000000
25% 2.0 2.000000 1.000000 2.000000
50% 2.0 2.000000 3.000000 3.000000
75% 2.0 2.000000 3.000000 4.000000
max 2.0 2.000000 3.000000 5.000000
INCOME INCOMEMIN
count 3444.000000 3444.000000
mean 3.504355 52778.745645
std 2.122240 53090.947248
min 1.000000 0.000000
25% 2.000000 25000.000000
50% 3.000000 35000.000000
75% 5.000000 75000.000000
max 8.000000 200000.000000
Mode of WEEK : 54
Variance of WEEK : 36.82588913592966
Mode of EST_ST : 6
Variance of EST_ST : 275.26636549507276
Mode of TBIRTH_YEAR : 1996
Variance of TBIRTH_YEAR : 317.82762471819984
Mode of EEDUC : 6
Variance of EEDUC : 2.3951123034735593
Mode of EGENID_BIRTH : 2
Variance of EGENID_BIRTH : 0.0
Mode of GENID_DESCRIBE : 3
Variance of GENID_DESCRIBE : 0.9040976945597842
Mode of SEXUAL_ORIENTATION : 3
Variance of SEXUAL_ORIENTATION : 1.2840745062361207
Mode of INCOMEMIN : 0
Variance of INCOMEMIN : 2818648679.692473
pulseNonedf = pulseIncomeStatsdf
pulseNonedf = pulseNonedf[pulseNonedf["GENID_DESCRIBE"] == 4]
describeDF(pulseNonedf, pulse_income_col)
#print("Non-Binary week counts:")
pulseNoneReduceddf = combinedf(pulseNonedf,col_for_counts,"Enby")
#print(pulseNoneReduceddf)
pulseStateTotalsdf["EnbyWEEKcount"] = pulseNoneReduceddf["EnbyWEEKcount"].astype(int)
WEEK EST_ST TBIRTH_YEAR EEDUC AEDUC \
count 10164.000000 10164.000000 10164.000000 10164.000000 10164.000000
mean 43.931425 28.821822 1975.280500 5.129083 1.993113
std 6.128902 16.585780 17.485799 1.551461 0.082706
min 34.000000 1.000000 1933.000000 1.000000 1.000000
25% 39.000000 13.000000 1962.000000 4.000000 2.000000
50% 44.000000 29.000000 1978.000000 6.000000 2.000000
75% 49.000000 44.000000 1990.000000 6.000000 2.000000
max 54.000000 56.000000 2005.000000 7.000000 2.000000
EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION \
count 10164.000000 10164.000000 10164.0 10164.000000
mean 1.638233 1.981995 4.0 2.878099
std 0.480535 0.132975 0.0 1.199313
min 1.000000 1.000000 4.0 1.000000
25% 1.000000 2.000000 4.0 2.000000
50% 2.000000 2.000000 4.0 2.000000
75% 2.000000 2.000000 4.0 4.000000
max 2.000000 2.000000 4.0 5.000000
INCOME INCOMEMIN
count 10164.000000 10164.000000
mean 3.853503 61277.056277
std 2.212131 56845.106327
min 1.000000 0.000000
25% 2.000000 25000.000000
50% 4.000000 50000.000000
75% 6.000000 100000.000000
max 8.000000 200000.000000
Mode of WEEK : 41
Variance of WEEK : 37.56343628567068
Mode of EST_ST : 6
Variance of EST_ST : 275.0880960203078
Mode of TBIRTH_YEAR : 1993
Variance of TBIRTH_YEAR : 305.75315311040043
Mode of EEDUC : 6
Variance of EEDUC : 2.407029720940252
Mode of EGENID_BIRTH : 2
Variance of EGENID_BIRTH : 0.23091436232464746
Mode of GENID_DESCRIBE : 4
Variance of GENID_DESCRIBE : 0.0
Mode of SEXUAL_ORIENTATION : 2
Variance of SEXUAL_ORIENTATION : 1.4383513604283242
Mode of INCOMEMIN : 0
Variance of INCOMEMIN : 3231366113.305501
#print("NonCisgender week counts:")
transdf = pd.DataFrame()
transdf["NonCisgenderWEEKcount"] = pulseStateTotalsdf.loc[:,["TransWomenWEEKcount","TransMenWEEKcount","EnbyWEEKcount"]].sum(axis=1)
pulseStateTotalsdf["NonCisgenderWEEKcount"] = transdf["NonCisgenderWEEKcount"].astype(int)
#print(pulseStateTotalsdf)
del transdf
print("Percentages:")
pulseStateTotalsdf.assign(CisPercent = lambda x: (round(x["CisgenderWEEKcount"]/x["AllWEEKcount"] * 100,2)))
pulseStateTotalsdf.assign(NonCisPercent = lambda x: (round(x["NonCisgenderWEEKcount"]/x["AllWEEKcount"] * 100,2)))
Percentages:
| AllWEEKcount | CisMenWEEKcount | CisWomenWEEKcount | CisgenderWEEKcount | TransWomenWEEKcount | TransMenWEEKcount | EnbyWEEKcount | NonCisgenderWEEKcount | NonCisPercent | |
|---|---|---|---|---|---|---|---|---|---|
| 43 | 64543 | 25986 | 37672 | 63658 | 131 | 188 | 566 | 885 | 1.37 |
| 41 | 61390 | 24939 | 35525 | 60464 | 162 | 185 | 579 | 926 | 1.51 |
| 42 | 60221 | 24204 | 35106 | 59310 | 144 | 198 | 569 | 911 | 1.51 |
| 54 | 60220 | 25198 | 34080 | 59278 | 153 | 230 | 559 | 942 | 1.56 |
| 52 | 56169 | 24487 | 30771 | 55258 | 172 | 196 | 543 | 911 | 1.62 |
| 53 | 54574 | 22845 | 30918 | 53763 | 131 | 190 | 490 | 811 | 1.49 |
| 36 | 53108 | 21142 | 31193 | 52335 | 108 | 147 | 518 | 773 | 1.46 |
| 35 | 52930 | 21095 | 31041 | 52136 | 148 | 142 | 504 | 794 | 1.50 |
| 34 | 49604 | 19756 | 29066 | 48822 | 122 | 138 | 522 | 782 | 1.58 |
| 44 | 49565 | 20076 | 28742 | 48818 | 112 | 160 | 475 | 747 | 1.51 |
| 37 | 49090 | 19691 | 28689 | 48380 | 93 | 135 | 482 | 710 | 1.45 |
| 51 | 48800 | 21354 | 26597 | 47951 | 153 | 173 | 523 | 849 | 1.74 |
| 46 | 48346 | 19406 | 28206 | 47612 | 113 | 155 | 466 | 734 | 1.52 |
| 40 | 47908 | 20526 | 26667 | 47193 | 123 | 149 | 443 | 715 | 1.49 |
| 45 | 47676 | 19242 | 27636 | 46878 | 145 | 169 | 484 | 798 | 1.67 |
| 38 | 45991 | 18625 | 26685 | 45310 | 101 | 152 | 428 | 681 | 1.48 |
| 47 | 44593 | 17972 | 25899 | 43871 | 114 | 147 | 461 | 722 | 1.62 |
| 39 | 43767 | 17899 | 25256 | 43155 | 91 | 106 | 415 | 612 | 1.40 |
| 49 | 40405 | 17954 | 21784 | 39738 | 128 | 158 | 381 | 667 | 1.65 |
| 48 | 36405 | 14994 | 20714 | 35708 | 113 | 178 | 406 | 697 | 1.91 |
| 50 | 33270 | 15023 | 17652 | 32675 | 97 | 148 | 350 | 595 | 1.79 |
pulseWeekTotalsdf = pulseStateTotalsdf
print(pulseWeekTotalsdf.head())
print()
state_count_col = ["AllWEEKcount","CisgenderWEEKcount","NonCisgenderWEEKcount"
,"CisMenWEEKcount","CisWomenWEEKcount"
,"TransWomenWEEKcount","TransMenWEEKcount","EnbyWEEKcount"]
describeDF(pulseWeekTotalsdf,state_count_col)
AllWEEKcount CisMenWEEKcount CisWomenWEEKcount CisgenderWEEKcount \
43 64543 25986 37672 63658
41 61390 24939 35525 60464
42 60221 24204 35106 59310
54 60220 25198 34080 59278
52 56169 24487 30771 55258
TransWomenWEEKcount TransMenWEEKcount EnbyWEEKcount \
43 131 188 566
41 162 185 579
42 144 198 569
54 153 230 559
52 172 196 543
NonCisgenderWEEKcount
43 885
41 926
42 911
54 942
52 911
AllWEEKcount CisMenWEEKcount CisWomenWEEKcount CisgenderWEEKcount \
count 21.000000 21.000000 21.000000 21.000000
mean 49932.142857 20591.142857 28566.619048 49157.761905
std 8010.431825 3130.041123 4924.881364 7920.842814
min 33270.000000 14994.000000 17652.000000 32675.000000
25% 45991.000000 18625.000000 26597.000000 45310.000000
50% 49090.000000 20076.000000 28689.000000 48380.000000
75% 54574.000000 22845.000000 31041.000000 53763.000000
max 64543.000000 25986.000000 37672.000000 63658.000000
TransWomenWEEKcount TransMenWEEKcount EnbyWEEKcount \
count 21.000000 21.000000 21.00000
mean 126.380952 164.000000 484.00000
std 23.341971 27.597101 63.95389
min 91.000000 106.000000 350.00000
25% 112.000000 147.000000 443.00000
50% 123.000000 158.000000 484.00000
75% 145.000000 185.000000 523.00000
max 172.000000 230.000000 579.00000
NonCisgenderWEEKcount
count 21.000000
mean 774.380952
std 101.523631
min 595.000000
25% 710.000000
50% 773.000000
75% 849.000000
max 942.000000
Mode of AllWEEKcount : 64543
Variance of AllWEEKcount : 64167018.02857144
Mode of CisgenderWEEKcount : 63658
Variance of CisgenderWEEKcount : 62739750.89047618
Mode of NonCisgenderWEEKcount : 911
Variance of NonCisgenderWEEKcount : 10307.04761904762
Mode of CisMenWEEKcount : 25986
Variance of CisMenWEEKcount : 9797157.42857143
Mode of CisWomenWEEKcount : 37672
Variance of CisWomenWEEKcount : 24254456.447619047
Mode of TransWomenWEEKcount : 131
Variance of TransWomenWEEKcount : 544.8476190476191
Mode of TransMenWEEKcount : 147
Variance of TransMenWEEKcount : 761.6
Mode of EnbyWEEKcount : 566
Variance of EnbyWEEKcount : 4090.1
# build datasets based on chosen gender
col_for_counts = ["EST_ST"]
pulseStateReduceddf = pulseIncomeStatsdf
#print("Total set state counts:")
pulseStateCountdf = combinedf(pulseStateReduceddf,col_for_counts,"All")
#print(pulseStateCountdf)
pulseStateTotalsdf = pd.DataFrame()
pulseStateTotalsdf["AllEST_STcount"] = pulseStateCountdf["AllEST_STcount"].astype(int)
del pulseStateCountdf
pulseCisMendf = pulseIncomeStatsdf
pulseCisMendf = pulseCisMendf[pulseCisMendf["EGENID_BIRTH"] == 1]
pulseCisMendf = pulseCisMendf[pulseCisMendf["GENID_DESCRIBE"] == 1]
#print("Cis men state counts:")
pulseCisMenReducedDf = combinedf(pulseCisMendf,col_for_counts,"CisMen")
pulseStateTotalsdf["CisMenEST_STcount"] = pulseCisMenReducedDf["CisMenEST_STcount"].astype(int)
#print(pulseStateTotalsdf["CisMenEST_STcount"])
pulseCisWomendf = pulseIncomeStatsdf
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["EGENID_BIRTH"] == 2]
pulseCisWomendf = pulseCisWomendf[pulseCisWomendf["GENID_DESCRIBE"] == 2]
#print("Cis men state counts:")
pulseCisWomenReducedDf = combinedf(pulseCisWomendf,col_for_counts,"CisWomen")
pulseStateTotalsdf["CisWomenEST_STcount"] = pulseCisWomenReducedDf["CisWomenEST_STcount"].astype(int)
#print(pulseStateTotalsdf["CisWomenEST_STcount"])
#print("Cisgender state counts:")
cisdf = pd.DataFrame()
cisdf["CisgenderEST_STcount"] = pulseStateTotalsdf.loc[:,["CisMenEST_STcount","CisWomenEST_STcount"]].sum(axis=1)
pulseStateTotalsdf["CisgenderEST_STcount"] = cisdf["CisgenderEST_STcount"].astype(int)
#print(pulseStateTotalsdf)
pulseTranswomendf = pulseIncomeStatsdf
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["EGENID_BIRTH"] == 1]
pulseTranswomendf = pulseTranswomendf[pulseTranswomendf["GENID_DESCRIBE"].isin([2,3])]
#print("Trans women state counts:")
pulseTranswomenReduceddf = combinedf(pulseTranswomendf,col_for_counts,"TransWomen")
#print(pulseTranswomenReduceddf)
pulseStateTotalsdf["TransWomenEST_STcount"] = pulseTranswomenReduceddf["TransWomenEST_STcount"].astype(int)
pulseTransmendf = pulseIncomeStatsdf
pulseTransmendf = pulseTransmendf[pulseTransmendf["EGENID_BIRTH"] == 2]
pulseTransmendf = pulseTransmendf[pulseTransmendf["GENID_DESCRIBE"].isin([1,3])]
#print("Trans men column counts:")
pulseTransmenReduceddf = combinedf(pulseTransmendf,col_for_counts,"TransMen")
#print(pulseTransmenReduceddf)
pulseStateTotalsdf["TransMenEST_STcount"] = pulseTransmenReduceddf["TransMenEST_STcount"].astype(int)
pulseNonedf = pulseIncomeStatsdf
pulseNonedf = pulseNonedf[pulseNonedf["GENID_DESCRIBE"] == 4]
#print("Non-Binary column counts:")
pulseNoneReduceddf = combinedf(pulseNonedf,col_for_counts,"Enby")
#print(pulseNoneReduceddf)
pulseStateTotalsdf["EnbyEST_STcount"] = pulseNoneReduceddf["EnbyEST_STcount"].astype(int)
print("NonCisgender state counts:")
transdf = pd.DataFrame()
transdf["NonCisgenderEST_STcount"] = pulseStateTotalsdf.loc[:,["TransWomenEST_STcount","TransMenEST_STcount","EnbyEST_STcount"]].sum(axis=1)
pulseStateTotalsdf["NonCisgenderEST_STcount"] = transdf["NonCisgenderEST_STcount"].astype(int)
#print(pulseStateTotalsdf)
del transdf
NonCisgender state counts:
print("Percentages:")
pulseStateTotalsdf.assign(CisPercent = lambda x: (round(x["CisgenderEST_STcount"]/x["AllEST_STcount"] * 100,2)))
pulseStateTotalsdf.assign(NonCisPercent = lambda x: (round(x["NonCisgenderEST_STcount"]/x["AllEST_STcount"] * 100,2)))
Percentages:
| AllEST_STcount | CisMenEST_STcount | CisWomenEST_STcount | CisgenderEST_STcount | TransWomenEST_STcount | TransMenEST_STcount | EnbyEST_STcount | NonCisgenderEST_STcount | NonCisPercent | |
|---|---|---|---|---|---|---|---|---|---|
| 6 | 79763 | 35075 | 43358 | 78433 | 196 | 292 | 842 | 1330 | 1.67 |
| 48 | 52968 | 22829 | 29432 | 52261 | 118 | 136 | 453 | 707 | 1.33 |
| 53 | 44216 | 19190 | 24043 | 43233 | 161 | 216 | 606 | 983 | 2.22 |
| 12 | 36164 | 15525 | 20207 | 35732 | 69 | 76 | 287 | 432 | 1.19 |
| 25 | 29233 | 12153 | 16581 | 28734 | 97 | 106 | 296 | 499 | 1.71 |
| 41 | 29039 | 11455 | 16841 | 28296 | 139 | 188 | 416 | 743 | 2.56 |
| 26 | 28883 | 11989 | 16456 | 28445 | 80 | 91 | 267 | 438 | 1.52 |
| 49 | 28771 | 12504 | 15855 | 28359 | 81 | 79 | 252 | 412 | 1.43 |
| 51 | 28311 | 12243 | 15679 | 27922 | 62 | 71 | 256 | 389 | 1.37 |
| 4 | 28004 | 11663 | 15922 | 27585 | 63 | 83 | 273 | 419 | 1.50 |
| 8 | 27907 | 11805 | 15629 | 27434 | 70 | 110 | 293 | 473 | 1.69 |
| 42 | 26712 | 11195 | 15118 | 26313 | 67 | 95 | 237 | 399 | 1.49 |
| 13 | 24722 | 9982 | 14380 | 24362 | 58 | 85 | 217 | 360 | 1.46 |
| 24 | 24421 | 9971 | 14084 | 24055 | 40 | 82 | 244 | 366 | 1.50 |
| 27 | 23848 | 10004 | 13467 | 23471 | 78 | 80 | 219 | 377 | 1.58 |
| 17 | 22360 | 9488 | 12471 | 21959 | 80 | 76 | 245 | 401 | 1.79 |
| 20 | 19673 | 7807 | 11564 | 19371 | 49 | 73 | 180 | 302 | 1.54 |
| 36 | 19474 | 8180 | 10947 | 19127 | 44 | 82 | 221 | 347 | 1.78 |
| 16 | 19306 | 7815 | 11212 | 19027 | 42 | 67 | 170 | 279 | 1.45 |
| 35 | 19302 | 7508 | 11472 | 18980 | 45 | 72 | 205 | 322 | 1.67 |
| 34 | 19089 | 8364 | 10506 | 18870 | 32 | 34 | 153 | 219 | 1.15 |
| 37 | 18992 | 7637 | 11100 | 18737 | 33 | 53 | 169 | 255 | 1.34 |
| 18 | 18836 | 7441 | 11123 | 18564 | 39 | 59 | 174 | 272 | 1.44 |
| 29 | 18833 | 7387 | 11145 | 18532 | 55 | 60 | 186 | 301 | 1.60 |
| 55 | 18774 | 7712 | 10750 | 18462 | 50 | 84 | 178 | 312 | 1.66 |
| 47 | 18081 | 7053 | 10763 | 17816 | 29 | 64 | 172 | 265 | 1.47 |
| 9 | 17796 | 7118 | 10424 | 17542 | 44 | 53 | 157 | 254 | 1.43 |
| 39 | 17452 | 7106 | 10094 | 17200 | 46 | 51 | 155 | 252 | 1.44 |
| 19 | 17153 | 6676 | 10241 | 16917 | 38 | 50 | 148 | 236 | 1.38 |
| 33 | 16854 | 7144 | 9473 | 16617 | 44 | 50 | 143 | 237 | 1.41 |
| 32 | 16779 | 7124 | 9422 | 16546 | 36 | 46 | 151 | 233 | 1.39 |
| 2 | 16617 | 6761 | 9546 | 16307 | 51 | 60 | 199 | 310 | 1.87 |
| 40 | 16149 | 6216 | 9686 | 15902 | 41 | 50 | 156 | 247 | 1.53 |
| 31 | 16114 | 6469 | 9396 | 15865 | 31 | 57 | 161 | 249 | 1.55 |
| 45 | 15821 | 6122 | 9491 | 15613 | 26 | 38 | 144 | 208 | 1.31 |
| 21 | 14878 | 5790 | 8900 | 14690 | 43 | 39 | 106 | 188 | 1.26 |
| 5 | 13482 | 5176 | 8145 | 13321 | 28 | 36 | 97 | 161 | 1.19 |
| 1 | 13238 | 5261 | 7807 | 13068 | 26 | 35 | 109 | 170 | 1.28 |
| 30 | 12483 | 4962 | 7352 | 12314 | 31 | 33 | 105 | 169 | 1.35 |
| 22 | 12068 | 4465 | 7413 | 11878 | 33 | 25 | 132 | 190 | 1.57 |
| 50 | 11646 | 4572 | 6880 | 11452 | 34 | 50 | 110 | 194 | 1.67 |
| 10 | 11563 | 4573 | 6833 | 11406 | 21 | 28 | 108 | 157 | 1.36 |
| 15 | 11375 | 4895 | 6309 | 11204 | 30 | 30 | 111 | 171 | 1.50 |
| 23 | 11128 | 4338 | 6605 | 10943 | 35 | 41 | 109 | 185 | 1.66 |
| 46 | 10867 | 4453 | 6284 | 10737 | 26 | 18 | 86 | 130 | 1.20 |
| 56 | 10844 | 4325 | 6360 | 10685 | 28 | 25 | 106 | 159 | 1.47 |
| 54 | 10685 | 3943 | 6606 | 10549 | 23 | 30 | 83 | 136 | 1.27 |
| 44 | 9601 | 3822 | 5595 | 9417 | 24 | 47 | 113 | 184 | 1.92 |
| 28 | 9301 | 3342 | 5846 | 9188 | 14 | 19 | 80 | 113 | 1.21 |
| 38 | 8999 | 3786 | 5086 | 8872 | 24 | 19 | 84 | 127 | 1.41 |
print(pulseStateTotalsdf.head())
print()
state_count_col = ["AllEST_STcount","CisgenderEST_STcount","NonCisgenderEST_STcount"
,"CisMenEST_STcount","CisWomenEST_STcount"
,"TransWomenEST_STcount","TransMenEST_STcount","EnbyEST_STcount"]
describeDF(pulseStateTotalsdf,state_count_col)
AllEST_STcount CisMenEST_STcount CisWomenEST_STcount \
6 79763 35075 43358
48 52968 22829 29432
53 44216 19190 24043
12 36164 15525 20207
25 29233 12153 16581
CisgenderEST_STcount TransWomenEST_STcount TransMenEST_STcount \
6 78433 196 292
48 52261 118 136
53 43233 161 216
12 35732 69 76
25 28734 97 106
EnbyEST_STcount NonCisgenderEST_STcount
6 842 1330
48 453 707
53 606 983
12 287 432
25 296 499
AllEST_STcount CisMenEST_STcount CisWomenEST_STcount \
count 50.000000 50.000000 50.000000
mean 20971.500000 8648.280000 11997.980000
std 12222.406664 5449.401873 6575.810556
min 8999.000000 3342.000000 5086.000000
25% 13299.000000 5197.250000 7891.500000
50% 18427.500000 7265.500000 10628.000000
75% 24646.750000 9998.500000 14306.000000
max 79763.000000 35075.000000 43358.000000
CisgenderEST_STcount TransWomenEST_STcount TransMenEST_STcount \
count 50.000000 50.000000 50.000000
mean 20646.260000 53.080000 68.880000
std 12013.928975 36.049762 50.080604
min 8872.000000 14.000000 18.000000
25% 13131.250000 31.000000 38.250000
50% 18139.000000 42.500000 58.000000
75% 24285.250000 62.750000 81.500000
max 78433.000000 196.000000 292.000000
EnbyEST_STcount NonCisgenderEST_STcount
count 50.000000 50.000000
mean 203.280000 325.240000
std 135.915946 219.943366
min 80.000000 113.000000
25% 111.500000 188.500000
50% 169.500000 260.000000
75% 242.250000 386.000000
max 842.000000 1330.000000
Mode of AllEST_STcount : 79763
Variance of AllEST_STcount : 149387224.66326532
Mode of CisgenderEST_STcount : 78433
Variance of CisgenderEST_STcount : 144334489.42081633
Mode of NonCisgenderEST_STcount : 1330
Variance of NonCisgenderEST_STcount : 48375.08408163265
Mode of CisMenEST_STcount : 35075
Variance of CisMenEST_STcount : 29695980.777142856
Mode of CisWomenEST_STcount : 43358
Variance of CisWomenEST_STcount : 43241284.46897959
Mode of TransWomenEST_STcount : 44
Variance of TransWomenEST_STcount : 1299.585306122449
Mode of TransMenEST_STcount : 50
Variance of TransMenEST_STcount : 2508.0669387755106
Mode of EnbyEST_STcount : 106
Variance of EnbyEST_STcount : 18473.14448979592
#add basic gender column
pulseMungedf = pulseIncomeStatsdf
pulseMungedf["CUR_GENID"] = pulseMungedf.apply(lambda x: basicGenMarker(x["EGENID_BIRTH"],x["GENID_DESCRIBE"]), axis=1)
#replace state column name to allow merging
pulseMungedf.rename(columns={"EST_ST":"stateId"}, inplace=True)
print(pulseMungedf.head())
pulseMungedf["stateId"] = pulseMungedf["stateId"].astype(int)
reedFulldf["stateId"] = reedFulldf["stateId"].astype(int)
pulseMungedf = pd.merge(pulseMungedf, reedFulldf, on="stateId", how='inner')
print(pulseMungedf.head())
#list(pulseMungedf.columns)
SCRAM WEEK stateId TBIRTH_YEAR EEDUC AEDUC EGENID_BIRTH \
1 V340000002 34 4 1982 7 2 2
3 V340000004 34 31 1957 4 2 1
4 V340000005 34 45 1962 5 2 2
5 V340000006 34 8 1956 7 2 1
6 V340000007 34 41 1982 7 2 2
AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION INCOME ENDDATE \
1 2 2 2 7 8/2/2021
3 2 1 2 6 8/2/2021
4 2 2 2 4 8/2/2021
5 2 1 2 7 8/2/2021
6 2 2 2 8 8/2/2021
EDUCATION ASSIGNEDGENDER CHOSENGENDER SEXUALORIENTATION \
1 Graduate degree female female straight
3 some college male male straight
4 Associate's degree female female straight
5 Graduate degree male male straight
6 Graduate degree female female straight
INCOMEMIN CUR_GENID
1 150000 Cisgender Woman
3 100000 Cisgender Man
4 50000 Cisgender Woman
5 150000 Cisgender Man
6 200000 Cisgender Woman
SCRAM WEEK stateId TBIRTH_YEAR EEDUC AEDUC EGENID_BIRTH \
0 V340000002 34 4 1982 7 2 2
1 V340000076 34 4 1986 6 2 1
2 V340000087 34 4 1945 6 2 1
3 V340000238 34 4 1966 6 2 2
4 V340000281 34 4 1973 4 2 2
AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION ... \
0 2 2 2 ...
1 2 1 2 ...
2 2 1 2 ...
3 2 2 2 ...
4 2 2 2 ...
veryReligiousStatista2017 moderatelyReligiousStatista2017 \
0 0.31 0.31
1 0.31 0.31
2 0.31 0.31
3 0.31 0.31
4 0.31 0.31
nonreligiousStatista2017 relLibScore2022 relLibVote2022 relLibVax2022 \
0 0.39 0.4156 1.0 1.0
1 0.39 0.4156 1.0 1.0
2 0.39 0.4156 1.0 1.0
3 0.39 0.4156 1.0 1.0
4 0.39 0.4156 1.0 1.0
relLibHealth2022 relLibHealthMandate2022 relLibMarriage2022 relLibRfra2022
0 4.0 1.0 0.0 1.0
1 4.0 1.0 0.0 1.0
2 4.0 1.0 0.0 1.0
3 4.0 1.0 0.0 1.0
4 4.0 1.0 0.0 1.0
[5 rows x 43 columns]
munge_col_list = ["SCRAM","WEEK","stateId","TBIRTH_YEAR","EEDUC","AEDUC","EGENID_BIRTH","AGENID_BIRTH",
"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOME","ENDDATE","EDUCATION","ASSIGNEDGENDER",
"CHOSENGENDER","SEXUALORIENTATION","INCOMEMIN","CUR_GENID",
"stateName","statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023",
"antiTransLegislationRiskIndex122022","antiTransLegislationRiskIndex112022","transAdultPop2016",
"transAdultPercent2016","transAdultPop2022","transAdultPercent2022","religionImportantPew2014",
"worshipWeeklyPew2014","prayDailyPew2014","certainAboutGodPew2014","overallReligiosityPew2014",
"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017",
"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022",
"relLibMarriage2022","relLibRfra2022"]
#sample from the dataset based on the "CUR_GENID" and stateId columns,
#looking for (3gender*50state)*113samples = 16,950 rows per run
seed_value = 19
random.seed(seed_value)
rand_int = random.randint(0,1000)
modelSampledf = pulseMungedf.groupby(["CUR_GENID","stateId"]).sample(n=113,random_state=rand_int)
describeDF(modelSampledf,munge_col_list)
WEEK stateId TBIRTH_YEAR EEDUC AEDUC \
count 16950.000000 16950.000000 16950.000000 16950.000000 16950.000000
mean 44.072153 29.320000 1971.105959 5.254572 1.995162
std 6.117397 15.624084 16.989544 1.487861 0.069388
min 34.000000 1.000000 1933.000000 1.000000 1.000000
25% 39.000000 17.000000 1957.000000 4.000000 2.000000
50% 44.000000 29.500000 1971.000000 6.000000 2.000000
75% 50.000000 42.000000 1985.000000 7.000000 2.000000
max 54.000000 56.000000 2005.000000 7.000000 2.000000
EGENID_BIRTH AGENID_BIRTH GENID_DESCRIBE SEXUAL_ORIENTATION \
count 16950.000000 16950.00000 16950.000000 16950.000000
mean 1.533510 1.97056 2.136224 2.308024
std 0.498891 0.16904 1.122674 0.850734
min 1.000000 1.00000 1.000000 1.000000
25% 1.000000 2.00000 1.000000 2.000000
50% 2.000000 2.00000 2.000000 2.000000
75% 2.000000 2.00000 3.000000 2.000000
max 2.000000 2.00000 4.000000 5.000000
INCOME ... veryReligiousStatista2017 \
count 16950.000000 ... 16950.000000
mean 4.256342 ... 0.371600
std 2.169266 ... 0.089543
min 1.000000 ... 0.160000
25% 2.000000 ... 0.310000
50% 4.000000 ... 0.365000
75% 6.000000 ... 0.440000
max 8.000000 ... 0.590000
moderatelyReligiousStatista2017 nonreligiousStatista2017 \
count 16950.000000 16950.000000
mean 0.287200 0.342000
std 0.030137 0.098856
min 0.160000 0.120000
25% 0.270000 0.290000
50% 0.295000 0.340000
75% 0.300000 0.400000
max 0.330000 0.590000
relLibScore2022 relLibVote2022 relLibVax2022 relLibHealth2022 \
count 16950.000000 16950.000000 16950.000000 16950.00000
mean 0.393948 0.800000 0.900000 6.76000
std 0.131963 0.400012 0.300009 3.98288
min 0.155800 0.000000 0.000000 0.00000
25% 0.311700 1.000000 1.000000 4.00000
50% 0.371200 1.000000 1.000000 5.50000
75% 0.476200 1.000000 1.000000 9.00000
max 0.818200 1.000000 1.000000 20.00000
relLibHealthMandate2022 relLibMarriage2022 relLibRfra2022
count 16950.000000 16950.000000 16950.000000
mean 0.640000 1.160000 0.480000
std 0.480014 1.474628 0.499615
min 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000
50% 1.000000 0.000000 0.000000
75% 1.000000 3.000000 1.000000
max 1.000000 5.000000 1.000000
[8 rows x 35 columns]
Mode of WEEK : 54
Variance of WEEK : 37.422547430596495
Mode of stateId : 1
Variance of stateId : 244.11200188801695
Mode of TBIRTH_YEAR : 1960
Variance of TBIRTH_YEAR : 288.6446219936923
Mode of EEDUC : 6
Variance of EEDUC : 2.2137306418648626
Mode of AEDUC : 2
Variance of AEDUC : 0.00481463825799801
Mode of EGENID_BIRTH : 2
Variance of EGENID_BIRTH : 0.24889174203157796
Mode of AGENID_BIRTH : 2
Variance of AGENID_BIRTH : 0.02857452802620946
Mode of GENID_DESCRIBE : 1
Variance of GENID_DESCRIBE : 1.2603963861043481
Mode of SEXUAL_ORIENTATION : 2
Variance of SEXUAL_ORIENTATION : 0.7237482323771591
Mode of INCOME : 4
Variance of INCOME : 4.705716751155775
Mode of INCOMEMIN : 50000
Variance of INCOMEMIN : 3305722837.0866547
Mode of statePopulation2020 : 5024279
Variance of statePopulation2020 : 54193215481182.87
Mode of statePopulation2023 : 5097641
Variance of statePopulation2023 : 356648599406395.7
Mode of antiTransLegislationRiskIndex32023 : 4
Variance of antiTransLegislationRiskIndex32023 : 2.513748303734734
Mode of antiTransLegislationRiskIndex122022 : 3
Variance of antiTransLegislationRiskIndex122022 : 1.7605038645347806
Mode of antiTransLegislationRiskIndex112022 : 1
Variance of antiTransLegislationRiskIndex112022 : 1.7477031093279838
Mode of transAdultPop2016 : 2700
Variance of transAdultPop2016 : 1331132916.9154522
Mode of transAdultPercent2016 : 0.43
Variance of transAdultPercent2016 : 0.014520696678270103
Mode of transAdultPop2022 : 6300
Variance of transAdultPop2022 : 828822454.0798867
Mode of transAdultPercent2022 : 0.6
Variance of transAdultPercent2022 : 0.015779690955218594
Mode of religionImportantPew2014 : 0.44
Variance of religionImportantPew2014 : 0.011325668181013627
Mode of worshipWeeklyPew2014 : 0.34
Variance of worshipWeeklyPew2014 : 0.005517965543689893
Mode of prayDailyPew2014 : 0.51
Variance of prayDailyPew2014 : 0.008712554014986136
Mode of certainAboutGodPew2014 : 0.61
Variance of certainAboutGodPew2014 : 0.008895564812083307
Mode of overallReligiosityPew2014 : 0.54
Variance of overallReligiosityPew2014 : 0.011309667237005131
Mode of veryReligiousStatista2017 : 0.28
Variance of veryReligiousStatista2017 : 0.008017913033217302
Mode of moderatelyReligiousStatista2017 : 0.3
Variance of moderatelyReligiousStatista2017 : 0.0009082135819222371
Mode of nonreligiousStatista2017 : 0.33
Variance of nonreligiousStatista2017 : 0.009772576553188979
Mode of relLibScore2022 : 0.3377
Variance of relLibScore2022 : 0.0174141322784353
Mode of relLibVote2022 : 1.0
Variance of relLibVote2022 : 0.16000944008496082
Mode of relLibVax2022 : 1.0
Variance of relLibVax2022 : 0.09000531004779044
Mode of relLibHealth2022 : 5.0
Variance of relLibHealth2022 : 15.86333589002301
Mode of relLibHealthMandate2022 : 1.0
Variance of relLibHealthMandate2022 : 0.2304135937223435
Mode of relLibMarriage2022 : 0.0
Variance of relLibMarriage2022 : 2.174528290754617
Mode of relLibRfra2022 : 0.0
Variance of relLibRfra2022 : 0.2496147265325388
#convert all values that will be used to model to categories or numbers
col_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
,"GENID_DESCRIBE","SEXUAL_ORIENTATION","INCOMEMIN","CUR_GENID"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
dfClean = modelSampledf[col_list].copy()
print(dfClean.dtypes)
# convert text columns to category values
dfClean["CUR_GENID"] = dfClean["CUR_GENID"].astype("category")
dfClean["CUR_GENID_CAT"] = dfClean["CUR_GENID"].cat.codes
print(dfClean.describe())
WEEK int64
stateId int32
TBIRTH_YEAR int64
EEDUC int64
EGENID_BIRTH int64
GENID_DESCRIBE int64
SEXUAL_ORIENTATION int64
INCOMEMIN int64
CUR_GENID object
statePopulation2020 int64
statePopulation2023 int64
antiTransLegislationRiskIndex32023 int64
transAdultPop2022 int64
overallReligiosityPew2014 float64
veryReligiousStatista2017 float64
moderatelyReligiousStatista2017 float64
nonreligiousStatista2017 float64
relLibScore2022 float64
relLibVote2022 float64
relLibVax2022 float64
relLibHealth2022 float64
relLibHealthMandate2022 float64
relLibMarriage2022 float64
relLibRfra2022 float64
dtype: object
WEEK stateId TBIRTH_YEAR EEDUC EGENID_BIRTH \
count 16950.000000 16950.000000 16950.000000 16950.000000 16950.000000
mean 44.072153 29.320000 1971.105959 5.254572 1.533510
std 6.117397 15.624084 16.989544 1.487861 0.498891
min 34.000000 1.000000 1933.000000 1.000000 1.000000
25% 39.000000 17.000000 1957.000000 4.000000 1.000000
50% 44.000000 29.500000 1971.000000 6.000000 2.000000
75% 50.000000 42.000000 1985.000000 7.000000 2.000000
max 54.000000 56.000000 2005.000000 7.000000 2.000000
GENID_DESCRIBE SEXUAL_ORIENTATION INCOMEMIN statePopulation2020 \
count 16950.000000 16950.000000 16950.000000 1.695000e+04
mean 2.136224 2.308024 70282.595870 6.615242e+06
std 1.122674 0.850734 57495.415792 7.361604e+06
min 1.000000 1.000000 0.000000 5.768510e+05
25% 1.000000 2.000000 25000.000000 1.839106e+06
50% 2.000000 2.000000 50000.000000 4.581796e+06
75% 3.000000 2.000000 100000.000000 7.705281e+06
max 4.000000 5.000000 200000.000000 3.953822e+07
statePopulation2023 ... moderatelyReligiousStatista2017 \
count 1.695000e+04 ... 16950.000000
mean 8.960485e+06 ... 0.287200
std 1.888514e+07 ... 0.030137
min 5.808170e+05 ... 0.160000
25% 1.920562e+06 ... 0.270000
50% 4.625424e+06 ... 0.295000
75% 7.999503e+06 ... 0.300000
max 1.309280e+08 ... 0.330000
nonreligiousStatista2017 relLibScore2022 relLibVote2022 \
count 16950.000000 16950.000000 16950.000000
mean 0.342000 0.393948 0.800000
std 0.098856 0.131963 0.400012
min 0.120000 0.155800 0.000000
25% 0.290000 0.311700 1.000000
50% 0.340000 0.371200 1.000000
75% 0.400000 0.476200 1.000000
max 0.590000 0.818200 1.000000
relLibVax2022 relLibHealth2022 relLibHealthMandate2022 \
count 16950.000000 16950.00000 16950.000000
mean 0.900000 6.76000 0.640000
std 0.300009 3.98288 0.480014
min 0.000000 0.00000 0.000000
25% 1.000000 4.00000 0.000000
50% 1.000000 5.50000 1.000000
75% 1.000000 9.00000 1.000000
max 1.000000 20.00000 1.000000
relLibMarriage2022 relLibRfra2022 CUR_GENID_CAT
count 16950.000000 16950.000000 16950.000000
mean 1.160000 0.480000 1.000000
std 1.474628 0.499615 0.816521
min 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000
50% 0.000000 0.000000 1.000000
75% 3.000000 1.000000 2.000000
max 5.000000 1.000000 2.000000
[8 rows x 24 columns]
# Build kNN Classifier to sort and classify data
# reduce dimensionality based on experimentation and hypothesis criteria
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
,"SEXUAL_ORIENTATION","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
trainSize = 0.3
trainState = 1
# split datasets into training and test sets
size = 0.3
state = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=state)
# scale input data for training if necessary for better predictions
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.fit_transform(X_test)
# prep data
cv_count = 18
max_neighbors = 200 #arbitrary magic number
parameters = {"n_neighbors": np.arange(1, max_neighbors)}
# run regression
knnr = KNeighborsClassifier(n_neighbors=cv_count,weights='distance')
# best neighbor count found in testing at 127
# use gridsearch to test all values for best n_neighbors number and highest accuracy
#knnr_gscv = GridSearchCV(knnr, parameters, cv=cv_count)
#knnr_gscv.fit(X.values, y.values)
#print("Best value for neighbor count found: ",knnr_gscv.best_params_)
#print("Best Average Accuracy found: ",knnr_gscv.best_score_)
# Build the new model
# split dataset into dependent(features) and independent(target) variable
#params = knnr_gscv.best_params_
#n_count = int(params['n_neighbors'])
n_count = 18
# splt datasets into training and test sets
size = 0.3
state = 1
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=size, random_state=state)
# run regression
knnr = KNeighborsClassifier(n_neighbors=n_count,weights='distance')
knnr.fit(X_train.values, y_train.values)
pred = knnr.predict(X_test.values)
# confusion matrix for visualization is available, but unnecessary for this dataset
cfm = metrics.confusion_matrix(y_test, pred)
fig, ax = plt.subplots(figsize=(6, 6))
ax.matshow(cfm, cmap = plt.cm.Purples, alpha=0.3)
for i in range(cfm.shape[0]):
for j in range(cfm.shape[1]):
ax.text(x=j, y=i,s=cfm[i, j], va='center',
ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize = 16)
plt.ylabel('Actuals', fontsize = 16)
plt.title('kNN Confusion Matrix', fontsize = 16)
plt.show()
print(metrics.classification_report(y_test, pred, zero_division = 0))
precision recall f1-score support
0 0.42 0.43 0.43 1721
1 0.36 0.38 0.37 1650
2 0.48 0.43 0.45 1714
accuracy 0.42 5085
macro avg 0.42 0.42 0.42 5085
weighted avg 0.42 0.42 0.42 5085
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix')
precision recall f1-score support
0 0.84 0.97 0.90 3976
1 0.75 0.93 0.83 3896
2 0.83 0.51 0.63 3993
accuracy 0.80 11865
macro avg 0.81 0.80 0.79 11865
weighted avg 0.81 0.80 0.79 11865
# initialize, train and test the GNB
gnb = GaussianNB()
gnb.fit(X_train, y_train)
pred = gnb.predict(X_test)
#check accuracy
gnb_accuracy = metrics.accuracy_score(pred, y_test)
#confusion matrix
cfm = metrics.confusion_matrix(y_test, pred)
fig, ax = plt.subplots(figsize=(6,6))
ax.matshow(cfm, cmap = plt.cm.Purples, alpha=0.3)
for i in range(cfm.shape[0]):
for j in range(cfm.shape[1]):
ax.text(x=j, y=i,s=cfm[i, j], va='center',
ha='center', size='xx-large')
plt.xlabel('Predictions', fontsize = 16)
plt.ylabel('Actuals', fontsize = 16)
plt.title('Gaussian NB Confusion Matrix', fontsize = 16)
plt.show()
print(metrics.classification_report(y_test, pred, zero_division = 0))
precision recall f1-score support
0 0.44 0.41 0.42 1721
1 0.32 0.15 0.20 1650
2 0.40 0.63 0.49 1714
accuracy 0.40 5085
macro avg 0.39 0.40 0.37 5085
weighted avg 0.39 0.40 0.38 5085
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix')
precision recall f1-score support
0 0.84 0.97 0.90 3976
1 0.69 0.98 0.81 3896
2 0.87 0.39 0.54 3993
accuracy 0.78 11865
macro avg 0.80 0.78 0.75 11865
weighted avg 0.80 0.78 0.75 11865
# Setup for A/B testing
# remove "SEXUAL_ORIENTATION" column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Sexuality Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Sexuality Removed')
precision recall f1-score support
0 0.74 0.90 0.81 3976
1 0.65 0.89 0.75 3896
2 0.52 0.22 0.31 3993
accuracy 0.67 11865
macro avg 0.64 0.67 0.62 11865
weighted avg 0.64 0.67 0.62 11865
precision recall f1-score support
0 0.71 1.00 0.83 3976
1 0.69 0.90 0.78 3896
2 0.68 0.20 0.31 3993
accuracy 0.70 11865
macro avg 0.69 0.70 0.64 11865
weighted avg 0.69 0.70 0.64 11865
# Setup for A/B testing
# remove "EEDUC","INCOMEMIN" column
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","SEXUAL_ORIENTATION"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Education and Income Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Education and Income Removed')
precision recall f1-score support
0 0.84 0.97 0.90 3976
1 0.75 0.93 0.83 3896
2 0.83 0.51 0.63 3993
accuracy 0.80 11865
macro avg 0.81 0.80 0.79 11865
weighted avg 0.81 0.80 0.79 11865
precision recall f1-score support
0 0.84 0.97 0.90 3976
1 0.69 0.98 0.81 3896
2 0.87 0.39 0.54 3993
accuracy 0.78 11865
macro avg 0.80 0.78 0.75 11865
weighted avg 0.80 0.78 0.75 11865
# Setup for A/B testing
# remove ,"statePopulation2020","statePopulation2023","transAdultPop2022" column
#population table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Population Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Population Table Removed')
precision recall f1-score support
0 0.74 0.90 0.81 3976
1 0.65 0.90 0.75 3896
2 0.52 0.21 0.30 3993
accuracy 0.67 11865
macro avg 0.64 0.67 0.62 11865
weighted avg 0.64 0.67 0.62 11865
precision recall f1-score support
0 0.71 1.00 0.83 3976
1 0.69 0.91 0.78 3896
2 0.69 0.20 0.31 3993
accuracy 0.70 11865
macro avg 0.70 0.70 0.64 11865
weighted avg 0.70 0.70 0.64 11865
# Setup for A/B testing
# remove ,"antiTransLegislationRiskIndex32023" column
#anti-trans legislation table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Anti-Trans Legislation Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Anti-Trans Legislation Table Removed')
precision recall f1-score support
0 0.74 0.90 0.81 3976
1 0.65 0.89 0.75 3896
2 0.52 0.22 0.31 3993
accuracy 0.67 11865
macro avg 0.64 0.67 0.63 11865
weighted avg 0.64 0.67 0.62 11865
precision recall f1-score support
0 0.71 1.00 0.83 3976
1 0.69 0.90 0.78 3896
2 0.68 0.20 0.31 3993
accuracy 0.70 11865
macro avg 0.70 0.70 0.64 11865
weighted avg 0.70 0.70 0.64 11865
# Setup for A/B testing
# remove ,"overallReligiosityPew2014" column
#Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Pew 2014 Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Pew 2014 Table Removed')
precision recall f1-score support
0 0.74 0.90 0.81 3976
1 0.65 0.89 0.75 3896
2 0.52 0.22 0.31 3993
accuracy 0.67 11865
macro avg 0.64 0.67 0.62 11865
weighted avg 0.64 0.67 0.62 11865
precision recall f1-score support
0 0.71 1.00 0.83 3976
1 0.69 0.91 0.78 3896
2 0.68 0.20 0.31 3993
accuracy 0.70 11865
macro avg 0.70 0.70 0.64 11865
weighted avg 0.70 0.70 0.64 11865
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017" column
#Statista 2017 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Statista 2017 Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Statista 2017 Table Removed')
precision recall f1-score support
0 0.74 0.90 0.81 3976
1 0.65 0.90 0.75 3896
2 0.52 0.22 0.31 3993
accuracy 0.67 11865
macro avg 0.64 0.67 0.62 11865
weighted avg 0.64 0.67 0.62 11865
precision recall f1-score support
0 0.71 1.00 0.83 3976
1 0.69 0.90 0.78 3896
2 0.68 0.20 0.31 3993
accuracy 0.70 11865
macro avg 0.69 0.70 0.64 11865
weighted avg 0.69 0.70 0.64 11865
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014" column
#Statista 2017 and Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Statista 2017 and Pew 2014 Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Statista 2017 and Pew 2014 Table Removed')
precision recall f1-score support
0 0.74 0.90 0.81 3976
1 0.65 0.90 0.75 3896
2 0.52 0.22 0.31 3993
accuracy 0.67 11865
macro avg 0.64 0.67 0.62 11865
weighted avg 0.64 0.67 0.62 11865
precision recall f1-score support
0 0.71 1.00 0.83 3976
1 0.69 0.90 0.78 3896
2 0.68 0.20 0.31 3993
accuracy 0.70 11865
macro avg 0.69 0.70 0.64 11865
weighted avg 0.69 0.70 0.64 11865
# Setup for A/B testing
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Religious Liberty Removed')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Religious Liberty Table Removed')
precision recall f1-score support
0 0.74 0.90 0.81 3976
1 0.65 0.89 0.75 3896
2 0.52 0.23 0.32 3993
accuracy 0.67 11865
macro avg 0.64 0.67 0.63 11865
weighted avg 0.64 0.67 0.63 11865
precision recall f1-score support
0 0.71 1.00 0.83 3976
1 0.69 0.91 0.78 3896
2 0.69 0.20 0.31 3993
accuracy 0.70 11865
macro avg 0.70 0.70 0.64 11865
weighted avg 0.70 0.70 0.64 11865
# Setup for A/B testing
#,"statePopulation2020","statePopulation2023","transAdultPop2022"
#,"antiTransLegislationRiskIndex32023"
#,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printNBClassifierOutcome(X,y,trainSize,trainState,'Bernoulli Confusion Matrix Only Pulse Data')
printSVMClassifierOutcome(X,y,trainSize,trainState,'SVM Confusion Matrix Only Pulse Data')
precision recall f1-score support
0 0.74 0.90 0.81 3976
1 0.65 0.89 0.76 3896
2 0.53 0.23 0.32 3993
accuracy 0.67 11865
macro avg 0.64 0.67 0.63 11865
weighted avg 0.64 0.67 0.63 11865
precision recall f1-score support
0 0.71 1.00 0.83 3976
1 0.62 1.00 0.76 3896
2 0.00 0.00 0.00 3993
accuracy 0.66 11865
macro avg 0.44 0.67 0.53 11865
weighted avg 0.44 0.66 0.53 11865
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
,"SEXUAL_ORIENTATION","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
# regression pick
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)
sc_X = StandardScaler()
sc_y = StandardScaler()
sc_X_train = sc_X.fit_transform(X_train)
sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
sc_y_train = sc_y_train
log_regression = LogisticRegression(solver="newton-cg", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))
#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
#print(np.exp(x)/(1 + np.exp(x)))
coefArray.append(np.exp(x))
featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)
Training set score: 0.68
Test set score: 0.6743
featureNames Coefficients
0 WEEK 1.002763
1 stateId 1.000906
2 TBIRTH_YEAR 1.003440
3 EEDUC 0.005272
4 EGENID_BIRTH 1.041176
5 SEXUAL_ORIENTATION 1.000001
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:210: ConvergenceWarning: newton-cg failed to converge. Increase the number of iterations. warnings.warn(
log_regression = LogisticRegression(solver="sag", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))
#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
#print(np.exp(x)/(1 + np.exp(x)))
coefArray.append(np.exp(x)/(1 + np.exp(x)))
featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)
Training set score: 0.4144
Test set score: 0.411
featureNames Coefficients
0 WEEK 0.500001
1 stateId 0.500001
2 TBIRTH_YEAR 0.499962
3 EEDUC 0.499995
4 EGENID_BIRTH 0.500000
5 SEXUAL_ORIENTATION 0.500001
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
log_regression = LogisticRegression(solver="saga", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))
#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
#print(np.exp(x)/(1 + np.exp(x)))
coefArray.append(np.exp(x)/(1 + np.exp(x)))
featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)
Training set score: 0.4144
Test set score: 0.411
featureNames Coefficients
0 WEEK 0.500000
1 stateId 0.500000
2 TBIRTH_YEAR 0.499962
3 EEDUC 0.499997
4 EGENID_BIRTH 0.500000
5 SEXUAL_ORIENTATION 0.500001
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\linear_model\_sag.py:352: ConvergenceWarning: The max_iter was reached which means the coef_ did not converge warnings.warn(
log_regression = LogisticRegression(solver="lbfgs", random_state=0).fit(X_train,y_train)
print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))
#featTitle = "Logistic Regression Feature Coefficients"
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
coefArray = []
for x in log_regression.coef_[0]:
#print(np.exp(x)/(1 + np.exp(x)))
coefArray.append(np.exp(x)/(1 + np.exp(x)))
featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["featureNames", "Coefficients"])
print(featureDf)
Training set score: 0.4144
Test set score: 0.411
featureNames Coefficients
0 WEEK 0.499999
1 stateId 0.499999
2 TBIRTH_YEAR 0.499962
3 EEDUC 0.500000
4 EGENID_BIRTH 0.500000
5 SEXUAL_ORIENTATION 0.500001
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
,"SEXUAL_ORIENTATION","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Full Dataset")
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 1-0.3, random_state = 0)
#sc_X = StandardScaler()
#sc_y = StandardScaler()
#sc_X_train = sc_X.fit_transform(X_train)
#sc_y_train = sc_y.fit_transform(y_train.values.reshape(len(y_train),1))
#sc_y_train = sc_y_train
#maxIter=1000000000
#log_regression = LogisticRegression(max_iter=max_iter)
#solvers = ["liblinear","newton-cg","sag","saga","lbfgs"]
#penalty=["l2"]
#cVals=[0.01,0.1,1.0,10.0,100.0]
#grid=dict(solver=solvers,penalty=penalty,C=cVals)
#cv=RepeatedStratifiedKFold(n_splits=10,n_repeats=3,random_state=state)
#grid_search = GridSearchCV(estimator=log_regression, param_grid=grid, n_jobs=-1, cv=cv, scoring='accuracy',error_score=0)
#grid_result = grid_search.fit(X,y)
# summarize results
#print("Accuracy rate of Logistic Regression: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
#Accuracy rate of Logistic Regression: 0.797286 using {'C': 0.01, 'penalty': 'l2', 'solver': 'newton-cg'}
#log_regression = LogisticRegression(solver="newton-cg", random_state=state, penalty="l2", C=0.01, max_iter=maxIter).fit(X_train,y_train)
#print(log_regression.coef_)
#print(log_regression.coef_.shape[0])
#print(log_regression.coef_[0])
#print("Training set score: " + str(round(log_regression.score(X_train,y_train), decPrecision)))
#print("Test set score: " + str(round(log_regression.score(X_test,y_test), decPrecision)))
#coefArray = []
#for ind in range(log_regression.coef_.shape[0]):
# if ind == 0:
# featFor = "Cisgender Men"
# elif ind == 1:
# featFor = "Cisgender Women"
# else:
# featFor = "Transgender"
# featTitle = "Logistic Regression " + featFor + " Feature Coefficients"
# for x in log_regression.coef_[ind]:
# #print(np.exp(x)/(1 + np.exp(x)))
# coefArray.append(np.exp(x)/(1 + np.exp(x)))
# featureDf = pd.DataFrame(zip(clean_feature_list, np.transpose(coefArray)), columns=["Features", "Coefficients"])
# featureDf = featureDf.sort_values(by=["Coefficients"], ascending=False)
# print(featureDf)
# #plot bar chart of importance
# f, ax = plt.subplots(figsize=(20,12))
# sns.barplot(x=featureDf["Features"], y=featureDf["Coefficients"], palette="flare")
# plt.title(featTitle, fontsize=14)
# plt.xticks(rotation=45)
# plt.show()
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.791
Features Coefficients
3 EEDUC 0.510278
11 overallReligiosityPew2014 0.504194
12 veryReligiousStatista2017 0.501816
13 moderatelyReligiousStatista2017 0.501757
21 relLibRfra2022 0.501642
15 relLibScore2022 0.500698
20 relLibMarriage2022 0.500558
2 TBIRTH_YEAR 0.500448
18 relLibHealth2022 0.500004
6 INCOMEMIN 0.500001
7 statePopulation2020 0.500000
8 statePopulation2023 0.500000
10 transAdultPop2022 0.499999
0 WEEK 0.499949
1 stateId 0.499818
19 relLibHealthMandate2022 0.499021
14 nonreligiousStatista2017 0.499015
9 antiTransLegislationRiskIndex32023 0.498912
17 relLibVax2022 0.497051
16 relLibVote2022 0.491821
5 SEXUAL_ORIENTATION 0.406382
4 EGENID_BIRTH 0.112777
Features Coefficients 3 EEDUC 0.510278 11 overallReligiosityPew2014 0.504194 12 veryReligiousStatista2017 0.501816 13 moderatelyReligiousStatista2017 0.501757 21 relLibRfra2022 0.501642 15 relLibScore2022 0.500698 20 relLibMarriage2022 0.500558 2 TBIRTH_YEAR 0.500448 18 relLibHealth2022 0.500004 6 INCOMEMIN 0.500001 7 statePopulation2020 0.500000 8 statePopulation2023 0.500000 10 transAdultPop2022 0.499999 0 WEEK 0.499949 1 stateId 0.499818 19 relLibHealthMandate2022 0.499021 14 nonreligiousStatista2017 0.499015 9 antiTransLegislationRiskIndex32023 0.498912 17 relLibVax2022 0.497051 16 relLibVote2022 0.491821 5 SEXUAL_ORIENTATION 0.406382 4 EGENID_BIRTH 0.112777
Features Coefficients 3 EEDUC 0.510278 11 overallReligiosityPew2014 0.504194 12 veryReligiousStatista2017 0.501816 13 moderatelyReligiousStatista2017 0.501757 21 relLibRfra2022 0.501642 15 relLibScore2022 0.500698 20 relLibMarriage2022 0.500558 2 TBIRTH_YEAR 0.500448 18 relLibHealth2022 0.500004 6 INCOMEMIN 0.500001 7 statePopulation2020 0.500000 8 statePopulation2023 0.500000 10 transAdultPop2022 0.499999 0 WEEK 0.499949 1 stateId 0.499818 19 relLibHealthMandate2022 0.499021 14 nonreligiousStatista2017 0.499015 9 antiTransLegislationRiskIndex32023 0.498912 17 relLibVax2022 0.497051 16 relLibVote2022 0.491821 5 SEXUAL_ORIENTATION 0.406382 4 EGENID_BIRTH 0.112777
# Setup for A/B testing
# remove "SEXUAL_ORIENTATION" column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Sexuality Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6689
Features Coefficients
3 EEDUC 0.511605
20 relLibRfra2022 0.505645
10 overallReligiosityPew2014 0.504704
12 moderatelyReligiousStatista2017 0.502584
19 relLibMarriage2022 0.501647
11 veryReligiousStatista2017 0.501373
0 WEEK 0.500593
2 TBIRTH_YEAR 0.500314
14 relLibScore2022 0.500257
8 antiTransLegislationRiskIndex32023 0.500143
5 INCOMEMIN 0.500001
6 statePopulation2020 0.500000
7 statePopulation2023 0.500000
9 transAdultPop2022 0.500000
17 relLibHealth2022 0.499961
1 stateId 0.499648
13 nonreligiousStatista2017 0.499627
16 relLibVax2022 0.497257
18 relLibHealthMandate2022 0.496243
15 relLibVote2022 0.489497
4 EGENID_BIRTH 0.112855
Features Coefficients 3 EEDUC 0.511605 20 relLibRfra2022 0.505645 10 overallReligiosityPew2014 0.504704 12 moderatelyReligiousStatista2017 0.502584 19 relLibMarriage2022 0.501647 11 veryReligiousStatista2017 0.501373 0 WEEK 0.500593 2 TBIRTH_YEAR 0.500314 14 relLibScore2022 0.500257 8 antiTransLegislationRiskIndex32023 0.500143 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 9 transAdultPop2022 0.500000 17 relLibHealth2022 0.499961 1 stateId 0.499648 13 nonreligiousStatista2017 0.499627 16 relLibVax2022 0.497257 18 relLibHealthMandate2022 0.496243 15 relLibVote2022 0.489497 4 EGENID_BIRTH 0.112855
Features Coefficients 3 EEDUC 0.511605 20 relLibRfra2022 0.505645 10 overallReligiosityPew2014 0.504704 12 moderatelyReligiousStatista2017 0.502584 19 relLibMarriage2022 0.501647 11 veryReligiousStatista2017 0.501373 0 WEEK 0.500593 2 TBIRTH_YEAR 0.500314 14 relLibScore2022 0.500257 8 antiTransLegislationRiskIndex32023 0.500143 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 9 transAdultPop2022 0.500000 17 relLibHealth2022 0.499961 1 stateId 0.499648 13 nonreligiousStatista2017 0.499627 16 relLibVax2022 0.497257 18 relLibHealthMandate2022 0.496243 15 relLibVote2022 0.489497 4 EGENID_BIRTH 0.112855
# Setup for A/B testing
# remove "EGENID_BIRTH", column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","SEXUAL_ORIENTATION","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Sexuality Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.5102
Features Coefficients
3 EEDUC 0.510443
10 overallReligiosityPew2014 0.503726
18 relLibHealthMandate2022 0.502791
20 relLibRfra2022 0.502573
11 veryReligiousStatista2017 0.501265
12 moderatelyReligiousStatista2017 0.501013
8 antiTransLegislationRiskIndex32023 0.500868
0 WEEK 0.500678
14 relLibScore2022 0.500242
2 TBIRTH_YEAR 0.500045
5 INCOMEMIN 0.500001
6 statePopulation2020 0.500000
7 statePopulation2023 0.500000
9 transAdultPop2022 0.499999
1 stateId 0.499939
13 nonreligiousStatista2017 0.499419
17 relLibHealth2022 0.498873
19 relLibMarriage2022 0.497932
16 relLibVax2022 0.497623
15 relLibVote2022 0.490879
4 SEXUAL_ORIENTATION 0.402553
Features Coefficients 3 EEDUC 0.510443 10 overallReligiosityPew2014 0.503726 18 relLibHealthMandate2022 0.502791 20 relLibRfra2022 0.502573 11 veryReligiousStatista2017 0.501265 12 moderatelyReligiousStatista2017 0.501013 8 antiTransLegislationRiskIndex32023 0.500868 0 WEEK 0.500678 14 relLibScore2022 0.500242 2 TBIRTH_YEAR 0.500045 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 9 transAdultPop2022 0.499999 1 stateId 0.499939 13 nonreligiousStatista2017 0.499419 17 relLibHealth2022 0.498873 19 relLibMarriage2022 0.497932 16 relLibVax2022 0.497623 15 relLibVote2022 0.490879 4 SEXUAL_ORIENTATION 0.402553
Features Coefficients 3 EEDUC 0.510443 10 overallReligiosityPew2014 0.503726 18 relLibHealthMandate2022 0.502791 20 relLibRfra2022 0.502573 11 veryReligiousStatista2017 0.501265 12 moderatelyReligiousStatista2017 0.501013 8 antiTransLegislationRiskIndex32023 0.500868 0 WEEK 0.500678 14 relLibScore2022 0.500242 2 TBIRTH_YEAR 0.500045 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 9 transAdultPop2022 0.499999 1 stateId 0.499939 13 nonreligiousStatista2017 0.499419 17 relLibHealth2022 0.498873 19 relLibMarriage2022 0.497932 16 relLibVax2022 0.497623 15 relLibVote2022 0.490879 4 SEXUAL_ORIENTATION 0.402553
# Setup for A/B testing
# remove "EEDUC","INCOMEMIN" column
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","SEXUAL_ORIENTATION"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Education and Income Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.7853
Features Coefficients
9 overallReligiosityPew2014 0.503246
11 moderatelyReligiousStatista2017 0.501590
19 relLibRfra2022 0.501426
10 veryReligiousStatista2017 0.501360
13 relLibScore2022 0.500738
18 relLibMarriage2022 0.500612
2 TBIRTH_YEAR 0.500497
15 relLibVax2022 0.500336
16 relLibHealth2022 0.500105
5 statePopulation2020 0.500000
6 statePopulation2023 0.500000
7 transAdultPop2022 0.499999
1 stateId 0.499872
0 WEEK 0.499861
12 nonreligiousStatista2017 0.499604
17 relLibHealthMandate2022 0.499491
8 antiTransLegislationRiskIndex32023 0.496482
14 relLibVote2022 0.494724
4 SEXUAL_ORIENTATION 0.403340
3 EGENID_BIRTH 0.114981
Features Coefficients 9 overallReligiosityPew2014 0.503246 11 moderatelyReligiousStatista2017 0.501590 19 relLibRfra2022 0.501426 10 veryReligiousStatista2017 0.501360 13 relLibScore2022 0.500738 18 relLibMarriage2022 0.500612 2 TBIRTH_YEAR 0.500497 15 relLibVax2022 0.500336 16 relLibHealth2022 0.500105 5 statePopulation2020 0.500000 6 statePopulation2023 0.500000 7 transAdultPop2022 0.499999 1 stateId 0.499872 0 WEEK 0.499861 12 nonreligiousStatista2017 0.499604 17 relLibHealthMandate2022 0.499491 8 antiTransLegislationRiskIndex32023 0.496482 14 relLibVote2022 0.494724 4 SEXUAL_ORIENTATION 0.403340 3 EGENID_BIRTH 0.114981
Features Coefficients 9 overallReligiosityPew2014 0.503246 11 moderatelyReligiousStatista2017 0.501590 19 relLibRfra2022 0.501426 10 veryReligiousStatista2017 0.501360 13 relLibScore2022 0.500738 18 relLibMarriage2022 0.500612 2 TBIRTH_YEAR 0.500497 15 relLibVax2022 0.500336 16 relLibHealth2022 0.500105 5 statePopulation2020 0.500000 6 statePopulation2023 0.500000 7 transAdultPop2022 0.499999 1 stateId 0.499872 0 WEEK 0.499861 12 nonreligiousStatista2017 0.499604 17 relLibHealthMandate2022 0.499491 8 antiTransLegislationRiskIndex32023 0.496482 14 relLibVote2022 0.494724 4 SEXUAL_ORIENTATION 0.403340 3 EGENID_BIRTH 0.114981
# Setup for A/B testing
# remove ,"statePopulation2020","statePopulation2023","transAdultPop2022" column
#population table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Population Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.7093
Features Coefficients
4 EEDUC 0.509532
17 relLibRfra2022 0.505297
16 relLibMarriage2022 0.502339
7 overallReligiosityPew2014 0.502115
6 antiTransLegislationRiskIndex32023 0.501436
9 moderatelyReligiousStatista2017 0.501296
0 WEEK 0.500696
8 veryReligiousStatista2017 0.500224
11 relLibScore2022 0.500088
5 INCOMEMIN 0.500001
14 relLibHealth2022 0.499730
1 stateId 0.499727
10 nonreligiousStatista2017 0.498659
2 TBIRTH_YEAR 0.498232
15 relLibHealthMandate2022 0.495723
13 relLibVax2022 0.493200
12 relLibVote2022 0.489051
3 EGENID_BIRTH 0.111752
Features Coefficients 4 EEDUC 0.509532 17 relLibRfra2022 0.505297 16 relLibMarriage2022 0.502339 7 overallReligiosityPew2014 0.502115 6 antiTransLegislationRiskIndex32023 0.501436 9 moderatelyReligiousStatista2017 0.501296 0 WEEK 0.500696 8 veryReligiousStatista2017 0.500224 11 relLibScore2022 0.500088 5 INCOMEMIN 0.500001 14 relLibHealth2022 0.499730 1 stateId 0.499727 10 nonreligiousStatista2017 0.498659 2 TBIRTH_YEAR 0.498232 15 relLibHealthMandate2022 0.495723 13 relLibVax2022 0.493200 12 relLibVote2022 0.489051 3 EGENID_BIRTH 0.111752
Features Coefficients 4 EEDUC 0.509532 17 relLibRfra2022 0.505297 16 relLibMarriage2022 0.502339 7 overallReligiosityPew2014 0.502115 6 antiTransLegislationRiskIndex32023 0.501436 9 moderatelyReligiousStatista2017 0.501296 0 WEEK 0.500696 8 veryReligiousStatista2017 0.500224 11 relLibScore2022 0.500088 5 INCOMEMIN 0.500001 14 relLibHealth2022 0.499730 1 stateId 0.499727 10 nonreligiousStatista2017 0.498659 2 TBIRTH_YEAR 0.498232 15 relLibHealthMandate2022 0.495723 13 relLibVax2022 0.493200 12 relLibVote2022 0.489051 3 EGENID_BIRTH 0.111752
# Setup for A/B testing
# remove ,"antiTransLegislationRiskIndex32023" column
#anti-trans legislation table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Anti-Trans Legislation Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6684
Features Coefficients
4 EEDUC 0.511643
19 relLibRfra2022 0.505801
9 overallReligiosityPew2014 0.505390
11 moderatelyReligiousStatista2017 0.502597
10 veryReligiousStatista2017 0.502021
18 relLibMarriage2022 0.501604
0 WEEK 0.500581
13 relLibScore2022 0.500353
2 TBIRTH_YEAR 0.500315
5 INCOMEMIN 0.500001
6 statePopulation2020 0.500000
7 statePopulation2023 0.500000
8 transAdultPop2022 0.500000
16 relLibHealth2022 0.499941
1 stateId 0.499647
12 nonreligiousStatista2017 0.498976
15 relLibVax2022 0.497316
17 relLibHealthMandate2022 0.496308
14 relLibVote2022 0.489418
3 EGENID_BIRTH 0.112749
Features Coefficients 4 EEDUC 0.511643 19 relLibRfra2022 0.505801 9 overallReligiosityPew2014 0.505390 11 moderatelyReligiousStatista2017 0.502597 10 veryReligiousStatista2017 0.502021 18 relLibMarriage2022 0.501604 0 WEEK 0.500581 13 relLibScore2022 0.500353 2 TBIRTH_YEAR 0.500315 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 16 relLibHealth2022 0.499941 1 stateId 0.499647 12 nonreligiousStatista2017 0.498976 15 relLibVax2022 0.497316 17 relLibHealthMandate2022 0.496308 14 relLibVote2022 0.489418 3 EGENID_BIRTH 0.112749
Features Coefficients 4 EEDUC 0.511643 19 relLibRfra2022 0.505801 9 overallReligiosityPew2014 0.505390 11 moderatelyReligiousStatista2017 0.502597 10 veryReligiousStatista2017 0.502021 18 relLibMarriage2022 0.501604 0 WEEK 0.500581 13 relLibScore2022 0.500353 2 TBIRTH_YEAR 0.500315 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 16 relLibHealth2022 0.499941 1 stateId 0.499647 12 nonreligiousStatista2017 0.498976 15 relLibVax2022 0.497316 17 relLibHealthMandate2022 0.496308 14 relLibVote2022 0.489418 3 EGENID_BIRTH 0.112749
# Setup for A/B testing
# remove ,"overallReligiosityPew2014" column
#Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Pew 2014 Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6694
Features Coefficients
4 EEDUC 0.511619
19 relLibRfra2022 0.505808
11 moderatelyReligiousStatista2017 0.502805
18 relLibMarriage2022 0.501627
10 veryReligiousStatista2017 0.501534
0 WEEK 0.500607
13 relLibScore2022 0.500326
2 TBIRTH_YEAR 0.500317
9 antiTransLegislationRiskIndex32023 0.500300
5 INCOMEMIN 0.500001
6 statePopulation2020 0.500000
7 statePopulation2023 0.500000
8 transAdultPop2022 0.500000
16 relLibHealth2022 0.499978
1 stateId 0.499649
12 nonreligiousStatista2017 0.499524
15 relLibVax2022 0.497192
17 relLibHealthMandate2022 0.496067
14 relLibVote2022 0.489147
3 EGENID_BIRTH 0.111512
Features Coefficients 4 EEDUC 0.511619 19 relLibRfra2022 0.505808 11 moderatelyReligiousStatista2017 0.502805 18 relLibMarriage2022 0.501627 10 veryReligiousStatista2017 0.501534 0 WEEK 0.500607 13 relLibScore2022 0.500326 2 TBIRTH_YEAR 0.500317 9 antiTransLegislationRiskIndex32023 0.500300 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 16 relLibHealth2022 0.499978 1 stateId 0.499649 12 nonreligiousStatista2017 0.499524 15 relLibVax2022 0.497192 17 relLibHealthMandate2022 0.496067 14 relLibVote2022 0.489147 3 EGENID_BIRTH 0.111512
Features Coefficients 4 EEDUC 0.511619 19 relLibRfra2022 0.505808 11 moderatelyReligiousStatista2017 0.502805 18 relLibMarriage2022 0.501627 10 veryReligiousStatista2017 0.501534 0 WEEK 0.500607 13 relLibScore2022 0.500326 2 TBIRTH_YEAR 0.500317 9 antiTransLegislationRiskIndex32023 0.500300 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 16 relLibHealth2022 0.499978 1 stateId 0.499649 12 nonreligiousStatista2017 0.499524 15 relLibVax2022 0.497192 17 relLibHealthMandate2022 0.496067 14 relLibVote2022 0.489147 3 EGENID_BIRTH 0.111512
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017" column
#Statista 2017 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Statista 2017 Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6689
Features Coefficients
4 EEDUC 0.511610
17 relLibRfra2022 0.505686
10 overallReligiosityPew2014 0.504812
16 relLibMarriage2022 0.501656
0 WEEK 0.500590
2 TBIRTH_YEAR 0.500315
11 relLibScore2022 0.500276
9 antiTransLegislationRiskIndex32023 0.500180
5 INCOMEMIN 0.500001
6 statePopulation2020 0.500000
7 statePopulation2023 0.500000
8 transAdultPop2022 0.500000
14 relLibHealth2022 0.499969
1 stateId 0.499648
13 relLibVax2022 0.497284
15 relLibHealthMandate2022 0.496187
12 relLibVote2022 0.489382
3 EGENID_BIRTH 0.112520
Features Coefficients 4 EEDUC 0.511610 17 relLibRfra2022 0.505686 10 overallReligiosityPew2014 0.504812 16 relLibMarriage2022 0.501656 0 WEEK 0.500590 2 TBIRTH_YEAR 0.500315 11 relLibScore2022 0.500276 9 antiTransLegislationRiskIndex32023 0.500180 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 14 relLibHealth2022 0.499969 1 stateId 0.499648 13 relLibVax2022 0.497284 15 relLibHealthMandate2022 0.496187 12 relLibVote2022 0.489382 3 EGENID_BIRTH 0.112520
Features Coefficients 4 EEDUC 0.511610 17 relLibRfra2022 0.505686 10 overallReligiosityPew2014 0.504812 16 relLibMarriage2022 0.501656 0 WEEK 0.500590 2 TBIRTH_YEAR 0.500315 11 relLibScore2022 0.500276 9 antiTransLegislationRiskIndex32023 0.500180 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 14 relLibHealth2022 0.499969 1 stateId 0.499648 13 relLibVax2022 0.497284 15 relLibHealthMandate2022 0.496187 12 relLibVote2022 0.489382 3 EGENID_BIRTH 0.112520
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014" column
#Statista 2017 and Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Statista 2017 and Pew 2014 Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6802
Features Coefficients
4 EEDUC 0.510759
16 relLibRfra2022 0.506450
9 antiTransLegislationRiskIndex32023 0.501529
15 relLibMarriage2022 0.501230
0 WEEK 0.500592
13 relLibHealth2022 0.500099
5 INCOMEMIN 0.500001
6 statePopulation2020 0.500000
7 statePopulation2023 0.500000
8 transAdultPop2022 0.500000
2 TBIRTH_YEAR 0.499922
1 stateId 0.499648
14 relLibHealthMandate2022 0.497649
12 relLibVax2022 0.494921
10 relLibScore2022 0.491488
11 relLibVote2022 0.488917
3 EGENID_BIRTH 0.109164
Features Coefficients 4 EEDUC 0.510759 16 relLibRfra2022 0.506450 9 antiTransLegislationRiskIndex32023 0.501529 15 relLibMarriage2022 0.501230 0 WEEK 0.500592 13 relLibHealth2022 0.500099 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 2 TBIRTH_YEAR 0.499922 1 stateId 0.499648 14 relLibHealthMandate2022 0.497649 12 relLibVax2022 0.494921 10 relLibScore2022 0.491488 11 relLibVote2022 0.488917 3 EGENID_BIRTH 0.109164
Features Coefficients 4 EEDUC 0.510759 16 relLibRfra2022 0.506450 9 antiTransLegislationRiskIndex32023 0.501529 15 relLibMarriage2022 0.501230 0 WEEK 0.500592 13 relLibHealth2022 0.500099 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.500000 2 TBIRTH_YEAR 0.499922 1 stateId 0.499648 14 relLibHealthMandate2022 0.497649 12 relLibVax2022 0.494921 10 relLibScore2022 0.491488 11 relLibVote2022 0.488917 3 EGENID_BIRTH 0.109164
# Setup for A/B testing
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Religious Liberty Removed")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
warnings.warn("Line Search failed")
Logistic Regression Accuracy: 0.6761
Features Coefficients
4 EEDUC 0.510815
13 overallReligiosityPew2014 0.506299
11 moderatelyReligiousStatista2017 0.504494
10 veryReligiousStatista2017 0.501713
9 antiTransLegislationRiskIndex32023 0.500621
0 WEEK 0.500606
12 nonreligiousStatista2017 0.500278
2 TBIRTH_YEAR 0.500069
5 INCOMEMIN 0.500001
6 statePopulation2020 0.500000
7 statePopulation2023 0.500000
8 transAdultPop2022 0.499999
1 stateId 0.499662
3 EGENID_BIRTH 0.111579
Features Coefficients 4 EEDUC 0.510815 13 overallReligiosityPew2014 0.506299 11 moderatelyReligiousStatista2017 0.504494 10 veryReligiousStatista2017 0.501713 9 antiTransLegislationRiskIndex32023 0.500621 0 WEEK 0.500606 12 nonreligiousStatista2017 0.500278 2 TBIRTH_YEAR 0.500069 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.499999 1 stateId 0.499662 3 EGENID_BIRTH 0.111579
Features Coefficients 4 EEDUC 0.510815 13 overallReligiosityPew2014 0.506299 11 moderatelyReligiousStatista2017 0.504494 10 veryReligiousStatista2017 0.501713 9 antiTransLegislationRiskIndex32023 0.500621 0 WEEK 0.500606 12 nonreligiousStatista2017 0.500278 2 TBIRTH_YEAR 0.500069 5 INCOMEMIN 0.500001 6 statePopulation2020 0.500000 7 statePopulation2023 0.500000 8 transAdultPop2022 0.499999 1 stateId 0.499662 3 EGENID_BIRTH 0.111579
# Setup for A/B testing
#,"statePopulation2020","statePopulation2023","transAdultPop2022"
#,"antiTransLegislationRiskIndex32023"
#,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
LogRegressionOutcome(X,y,trainSize,trainState,"Only Pulse Data")
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:478: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:327: LineSearchWarning: The line search algorithm did not converge
warn('The line search algorithm did not converge', LineSearchWarning)
Logistic Regression Accuracy: 0.7099
Features Coefficients
4 EEDUC 0.509491
0 WEEK 0.500709
5 INCOMEMIN 0.500002
1 stateId 0.499759
2 TBIRTH_YEAR 0.498227
3 EGENID_BIRTH 0.111782
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\scipy\optimize\linesearch.py:437: LineSearchWarning: Rounding errors prevent the line search from converging
warn(msg, LineSearchWarning)
C:\Users\Warlock1111\AppData\Local\Programs\Python\Python39\lib\site-packages\sklearn\utils\optimize.py:203: UserWarning: Line Search failed
warnings.warn("Line Search failed")
Features Coefficients 4 EEDUC 0.509491 0 WEEK 0.500709 5 INCOMEMIN 0.500002 1 stateId 0.499759 2 TBIRTH_YEAR 0.498227 3 EGENID_BIRTH 0.111782
Features Coefficients 4 EEDUC 0.509491 0 WEEK 0.500709 5 INCOMEMIN 0.500002 1 stateId 0.499759 2 TBIRTH_YEAR 0.498227 3 EGENID_BIRTH 0.111782
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH"
,"SEXUAL_ORIENTATION","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState, "Random Forest Feature Importance")
Random Forest (500 Tree) Regression Accuracy: 0.4966
Features Importance
5 SEXUAL_ORIENTATION 0.283867
4 EGENID_BIRTH 0.247087
2 TBIRTH_YEAR 0.115527
0 WEEK 0.082819
6 INCOMEMIN 0.051162
3 EEDUC 0.041775
1 stateId 0.024697
8 statePopulation2023 0.019611
15 relLibScore2022 0.017565
7 statePopulation2020 0.014974
11 overallReligiosityPew2014 0.014905
10 transAdultPop2022 0.014747
18 relLibHealth2022 0.014445
13 moderatelyReligiousStatista2017 0.011986
12 veryReligiousStatista2017 0.011783
14 nonreligiousStatista2017 0.011340
9 antiTransLegislationRiskIndex32023 0.007486
20 relLibMarriage2022 0.005403
19 relLibHealthMandate2022 0.002955
21 relLibRfra2022 0.002738
16 relLibVote2022 0.002267
17 relLibVax2022 0.000862
# Setup for A/B testing
# remove "SEXUAL_ORIENTATION" column
# split dataset into dependent(features) and independent(target) variable
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EEDUC","EGENID_BIRTH","INCOMEMIN"
,"statePopulation2020","statePopulation2023","antiTransLegislationRiskIndex32023"
,"transAdultPop2022","overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Sexuality Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2711
Features Importance
4 EGENID_BIRTH 0.239197
2 TBIRTH_YEAR 0.200776
0 WEEK 0.127075
5 INCOMEMIN 0.087728
3 EEDUC 0.068879
1 stateId 0.041188
14 relLibScore2022 0.030655
7 statePopulation2023 0.025917
9 transAdultPop2022 0.023027
10 overallReligiosityPew2014 0.022958
6 statePopulation2020 0.021619
17 relLibHealth2022 0.021323
12 moderatelyReligiousStatista2017 0.019417
11 veryReligiousStatista2017 0.018438
13 nonreligiousStatista2017 0.017602
8 antiTransLegislationRiskIndex32023 0.011535
19 relLibMarriage2022 0.009055
20 relLibRfra2022 0.004458
18 relLibHealthMandate2022 0.004111
15 relLibVote2022 0.003277
16 relLibVax2022 0.001768
# Setup for A/B testing
# remove "EEDUC","INCOMEMIN" column
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","SEXUAL_ORIENTATION"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Education and Income Removed")
Random Forest (500 Tree) Regression Accuracy: 0.4744
Features Importance
4 SEXUAL_ORIENTATION 0.285457
3 EGENID_BIRTH 0.248089
2 TBIRTH_YEAR 0.166847
0 WEEK 0.121240
1 stateId 0.024385
6 statePopulation2023 0.018840
13 relLibScore2022 0.017829
16 relLibHealth2022 0.014760
5 statePopulation2020 0.014618
7 transAdultPop2022 0.014434
9 overallReligiosityPew2014 0.014231
10 veryReligiousStatista2017 0.012056
12 nonreligiousStatista2017 0.012010
11 moderatelyReligiousStatista2017 0.011817
8 antiTransLegislationRiskIndex32023 0.008044
18 relLibMarriage2022 0.005661
17 relLibHealthMandate2022 0.003300
19 relLibRfra2022 0.002953
14 relLibVote2022 0.002526
15 relLibVax2022 0.000904
# Setup for A/B testing
# remove ,"statePopulation2020","statePopulation2023","transAdultPop2022" column
#population table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Population Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2717
Features Importance
3 EGENID_BIRTH 0.239197
2 TBIRTH_YEAR 0.200083
0 WEEK 0.127960
5 INCOMEMIN 0.087268
4 EEDUC 0.069257
1 stateId 0.054054
11 relLibScore2022 0.041764
7 overallReligiosityPew2014 0.030534
14 relLibHealth2022 0.028542
9 moderatelyReligiousStatista2017 0.025326
8 veryReligiousStatista2017 0.024941
10 nonreligiousStatista2017 0.023795
6 antiTransLegislationRiskIndex32023 0.016182
16 relLibMarriage2022 0.011925
17 relLibRfra2022 0.006150
15 relLibHealthMandate2022 0.005858
12 relLibVote2022 0.004419
13 relLibVax2022 0.002745
# Setup for A/B testing
# remove ,"antiTransLegislationRiskIndex32023" column
#anti-trans legislation table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"overallReligiosityPew2014"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Anti-Trans Legislation Removed")
Random Forest (500 Tree) Regression Accuracy: 0.271
Features Importance
3 EGENID_BIRTH 0.239197
2 TBIRTH_YEAR 0.201075
0 WEEK 0.127463
5 INCOMEMIN 0.087390
4 EEDUC 0.069082
1 stateId 0.042216
13 relLibScore2022 0.031651
7 statePopulation2023 0.026780
8 transAdultPop2022 0.024179
9 overallReligiosityPew2014 0.024081
6 statePopulation2020 0.021919
16 relLibHealth2022 0.021759
11 moderatelyReligiousStatista2017 0.020444
10 veryReligiousStatista2017 0.019535
12 nonreligiousStatista2017 0.019072
18 relLibMarriage2022 0.009757
19 relLibRfra2022 0.004788
17 relLibHealthMandate2022 0.004328
14 relLibVote2022 0.003453
15 relLibVax2022 0.001831
# Setup for A/B testing
# remove ,"overallReligiosityPew2014" column
#Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Pew 2014 Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2723
Features Importance
3 EGENID_BIRTH 0.239197
2 TBIRTH_YEAR 0.200986
0 WEEK 0.127489
5 INCOMEMIN 0.087619
4 EEDUC 0.068820
1 stateId 0.043777
13 relLibScore2022 0.032480
7 statePopulation2023 0.027228
8 transAdultPop2022 0.024436
16 relLibHealth2022 0.023153
6 statePopulation2020 0.023089
12 nonreligiousStatista2017 0.022216
10 veryReligiousStatista2017 0.021502
11 moderatelyReligiousStatista2017 0.020735
9 antiTransLegislationRiskIndex32023 0.012801
18 relLibMarriage2022 0.009710
19 relLibRfra2022 0.004750
17 relLibHealthMandate2022 0.004410
14 relLibVote2022 0.003699
15 relLibVax2022 0.001903
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017" column
#Statista 2017 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"overallReligiosityPew2014"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Statista 2017 Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2722
Features Importance
3 EGENID_BIRTH 0.239197
2 TBIRTH_YEAR 0.201996
0 WEEK 0.128081
5 INCOMEMIN 0.087045
4 EEDUC 0.068836
1 stateId 0.048270
11 relLibScore2022 0.036531
10 overallReligiosityPew2014 0.034922
7 statePopulation2023 0.031234
8 transAdultPop2022 0.028197
14 relLibHealth2022 0.026239
6 statePopulation2020 0.025789
9 antiTransLegislationRiskIndex32023 0.015637
16 relLibMarriage2022 0.011671
17 relLibRfra2022 0.005378
15 relLibHealthMandate2022 0.004677
12 relLibVote2022 0.004106
13 relLibVax2022 0.002194
# Setup for A/B testing
# remove ,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014" column
#Statista 2017 and Pew 2014 table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022"
,"relLibMarriage2022","relLibRfra2022"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Statista 2017 and Pew 2014 Removed")
Random Forest (500 Tree) Regression Accuracy: 0.274
Features Importance
3 EGENID_BIRTH 0.239197
2 TBIRTH_YEAR 0.202868
0 WEEK 0.128257
5 INCOMEMIN 0.087695
4 EEDUC 0.068477
1 stateId 0.055199
10 relLibScore2022 0.040815
7 statePopulation2023 0.034473
8 transAdultPop2022 0.031218
13 relLibHealth2022 0.030359
6 statePopulation2020 0.028814
9 antiTransLegislationRiskIndex32023 0.020202
15 relLibMarriage2022 0.013167
16 relLibRfra2022 0.006054
11 relLibVote2022 0.005467
14 relLibHealthMandate2022 0.005383
12 relLibVax2022 0.002356
# Setup for A/B testing
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"
,"statePopulation2020","statePopulation2023","transAdultPop2022"
,"antiTransLegislationRiskIndex32023"
,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Religious Liberty Removed")
Random Forest (500 Tree) Regression Accuracy: 0.2721
Features Importance
3 EGENID_BIRTH 0.239197
2 TBIRTH_YEAR 0.201343
0 WEEK 0.129005
5 INCOMEMIN 0.087782
4 EEDUC 0.069254
1 stateId 0.054400
7 statePopulation2023 0.034165
13 overallReligiosityPew2014 0.031626
8 transAdultPop2022 0.031116
6 statePopulation2020 0.029118
11 moderatelyReligiousStatista2017 0.026989
10 veryReligiousStatista2017 0.025652
12 nonreligiousStatista2017 0.024772
9 antiTransLegislationRiskIndex32023 0.015582
# Setup for A/B testing
#,"statePopulation2020","statePopulation2023","transAdultPop2022"
#,"antiTransLegislationRiskIndex32023"
#,"veryReligiousStatista2017","moderatelyReligiousStatista2017","nonreligiousStatista2017","overallReligiosityPew2014"
# remove ,"relLibScore2022","relLibVote2022","relLibVax2022","relLibHealth2022","relLibHealthMandate2022","relLibMarriage2022","relLibRfra2022" column
#Religious Liberty table
clean_feature_list = ["WEEK","stateId","TBIRTH_YEAR","EGENID_BIRTH","EEDUC","INCOMEMIN"]
X = dfClean[clean_feature_list]
y = dfClean["CUR_GENID_CAT"]
printRFRClassifierOutcome(X,y,trainSize,trainState,"Random Forest Feature Importance Only Pulse Data")
Random Forest (500 Tree) Regression Accuracy: 0.2661
Features Importance
3 EGENID_BIRTH 0.239197
2 TBIRTH_YEAR 0.226661
1 stateId 0.201604
0 WEEK 0.152386
5 INCOMEMIN 0.098260
4 EEDUC 0.081893