-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathsampleanalysis
126 lines (109 loc) · 5.48 KB
/
sampleanalysis
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#This imports the proper Python libraries.
import pandas
import numpy
import statsmodels.api as sm
import statsmodels.formula.api as smf #For ANOVA analysis
import seaborn
import matplotlib.pyplot as plt #For making plots.
import statsmodels.stats.multicomp as multi #For Tukey POST HOC analysis
import scipy.stats
#Show floats in fixed-point notation with six decimals (the '%f' format) whenever pandas prints them.
pandas.set_option('display.float_format', lambda value: '%f' % value)
#Load the dataset; low_memory=False has pandas infer each column's dtype from the whole file rather than chunk by chunk.
data = pandas.read_csv(
    "county_2015-2018_v4_49.04_MBonly_COMMSUPcombined_ASIANremovedJAILCOMSUPPonly_better.csv",
    low_memory=False,
)
#Explanatory variables that are stored as pandas categoricals.
for column in ("race", "sex", "cs_disp_less30", "judge"):
    data[column] = data[column].astype("category")
#Explanatory variables (atty_typ_desc, age) and the response variable (fnl_disp_desc) are coerced to numbers;
#errors="coerce" turns any entry that cannot be parsed into NaN instead of raising.
for column in ("atty_typ_desc", "age", "fnl_disp_desc"):
    data[column] = pandas.to_numeric(data[column], errors="coerce")
#Codebook printed at the top of the report: one line per variable describing its role and coding.
variable_descriptions = (
    "*****Variables!*****",
    "(1) race is a two-level categorical explanatory variable, 0=white and 1=black",
    "(2) sex is a two-level categorical explanatory variable, 0=male and 1=female",
    "(3) atty_typ_desc is a two-level categorical explanatory variable, 0=appointed and 1=hired",
    "(4) cs_disp_less30 is a two-level categorical explanatory variable, 0=sentenced more than 30 days after arrest and 1=sentenced within 30 days after arrest",
    "(5) judge is a categorical explanatory variable with 6 levels, which represents the different judges who passed the sentence, 0=CC3, 1=CC5, 2=CC6, 3=CC7, 4=CC8, 5=CC9",
    "(6) age is a numerical explanatory variable",
    "(7) Final Disposition Description is a two-level categorical response variable, 0=jail and 1=probation",
)
for description in variable_descriptions:
    print(description)
print("*****Frequency Tables!*****")
#Frequency tables for every variable, used to confirm that the values were coded as described above.
#Each entry is (column, table heading, caption for the counts, caption for the percentages).
#NOTE: the counts keep missing values as their own row (dropna=False), whereas the percentages
#use the value_counts default and are therefore computed over non-missing values only.
frequency_tables = (
    (
        "race",
        "Race Frequency Table: 0=white and 1=black",
        "Count of Each Defendant's Race",
        "Percentage of Each Defendant's Race",
    ),
    (
        "sex",
        "Sex Frequency Table: 0=male and 1=female",
        "Count of Each Defendant's Sex",
        "Percentage of Each Defendant's Sex",
    ),
    (
        "atty_typ_desc",
        "Attorney Type Frequency Table: 0=appointed and 1=hired",
        "Count of Each Defendant's Attorney Type",
        "Percentage of Each Defendant's Attorney Type",
    ),
    (
        "cs_disp_less30",
        "Whether the Sentence Was Within 30 Days of Arrest Frequency Table: 0=sentenced more than 30 days after arrest and 1=sentenced within 30 days after arrest",
        "Count of Whether the Sentence Was Within 30 Days of Arrest",
        "Percentage of Whether the Sentence Was Within 30 Days of Arrest",
    ),
    (
        "judge",
        "Judge Table: 0=CC3, 1=CC5, 2=CC6, 3=CC7, 4=CC8, 5=CC9",
        "Count of Each Court/Judge",
        "Percentage of Each Court/Judge",
    ),
    (
        "age",
        "Age Table: Numerical",
        "Count of Each Defendant's Age",
        "Percentage of Each Defendant's Age",
    ),
    (
        "fnl_disp_desc",
        "Final Disposition Description Table: 0=jail, 1=probation",
        "Count of Each Defendant's Final Disposition Description",
        "Percentage of Each Defendant's Final Disposition Description",
    ),
)
for column, heading, count_caption, share_caption in frequency_tables:
    print(heading)
    #Raw counts per value, in the order pandas encounters them (sort=False).
    counts = data[column].value_counts(sort=False, dropna=False)
    print(count_caption)
    print(counts)
    #Relative frequencies (fractions that sum to 1) per value.
    shares = data[column].value_counts(sort=False, normalize=True)
    print(share_caption)
    print(shares)
print("*****Centering My Numerical Explanatory Variable!*****")
#Center age on its sample mean (NaN entries are skipped by mean()) and store the result as age_c.
age_mean = data['age'].mean()
data['age_c'] = data['age'] - age_mean
#Sanity check: after centering, the mean of age_c should be 0 up to floating-point error.
print("Mean of numerical variable should be close to 0:")
print(data['age_c'].mean())
print("*****Logistic Regression Analysis!*****")
print("All of the variables")
#Full model: final disposition (fnl_disp_desc) regressed on every explanatory variable, with age centered.
full_formula = 'fnl_disp_desc ~ race + sex + cs_disp_less30 + judge + age_c + atty_typ_desc'
full_model = smf.logit(formula=full_formula, data=data).fit()
print(full_model.summary())
#Post hoc model: attorney type regressed on centered age, to look at how these two explanatory variables relate to each other.
print(" ")
print("--------------------------------------------------------------------------------------------------")
print(" ")
print("**********Post hoc test on just attorney type and age of defendant**********")
posthoc_model = smf.logit(formula='atty_typ_desc ~ age_c', data=data).fit()
print(posthoc_model.summary())
#Odds ratios are the exponentiated coefficients; they are reported for the post hoc model only.
print("Odds Ratio")
print(numpy.exp(posthoc_model.params))
#Confidence bounds and point estimates are collected on the log-odds scale and exponentiated together.
print("Confidence Intervals of Odds Ratio")
odds_table = posthoc_model.conf_int()
odds_table.columns = ['Lower CI', 'Upper CI']
odds_table['OR'] = posthoc_model.params
print(numpy.exp(odds_table))