-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathMain Analysis Code
87 lines (68 loc) · 2.25 KB
/
Main Analysis Code
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
# -*- coding: utf-8 -*-
"""
Created on Mon Mar 15 20:18:21 2021
@author: Nicolas
"""
print('Importing libraries.')
import pandas as pd
import numpy as np
from openpyxl import load_workbook
import time
import statsmodels.api as sm
import seaborn
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt #For making plots.
import statsmodels.stats.multicomp as multi #For Tukey POST HOC analysis
# Render floats in fixed-point notation ('%f', six decimals) whenever pandas
# displays them.
pd.set_option('display.float_format', lambda value: '%f' % value)

# Use seaborn's "whitegrid" theme for plots.
seaborn.set_theme(style="whitegrid")

print('Opening dataset')
# `start` is the reference point for every elapsed-time report in this script.
start = time.time()
end = time.time()
print(end - start)

# Load the dataset from CSV; low_memory=False makes pandas read the file in
# one pass instead of in chunks.
data = pd.read_csv(
    "v4_oa_ir_bw_hs_blsvntyfv_cgeo_78745.csv",
    low_memory=False,
)

print("Beginning the dropna function.")
end = time.time()
print(end - start)

# Keep only the rows that have no missing value in any column.
data = data.dropna()
# Split the column names into the variables being used: the first column is
# treated as the quantitative variable, every remaining column as categorical.
all_vars = data.columns.tolist()
num_vars, cat_vars = all_vars[:1], all_vars[1:]

print('Converting to numerical and centering quantitative variables.')
end = time.time()
print(end - start)

# Data management.
# Cast every column to a numeric dtype; errors='coerce' turns any value that
# cannot be parsed into NaN instead of raising.
for name in all_vars:
    data[name] = pd.to_numeric(data[name], errors='coerce')

# Mean-center the quantitative IVs for the regression analysis.
for name in num_vars:
    data[name] = data[name] - data[name].mean()

print("Data management complete.")
print("Beginning logistic regression.")
end = time.time()
print(end - start)
print("Putting the different APD Officers and APD SectorDistricts in bins.")

# Collect the column names by prefix, preserving their order in `all_vars`:
# names starting with 'officer' go to `officers`, names starting with 'tract'
# go to `tracts`.
officers = [name for name in all_vars if name.startswith('officer')]
tracts = [name for name in all_vars if name.startswith('tract')]

print(officers)
print(tracts)
# Logistic regression analysis.
# Fit one logit model per (officer column, tract column) pair. Each model
# regresses `blackbool` on the two selected columns (as categoricals) plus
# C(sexbool), blkwhtratio and HotSpotRating, and its summary is appended to
# 'logistic_reg_results.csv'.
officercount = 0  # stays 0 if `officers` is empty
for officercount, officer in enumerate(officers, start=1):
    blockscount = 0  # tract counter restarts for every officer column
    print('OfficerCount + block: %s + %s' % (officercount, blockscount))
    end = time.time()
    print(end - start)
    for blockscount, tract in enumerate(tracts, start=1):
        print('OfficerCount + tract: %s + %s' % (officer, tract))
        lreg1 = smf.logit(
            formula='blackbool ~ C(%s) + C(%s) + C(sexbool) + blkwhtratio + HotSpotRating' % (officer, tract),
            data=data,
        ).fit()
        # Append right after each fit so results already computed are on disk
        # even if a later fit fails.
        with open('logistic_reg_results.csv', 'a') as fh:
            fh.write(lreg1.summary().as_csv())
            # Terminate the record explicitly so that consecutive summaries
            # cannot run together on a single line of the output file.
            fh.write('\n')