-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathChicago_crime_prediction.Rmd
268 lines (196 loc) · 6.73 KB
/
Chicago_crime_prediction.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
---
title: "Predicting_Crime_Prophet"
author: "Urban Labs"
date: "December 13, 2017"
output: html_document
notes: "1/12: Specifying seasonality and holiday effects, outliers,
parameter tuning with cross validation and running the model
for only high crimes neighborhoods 15,11,10,9,7,6,5,4,3,2,1"
---
```{r directory}
getwd()
```
```{r library, message=FALSE}
library(prophet)
library(lubridate)
library(weatherData)
library(xts)
library(zoo)
library(ggplot2)
library(data.table)
library(reshape2)
library(anytime)
library(ggrepel)
library(scales)
library(dplyr)
library(plyr)
```
```{r read}
# Read Felonies Crime Data
#CrimeDataFelonies <- read.csv(file = "crimeDataFelonies.csv", header = TRUE, sep = ",")
```
```{r readRDS}
# RDS
#saveRDS(CrimeDataFelonies, "/export/home/keval/Crime Predictions using Prophet/CrimeDataFelonies.rds")
CrimeDataFelonies <- readRDS("/export/home/keval/Crime Predictions using Prophet/CrimeDataFelonies.rds")
```
```{r plot1}
# Let's do some exploratory analysis
c <- ggplot(CrimeDataFelonies, aes(factor(INBOX))) + geom_bar(stat = "count") + xlab("Crime")
c + theme(axis.text.x = element_text(angle = 90))
```
```{r plot2}
# Exploring Property Crimes
CrimeDataProperty = subset(CrimeDataFelonies, INBOX == "PROPERTY CRIMES")
ggplot(CrimeDataProperty, aes(factor(PRIMARY))) + geom_bar(stat = "count") + xlab("Property")
```
```{r plot3}
CrimeDataViolence = subset(CrimeDataFelonies, INBOX == "VIOLENT CRIMES")
v <- ggplot(CrimeDataViolence, aes(factor(PRIMARY))) + geom_bar(stat = "count") + xlab("Violent")
v + theme(axis.text.x = element_text(angle = 90))
```
```{r plot4}
d <- ggplot(CrimeDataFelonies, aes(x=factor(DISTRICT), fill = INBOX)) +
xlab("DISTRICT") +
geom_bar(stat = "count") +
ggtitle("Crime Categories by Districts")
d
```
```{r plot5}
library(scales)
y <- ggplot(CrimeDataFelonies, aes(x=factor(YEAR), fill = INBOX)) +
xlab("YEAR") +
geom_bar(stat = "count", position = "fill") +
scale_y_continuous(labels = percent_format())
ggtitle("Crime Categories by Year")
y
```
```{r summary}
# Summarize Property crime data
table(CrimeDataProperty$YEAR)
```
```{r plot6}
# Plot time series data of total property crimes
pc <- ggplot(CrimeDataProperty, aes(YEAR)) +
geom_line(stat = "count") +
xlab("Burglaries and Thefts") +
ggtitle("Property Crimes") +
ylim(50000,150000)
pc
```
```{r aggregate}
# Aggregate our data for weekly predictions and select a subset of columns for predictors into the model
# Group data by date, Inbox, Primary and create a calculated column for count of # of crimes
CrimeDataPropAgg <- aggregate(X ~ DATEOCC + DISTRICT, data = CrimeDataProperty, length)
# Rename Column
names(CrimeDataPropAgg)[1] <- "Date"
names(CrimeDataPropAgg)[3] <- "CrimeCount"
# Convert DATEOCC column to datetime object
CrimeDataPropAgg$Date <- as.Date(CrimeDataPropAgg$Date, format = '%d-%b-%y')
#CrimeDataPropAgg <- subset(CrimeDataPropAgg, select = c(Date, DISTRICT, X))
```
```{r plot7}
# Plot time series data of total property crimes in dataset
pc <- ggplot(CrimeDataPropAgg, aes(x=Date, y = CrimeCount)) +
geom_line() +
scale_x_date(labels=date_format("%m/%y")) +
xlab("Burglaries and Thefts") +
ylab("Crime Count") +
ggtitle("Daily Property Crimes")
pc
```
```{r aggregate1}
# Aggregate daily data to weekly summary
CrimeDataPropAgg$Week <- as.Date(cut(CrimeDataPropAgg$Date, breaks = "week", start.on.monday = FALSE))
# Summarize the data by week
CrimeDataPropGrouped <- group_by(CrimeDataPropAgg, Week, DISTRICT)
CrimeDataPropWeek <- aggregate(CrimeCount ~ Week + DISTRICT, data = CrimeDataPropGrouped, sum)
```
```{r plot8}
# Bar plots to analyze the spread of crimes in each district and outlier detection
wpc <- ggplot(CrimeDataPropWeek, aes(factor(DISTRICT), CrimeCount, color = factor(DISTRICT))) +
geom_boxplot() +
xlab("Districts") +
ylab("Crime Count") +
labs(color="District")
wpc
```
```{r plot9}
# Plot weekly property crimes
pcw <- ggplot(CrimeDataPropWeek, aes(x=Week, y=CrimeCount, color=factor(DISTRICT))) +
geom_line() +
scale_x_date(labels=date_format("%m/%y")) +
labs(color="District") +
xlab("Crimes") +
ggtitle("Weekly Property Crimes") +
scale_x_date(breaks = date_breaks("6 months"), labels = date_format("%b-%y")) +
theme(axis.text.x = element_text(angle = 90))
# geom_text_repel(
# data = CrimeDataPropWeek,
# aes(label = paste("District", DISTRICT)),
# size = 3,
# nudge_x = 45
#)
pcw
```
```{r prophet}
# Subset data
CrimeDataPropWeekProphet <- subset(CrimeDataPropWeek, select = c(Week, DISTRICT, CrimeCount))
# Data table
# CrimeDataPropWeekProphetDT <- as.data.table(CrimeDataPropWeekProphet)
# Model
modelGroupDist <- function(x, dist) {
cdpModel <- prophet(x, changepoint.prior.scale = 0.5, weekly.seasonality = TRUE)
cdpFuture <- make_future_dataframe(cdpModel, periods = 52, freq = "week", include_history = TRUE)
forecast <- predict(cdpModel, cdpFuture)
return(list(m=cdpModel, forecast=forecast, district=dist))
}
districts <- unique(CrimeDataPropWeekProphet$DISTRICT)
# Subset districts to high crime
hc_districts = c('15','11','10','9','7','6','5','4','3','2','1')
# Write code to iterate over each districts and pass crime counts and date fields only to the function and fit prophet model.
#Initialize result list to store model results
results <- vector("list", 11)
for (i in 1:length(hc_districts)) {
cdSubset <- subset(CrimeDataPropWeekProphet, DISTRICT == hc_districts[i])
names(cdSubset)[1] <- "ds"
names(cdSubset)[3] <- "y"
cdSubset <- subset(cdSubset, select = c("ds","y"))
# Store results of the model in
results[[i]] <- modelGroupDist(cdSubset, hc_districts[i])
}
```
```{r plot10}
# 52-week forecast
plot(results[[1]]$m, results[[1]]$forecast, xlabel = "Date", ylabel = "Crime Count")
```
```{r}
# Prophet component plots
prophet_plot_components(results[[1]]$m, results[[1]]$forecast)
```
```{r cv}
# Cross Validation using prophets in built CV method
# Calculate the average cross validation error for each district
cd.cv <- cross_validation(results[[1]]$m, initial = 312, period = 52, horizon = 52, units = "weeks")
tail(cd.cv)
```
```{r errorMAE}
# Calculate MAE
error.MAE <- mean(abs(cd.cv$yhat - cd.cv$y))
error.MAE
```
```{r errorRMSE}
# Calculate RMSE
error.RMSE <- sqrt(mean((cd.cv$yhat - cd.cv$y)^2))
error.RMSE
```
```{r plot13}
cvp <- ggplot(cd.cv, aes(ds)) +
geom_ribbon(aes(ymin=yhat_lower,ymax=yhat_upper), fill = "lightsteelblue2") +
geom_line(aes(y = y, color = "Actual")) +
geom_line(aes(y = yhat, color = "Predicted")) +
xlab("Date") +
ylab("Crime") +
ggtitle("Actual vs Predicted")
cvp
```