-
Notifications
You must be signed in to change notification settings - Fork 8
/
Copy pathgtfs_to_igraph.R
352 lines (207 loc) · 12.9 KB
/
gtfs_to_igraph.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
##############################################################################################
################### This script brings a function to convert a list ###################
################### of GTFS.zip files into an igraph for network analysis ###################
##############################################################################################
# github repo: https://github.com/rafapereirabr/gtfs_to_igraph
##################### Load packages -------------------------------------------------------
library(igraph)
library(data.table)
library(dplyr)
library(magrittr)
library(sp)
library(geosphere)
library(fasttime)
### Start Function
gtfs_to_igraph <- function( list_gtfs, dist_threshold, save_muxviz){
############ 0. read GTFS files -----------------
# list_gtfs= my_gtfs_feeds
# dist_threshold = 30
cat("reading GTFS data \n")
# function to read and rbind files from a list with different GTFS.zip
tmpd <- tempdir()
unzip_fread_gtfs <- function(zip, file) { unzip(zip, file, exdir=tmpd) %>% fread(colClasses = "character") }
unzip_fread_routes <- function(zip, file) { unzip(zip, file, exdir=tmpd) %>% fread(colClasses = "character", select= c('route_id', 'route_short_name', 'route_type', 'route_long_name')) }
unzip_fread_trips <- function(zip, file) { unzip(zip, file, exdir=tmpd) %>% fread(colClasses = "character", select= c('route_id', 'service_id', 'trip_id', 'direction_id')) }
unzip_fread_stops <- function(zip, file) { unzip(zip, file, exdir=tmpd) %>% fread(colClasses = "character", select= c('stop_id', 'stop_name', 'stop_lat', 'stop_lon', 'parent_station', 'location_type')) }
unzip_fread_stoptimes <- function(zip, file) { unzip(zip, file, exdir=tmpd) %>% fread(colClasses = "character", select= c('trip_id', 'arrival_time', 'departure_time', 'stop_id', 'stop_sequence')) }
# Read
stops <- lapply( list_gtfs , unzip_fread_stops, file="stops.txt") %>% rbindlist()
stop_times <- lapply( list_gtfs , unzip_fread_stoptimes, file="stop_times.txt") %>% rbindlist()
routes <- lapply( list_gtfs , unzip_fread_routes, file="routes.txt") %>% rbindlist()
trips <- lapply( list_gtfs , unzip_fread_trips, file="trips.txt") %>% rbindlist()
calendar <- lapply( list_gtfs , unzip_fread_gtfs, file="calendar.txt") %>% rbindlist()
# make sure lat long are numeric, and text is encoded
stops[, stop_lon := as.numeric(stop_lon) ][, stop_lat := as.numeric(stop_lat) ]
Encoding(stops$stop_name) <- "UTF-8"
############ 1. Identify stops that closee than distance Threshold in meters ------------------
cat("calculating distances between stops \n")
### Convert stops into SpatialPointsDataFrame
# lat long projection
myprojection_latlong <- CRS("+proj=longlat +datum=WGS84 +ellps=WGS84 +towgs84=0,0,0")
# convert stops into spatial points
coordinates(stops) <- c("stop_lon", "stop_lat")
proj4string(stops) <- myprojection_latlong
stops <- spTransform(stops, myprojection_latlong)
# use the distm function to generate a geodesic distance matrix in meters
mdist <- distm(stops, stops, fun=distHaversine)
# cluster all points using a hierarchical clustering approach
hc <- hclust(as.dist(mdist), method="complete")
# define clusters based on a tree "height" cutoff "d" and add them to the SpDataFrame
stops@data$clust <- cutree(hc, h=dist_threshold)
gc(reset = T)
# convert stops back into data frame
df <- as.data.frame(stops) %>% setDT()
df <- df[order(clust)]
df <- unique(df) # remove duplicate of identical stops
head(df)
# identify how many stops per cluster
df[, quant := .N, by = clust]
table(df$quant)
plot(df$stop_lon, df$stop_lat, col=df$clust)
############ 2. Identify and update Parent Stations ------------------
cat("Identifying and updating Parent Stations \n")
# How many stops have a Parent Station
nrow(df[ parent_station !=""])
# How many stops without Parent Station
nrow(df[ parent_station ==""])
# Stops which are Parent Stations (location_type==1) will be Parent Stations of themselves
df[ location_type==1, parent_station := stop_id ]
# in case the field location_type is missinformed
df[ parent_station=="" & stop_id %in% df$parent_station, parent_station := stop_id ]
df[ quant > 1 , parent_station:= ifelse( parent_station !="" , parent_station,
ifelse( parent_station=="" & stop_id %in% df$parent_station, stop_id, "")), by=clust]
#total number of stops without Parent Station
nrow(df[ parent_station ==""])
# Update Parent Stations for each cluster
# a) Stops which alread have parent stations stay the same
# b) stations with no parent, will receive the parent of the cluster
df[ quant > 1 , parent_station:= ifelse( parent_station !="" , parent_station,
ifelse( parent_station=="", max(parent_station), "")), by=clust]
nrow(df[ parent_station ==""])
# d) For those clusters with no parent stations, get the 1st stop to be a Parent
df[ quant > 1 , parent_station:= ifelse( parent_station !="" , parent_station,
ifelse( parent_station== "", stop_id[1L], "")), by=clust]
nrow(df[ parent_station ==""])
# all clusters > 1 have a parent station
df[quant > 1 & parent_station==""][order(clust)] # should be empty
# Remaining stops without Parent Station
nrow(df[ parent_station ==""])
# make sure parent stations are consistent within each cluster with more than one stop
df[ quant > 1 , parent_station := max(parent_station), by=clust]
unique(df$parent_station) %>% length()
# for the lonly stops, make sure they are the Parent station of themselves
df[ quant ==1 & parent_station=="" , parent_station := stop_id , by=stop_id]
nrow(df[ parent_station ==""]) == 0
############ 3. Update Lat long of stops based on parent_station -----------------------
# Update in stops data: get lat long to be the same as 1st Parent Station
df[, stop_lon := stop_lon[1], by=parent_station]
df[, stop_lat := stop_lat[1], by=parent_station]
# Update in stop_times data: get lat long to be the same as 1st Parent Station
# Add parent_station info to stop_times
# merge stops and stop_times based on correspondence btwn stop_times$stop_id and df$stop_id
stop_times[df, on= 'stop_id', c('clust', 'parent_station') := list(i.clust, i.parent_station) ]
# CRUX: Replace stop_id with parent_station
stop_times[ !is.na(parent_station) , stop_id := parent_station ]
df[ , stop_id := parent_station ]
# remove repeated stops
df <- unique(df)
############ 4. identify transport modes, route and service level for each trip -----------------------
routes <- routes[,.(route_id, route_type)] # keep only necessary cols
trips <- trips[,.(route_id, trip_id, service_id)] # keep only necessary cols
trips[routes, on=.(route_id), route_type := i.route_type] # add route_type to trips
# add these columns to stop_times: route_id, route_type, service_id
stop_times[trips, on=.(trip_id), c('route_id', 'route_type', 'service_id') := list(i.route_id, i.route_type, i.service_id) ]
gc(reset = T)
# # Only keep trips during weekdays
# # remove columns with weekends
# calendar <- calendar[, -c('saturday', 'sunday')]
#
# # keep only rows that are not zero (i.e. that have service during weekday)
# calendar <- calendar[rowMeans(calendar >0)==T,]
#
# # Only keep those trips which run on weekdays
# stop_times2 <- subset(stop_times, service_id %in% calendar$service_id)
# Get edited info for stop_times and stops
stops_edited <- df[, .(stop_id, stop_name, parent_station, location_type, stop_lon, stop_lat)]
stop_times_edited <- stop_times[, .(route_type, route_id, trip_id, stop_id, stop_sequence, arrival_time, departure_time)]
# make sure stop_sequence is numeric
stop_times_edited[, stop_sequence := as.numeric(stop_sequence) ]
# make sure stops are in the right sequence for each group (in this case, each group is a trip_id)
setorder(stop_times_edited, trip_id, stop_sequence, route_id)
gc(reset = T)
############ 5. Indentify links between stops -----------------
cat("Identifying links between stops \n")
# calculate travel-time (in minutes) between stops
# Convert times to POSIX to do calculations
stop_times_edited[, arrival_time := paste("2000-01-01",arrival_time) ] # add full date. The date doesnt' matter much
stop_times_edited[, arrival_time := fastPOSIXct(arrival_time) ] # fast conversion to POSIX
# calculate travel time (in minutes) between stops
stop_times_edited[ , travel_time := difftime( data.table::shift(arrival_time, type = "lead"), arrival_time, units="mins") , by=trip_id][ , travel_time := as.numeric(travel_time) ]
# create three new columns by shifting the stop_id, arrival_time and departure_time of the following row up
# you can do the same operation on multiple columns at the same time
stop_times_edited[, `:=`(stop_id_to = shift(stop_id, type = "lead"),
arrival_time_stop_to = shift(arrival_time, type = "lead"),
departure_time_stop_to = shift(departure_time, type = "lead")
),
by = .(trip_id, route_id)]
# you will have NAs at this point because the last stop doesn't go anywhere. Let's remove them
stop_times_edited <- na.omit(stop_times_edited)
# frequency of trips per route (weight)
relations <- stop_times_edited[, .(weight = .N), by= .(stop_id, stop_id_to, route_id, route_type)]
relations <- unique(relations)
# reorder columns
setcolorder(relations, c('stop_id', 'stop_id_to', 'weight', 'route_id', 'route_type'))
# now we have 'from' and 'to' columns from which we can create an igraph
head(relations)
# plot densit distribution of trip frequency
density(relations$weight) %>% plot()
# subset stop columns
temp_stops <- stops_edited[, .(stop_id, stop_lon, stop_lat)] #
temp_stops <- unique(temp_stops)
# remove stops with no connections, and remove connections with ghost stops
e <- unique(c(relations$stop_id, relations$stop_id_to))
v <- unique(temp_stops$stop_id)
d <- setdiff(v,e) # stops in vertex data frame that are not present in edges data
dd <- setdiff(e,v) # stops in edges data frame that are not present in vertex data
temp_stops <- temp_stops[ !(stop_id %in% d) ] # stops with no connections
relations <- relations[ !(stop_id %in% dd) ] # trips with ghost stops
relations <- relations[ !(stop_id_to %in% dd) ] # trips with ghost stops
####### Overview of the network being built
cat("Number of nodes:", unique(relations$stop_id) %>% length(), " \n")
cat("Number of Edges:", nrow(relations), " \n")
cat("Number of routes:", unique(relations$route_id) %>% length(), " \n")
############ 6. Build igraph ---------------------
cat("building igraph \n")
g <- graph_from_data_frame(relations, directed=TRUE, vertices=temp_stops)
############ 7. Save MuxViz input ----------------------------------
cat("saving muxviz input \n")
if (save_muxviz==T){
# create directory where input files to muxviz will be saved
dir.create(file.path(".", "muxviz_input"), showWarnings = FALSE)
# stops
names(temp_stops) <- c('nodeID', 'nodeLong', 'nodeLat')
temp_stops[, nodeLabel := nodeID]
setcolorder(temp_stops, c('nodeID', 'nodeLabel', 'nodeLong', 'nodeLat'))
fwrite( temp_stops, "./muxviz_input/stops_layout.txt")
# Edges
#recode route type by tranport mode
relations[, route_type := ifelse(route_type==0, "LightRail", ifelse(route_type==1, "Subway", ifelse(route_type==2, "Rail",
ifelse(route_type==3, "Bus", ifelse(route_type==4, "Ferry",
ifelse(route_type==5, "Tramway", ifelse(route_type==6, "CableCar",
ifelse(route_type==7, "Funicular", "ERROR"))))))))]
# save links data for each tranport mode in a separate .txt file
cols_to_save <- c('stop_id', 'stop_id_to', 'weight')
relations[, fwrite(.SD, paste0("./muxviz_input/edge_list_", route_type ,".txt"),
col.names = F, sep = " "), by = route_type, .SDcols= cols_to_save]
# create Input file
edge_files <- list.files("./muxviz_input", pattern="edge_list", full.names=F)
layout_file <- list.files("./muxviz_input", pattern="layout", full.names=F)
input_file <- data.frame(a= edge_files, b = NA, c=layout_file)
input_file$a <- input_file$a
input_file$b <- gsub('^.*_\\s*|\\s*.t.*$', '', input_file$a)
input_file$c <- input_file$c
fwrite(x=input_file, file="./muxviz_input/input_Muxviz.txt" ,col.names = F, sep = ";" )
}
# return graph
return(g)
}