-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathonestopToGTFS.py
111 lines (89 loc) · 3.16 KB
/
onestopToGTFS.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
import os
import csv
import re
import sys
# From the CSV file, looks for a URL (sourceOfFeed) that matches an entity in
# the HTML GTFS page. If found, returns the GTFS id name. If not, returns None
# (and prints a "Failed" message when the debugger flag is on).
def searchForMatchSource(feed_onestop_id, operatorName, sourceOfFeed, gtfs_page, debugger):
    """Return the agency id linked closest before ``sourceOfFeed`` in the page.

    The page is searched for the first occurrence of ``sourceOfFeed``. The up
    to 300 characters preceding that occurrence are then scanned for links of
    the form ``/agency/<id>/">`` and the ``<id>`` of the last one (the one
    nearest to the URL) is returned.

    Args:
        feed_onestop_id: Unused; kept so both search functions share one
            signature.
        operatorName: Only used in the debug messages.
        sourceOfFeed: URL (or URL prefix) to look for in ``gtfs_page``.
        gtfs_page: Full text of the GTFS HTML page.
        debugger: When true, print the reason a lookup failed.

    Returns:
        The agency id string, or None when the URL is not on the page or no
        agency link precedes it.
    """
    position = gtfs_page.find(sourceOfFeed)
    if position == -1:
        if debugger:
            print("Failed: No match. %s" % operatorName)
        return None

    # Clamp the start at 0: a negative start would make the slice wrap around
    # to the end of the page and yield an empty or unrelated window.
    window = gtfs_page[max(0, position - 300):position]

    # Collect every agency link in the window and keep the last one, so that
    # an earlier entry's link is never mistaken for (or merged into) this one.
    agency_ids = re.findall(r'/agency/([^/"]+)/">', window)
    if agency_ids:
        return agency_ids[-1]

    if debugger:
        print("Failed: No regex match. %s" % operatorName)
    return None
# From the CSV file from Transitland, looks for a matching name in the HTML
# GTFS page. If found, returns the GTFS id name; otherwise returns None.
# Example: operatorName = NAME
# Search for: >NAME< --> and uses a regex to find the GTFS id name.
def searchForMatchName(feed_onestop_id, operatorName, sourceOfFeed, gtfs_page, debugger):
    """Return the agency id of the link whose text is ``operatorName``.

    The page is searched for ``">" + operatorName + "<"``. The text from up
    to 300 characters before that hit through the leading ``>`` is scanned for
    ``/agency/<id>/">`` links, and the ``<id>`` of the last one (the link
    nearest to the name) is returned.

    Args:
        feed_onestop_id: Unused; kept so both search functions share one
            signature.
        operatorName: Operator name expected as the text between two tags.
        sourceOfFeed: Unused by this lookup.
        gtfs_page: Full text of the GTFS HTML page.
        debugger: When true, print the reason a lookup failed.

    Returns:
        The agency id string, or None when the name is empty, is not on the
        page, or has no agency link in front of it.
    """
    # An empty name would search for "><", which matches any two adjacent
    # tags and would return an unrelated agency.
    position = gtfs_page.find(">" + operatorName + "<") if operatorName else -1
    if position == -1:
        if debugger:
            print("Failure 2: No match. %s" % operatorName)
        return None

    # Keep the start non-negative (a negative start wraps to the end of the
    # page) and include the '>' found above, which closes the link's tag.
    context = gtfs_page[max(0, position - 300):position + 1]

    # Walk all agency links in the context; the last one is the closest to
    # the name, so earlier entries in the window cannot pollute the result.
    nearest_id = None
    for link in re.finditer(r'/agency/([^/"]+)/">', context):
        nearest_id = link.group(1)

    if nearest_id is None and debugger:
        print("Failure 2: Regex matches none. %s" % operatorName)
    return nearest_id
# Goes through the CSV file and makes 1-2 calls per row to attempt to find a
# match in the GTFS HTML document.
def parseFile(gtfs_page, debugger, start_index, end_index):
    """Match rows of ``feeds.csv`` against the GTFS page and save the hits.

    Rows whose 0-based position in ``feeds.csv`` lies between ``start_index``
    and ``end_index`` (both inclusive) are looked up first by operator name
    (column 5) and, if that fails, by feed URL (column 1, with a trailing
    ``/<file>.zip`` part removed). Every matched row is written to
    ``CSVNewFeedNames.csv`` with the GTFS id appended as an extra column.
    Summary counts are printed at the end.

    Args:
        gtfs_page: Full text of the GTFS HTML page.
        debugger: Passed through to the search functions to enable their
            failure messages.
        start_index: Position of the first row to process.
        end_index: Position of the last row to process.
    """
    success_count = 0
    total_count = 0
    source_count = 0
    name_count = 0

    # Python 2 needs 'U' for universal newlines; Python 3 does that by
    # default and rejects the 'U' flag from 3.11 on.
    read_mode = 'rU' if sys.version_info[0] < 3 else 'r'

    with open('feeds.csv', read_mode) as feeds_file:
        # Opened via 'with' so the output is flushed and closed on exit.
        with open("CSVNewFeedNames.csv", "w") as out_file:
            writer = csv.writer(out_file)
            # enumerate() supplies the row position that the index window is
            # checked against.
            for current_index, row in enumerate(csv.reader(feeds_file)):
                if current_index < start_index:
                    continue
                if current_index > end_index:
                    break
                total_count += 1

                # Rows with exactly 8 columns are skipped.
                # NOTE(review): presumably rows in a different/already
                # processed layout - confirm against the feeds.csv schema.
                # Rows too short to have an operator-name column (e.g. blank
                # lines) are skipped as well instead of raising IndexError.
                if len(row) == 8 or len(row) < 6:
                    continue

                feed_onestop_id = row[0]
                sourceOfFeed = row[1]
                operatorName = row[5]

                # Drop a trailing "/<name>.zip" so only the URL prefix is
                # searched for. NOTE(review): the '.' before "zip" is left
                # unescaped to keep matching what it matched before.
                groups = re.match(r'(.+)/((.+).zip)', row[1])
                if groups:
                    sourceOfFeed = groups.group(1)

                match = searchForMatchName(feed_onestop_id, operatorName,
                                           sourceOfFeed, gtfs_page, debugger)
                if match:
                    name_count += 1
                else:
                    match = searchForMatchSource(feed_onestop_id, operatorName,
                                                 sourceOfFeed, gtfs_page, debugger)
                    if match:
                        source_count += 1

                if match:
                    print("Success! ========== ")
                    row.append(match)
                    writer.writerow(row)
                    success_count += 1

    print("Source Count  %d" % source_count)
    print("Name Count  %d" % name_count)
    print("Successful Count:  %d" % success_count)
    print("Total Count:  %d" % total_count)
def main():
    """Run the matcher from the command line.

    Expected arguments, in order:
        1. path to the saved GTFS HTML page
        2. ``ON`` to enable debug messages (anything else disables them)
        3. start index (integer) of the first feeds.csv row to process
        4. end index (integer) of the last feeds.csv row to process
    """
    if len(sys.argv) < 5:
        sys.exit("usage: %s GTFS_HTML_FILE ON|OFF START_INDEX END_INDEX"
                 % sys.argv[0])

    # Use the path as given; prefixing './' would break absolute paths.
    parse = sys.argv[1]
    debugger = sys.argv[2] == 'ON'
    start_index = int(sys.argv[3])
    end_index = int(sys.argv[4])
    print(start_index)
    print(end_index)

    # Open up the GTFS HTML and remove all newlines so the page can be
    # searched as a single line.
    with open(parse, 'r') as page_file:
        gtfs_page = page_file.read().replace('\n', '')

    # Pass on the range requested on the command line instead of a fixed one.
    parseFile(gtfs_page, debugger, start_index, end_index)


if __name__ == "__main__":
    main()