forked from yellekelyk/scrape-spec
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathSpec2000Data.py
162 lines (129 loc) · 5.12 KB
/
Spec2000Data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
from SpecDataBase import *
import SpecDataElem
import Table
import utils
from ordereddict import OrderedDict
import re
import pdb
import urllib
import BeautifulSoup
class Spec2000Data(SpecDataBase):
    """A class that parses and holds spec2000 data.

    ``htmlTables`` picks out the ``<h3>`` headings of a results page,
    ``parseTable`` converts the rows that follow one heading into a
    ``Table.Table``, and ``__parseDetails__`` mines the per-result
    disclosure page that every row links to.
    """

    def __init__(self, soup, elem=SpecDataElem):
        """Build the column-title map, then run the base-class initialiser.

        soup -- parsed results page; forwarded to ``SpecDataBase.__init__``.
        elem -- factory for one record; it is called with no arguments and
                the result must offer ``attrs()`` and ``update(key, value)``.
        """
        # NOTE(review): the default is whatever ``SpecDataElem`` is bound to
        # when this ``def`` executes; confirm that object is callable.
        #
        # Summary-table column title -> attribute name stored on the record.
        # Assigned before the base initialiser runs, in case that initialiser
        # already parses (TODO confirm against SpecDataBase).
        self.__hdrMap = {'Company Name': 'test_sponsor',
                         'System Name': 'hw_model',
                         '#CPU': 'hw_nthreadspercore',
                         'Base': 'basemean',
                         'Peak': 'peakmean',
                         'Full Disclosures': 'link'}
        SpecDataBase.__init__(self, soup, elem=elem)

    def htmlTables(self, soup):
        """Return every ``<h3>`` element of *soup*.

        Each returned heading is the anchor ``parseTable`` starts from.
        """
        return soup.findAll("h3")

    def __headerNames(self, row):
        """Return the texts of the ``<th>`` cells of *row*, in order."""
        names = []
        cell = row.th
        while cell:
            names.append(str(cell.text))
            cell = cell.findNextSibling("th")
        return names

    def parseTable(self, tab):
        """Parse the rows following heading *tab* into a ``Table.Table``.

        tab -- an ``<h3>`` element as returned by ``htmlTables``; its text
               becomes the table name.

        The first ``<tr>`` after the heading supplies the column titles.
        For every later row, cells whose title is in the header map are
        stored on a fresh record.  The 'Full Disclosures' cell is replaced
        by an absolute URL, which is downloaded and handed to
        ``__parseDetails__``.  Rows whose detail page could not be parsed
        are reported on stdout and left out of the table.

        Returns the populated table.
        """
        name = str(tab.text)
        table = Table.Table(name, self.getElem()().attrs())

        # determine hdr order
        headerRow = tab.findNext("tr")
        hdr = self.__headerNames(headerRow)

        # loop through all lines in table
        # NOTE(review): findNext("tr") follows document order, so the walk is
        # not limited to siblings of the header row; verify it stops where
        # this table ends if the page has several headings.
        line = headerRow.findNext("tr")
        while line:
            saveData = self.getElem()()
            error = False
            cnt = 0
            entry = line.td
            while entry:
                # Cells beyond the last known column title are ignored
                # instead of raising IndexError.
                if cnt < len(hdr) and hdr[cnt] in self.__hdrMap:
                    attr = self.__hdrMap[hdr[cnt]]
                    data = str(entry.text)
                    if attr == "link":
                        # The href of the second <a> (the sibling of the first
                        # <a> found from this cell) is joined onto the
                        # results directory.
                        anchor = entry.findNext("a").findNextSibling("a")
                        link = str("http://www.spec.org/cpu2000/results/" +
                                   str(anchor['href']))
                        data = link
                        # Close the connection even if read() fails.
                        handle = urllib.urlopen(link)
                        try:
                            html = handle.read()
                        finally:
                            handle.close()
                        detailSoup = BeautifulSoup.BeautifulSoup(html)
                        error = error or self.__parseDetails__(saveData,
                                                               detailSoup)
                    saveData.update(attr, data)
                cnt = cnt + 1
                entry = entry.findNextSibling("td")

            if error:
                # Single-argument parenthesised form: prints the same text
                # under the Python 2 print statement.
                print("Warning: skipping line due to error")
            else:
                table.addEntry(saveData)

            # go to next line
            line = line.findNext("tr")
        return table

    def __parseDetails__(self, saveData, soup):
        """Copy data from one disclosure page into *saveData*.

        saveData -- record to fill through ``update(key, value)``.
        soup     -- parsed disclosure page.

        Reads, in order: the 'Hardware Avail' date from the third
        ``<table>`` under ``<body>``; one entry per benchmark (name ->
        'BaseRatio' cell) from the table after it; and the ``<th>``/``<td>``
        pairs of the table nested in the following table.

        Returns True when that nested table is missing, False otherwise.
        """
        error = False

        # Third <table> under <body>: scan the cells of its first row for
        # the availability date.
        tab = soup.body.table.findNextSibling("table").findNextSibling("table")
        cell = tab.tr.td
        while cell:
            check = re.search(r"Hardware Avail:\s*(\w+-\d+)", str(cell.text))
            if check:
                saveData.update("hw_avail", check.group(1))
            cell = cell.findNextSibling("td")

        tab = tab.findNextSibling("table")

        # determine hdr order of results
        headerRow = tab.findNext("tr")
        hdr = self.__headerNames(headerRow)

        # store results
        line = headerRow.findNextSibling("tr")
        while line:
            # we only proceed if there is clearly enough data present
            # (otherwise we might misplace something due to the cnt scheme)
            if len(line.findAll("td")) >= len(hdr):
                testName = None
                testScore = None
                cnt = 0
                entry = line.td
                while entry:
                    text = str(entry.text).replace(' ', '')
                    if cnt < len(hdr):
                        if hdr[cnt] == "Benchmark":
                            testName = text
                        elif hdr[cnt] == "BaseRatio":
                            testScore = text
                    cnt = cnt + 1
                    entry = entry.findNextSibling("td")

                # Without a benchmark name there is no key to store under;
                # skipping also avoids concatenating None into the message.
                if testName is not None:
                    if testScore is None:
                        print("Missing score for " + testName)
                    saveData.update(testName, testScore)
            line = line.findNextSibling("tr")

        # store HW attributes
        tab = tab.findNextSibling("table").table
        if tab is not None:
            # loop through all lines in table
            line = tab.tr
            while line:
                # Only rows that have a <td> carry an attribute/value pair.
                if line.td:
                    attr = str(filter(utils.onlyascii, line.th.text))
                    data = str(filter(utils.onlyascii, line.td.text))
                    saveData.update(attr, data)
                # go to next line
                line = line.findNextSibling("tr")
        else:
            error = True

        return error