-
Notifications
You must be signed in to change notification settings - Fork 1
/
cdx-summarize.py
executable file
·157 lines (145 loc) · 5.25 KB
/
cdx-summarize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
#!/usr/bin/python3
from argparse import ArgumentParser
import sys
import gzip
import json
import urllib
import mime_counter
import re
import CdxParser
# Intended year bounds for filtering out invalid dates.
# NOTE(review): neither constant is referenced in the visible part of this
# script; presumably the filtering happens elsewhere - confirm before relying
# on these values (MAX_YEAR is a fixed calendar year).
MIN_YEAR = 1991
MAX_YEAR = 2022
# Accumulated statistics: Hosts[agg][date] holds one counter created by
# mime_counter.init_counter(), where agg is the aggregation key of a parsed
# record and date is its year (or yearmonth) bucket.
Hosts = {}
def print_to_stderr(*a):
    """Print the given values to standard error, formatted like print() would."""
    stream = sys.stderr
    print(*a, file=stream)
def summarize_line(info):
    """Add one parsed CDX record to the global ``Hosts`` statistics.

    ``info`` is the dictionary describing a single CDX line.  Records without
    an ``'agg'`` key are ignored.  Otherwise the record is counted once, with
    its ``'length'``, in the counter stored at
    ``Hosts[info['agg']][info['date']]`` - first by mime type, then by scheme.
    A missing ``'mime'`` entry is filled in as ``'unknown'``; this modifies
    ``info`` in place.
    """
    if 'agg' not in info:
        return

    # Create the per-key and per-date levels on first use.
    per_date = Hosts.setdefault(info['agg'], {})
    bucket = info['date']
    if bucket not in per_date:
        per_date[bucket] = mime_counter.init_counter()
    counter = per_date[bucket]

    info.setdefault('mime', 'unknown')
    mime_counter.add_mime(counter, info["mime"], 1, info["length"])
    mime_counter.add_scheme(counter, info['scheme'], 1, info["length"])
def _counters_as_plain_dicts(args, per_date):
    """Convert the per-date counters of one aggregation key to plain dicts.

    ``per_date`` maps a date bucket to a counter understood by
    ``mime_counter.as_dict``.  When ``args.compact`` is set, only fields whose
    value is greater than 0 are kept.  Returns a new dictionary that can be
    passed to ``json.dumps``.
    """
    out = {}
    for date, counter in per_date.items():
        fields = mime_counter.as_dict(counter)
        if args.compact:
            fields = {k: v for k, v in fields.items() if v > 0}
        out[date] = fields
    return out


def output_results(args):
    """Print every aggregation key still held in ``Hosts``, sorted by key.

    Each output line consists of the key, a space and the JSON encoded
    statistics of that key.  ``args.compact`` suppresses fields that are 0.
    """
    # Sorting the keys is enough: they are unique, so the values never take
    # part in the ordering.
    for agg in sorted(Hosts):
        print(agg, json.dumps(_counters_as_plain_dicts(args, Hosts[agg])))


# output only the values for the key agg_by (aggregate_by) and then delete that key from the dictionary
def output_partial_results(args, agg_by):
    """Print the statistics of ``agg_by`` and drop that key from ``Hosts``.

    Does nothing when ``agg_by`` is not present in ``Hosts``.  The output
    format is the same as the one used by ``output_results``.
    """
    if agg_by not in Hosts:
        return
    print(agg_by, json.dumps(_counters_as_plain_dicts(args, Hosts[agg_by])))
    # Only forget the key once it has actually been written out.
    del Hosts[agg_by]
def determine_cdx_type(line):
    """Guess the CDX format of a file from its first line.

    Returns one of the ``CdxParser.FORMAT_*`` constants, or
    ``CdxParser.FORMAT_UNKNOWN`` when the line matches none of the supported
    layouts.
    """
    # NOTE(review): both header checks are treated as function-level
    # statements; confirm that the 'CDX' check was not meant to apply only to
    # lines with at least 6 fields.
    fields = line.split()
    if len(fields) < 3:
        return CdxParser.FORMAT_UNKNOWN

    # Header line of the 11 field format (an exact match implies enough fields).
    if line.strip() == 'CDX N b a m s k r M S V g':
        return CdxParser.FORMAT_CDXNbamskrMSVg

    # Any other header line: only one starting with " CDX N b a m s" is supported.
    if fields[0] == 'CDX':
        if line.startswith(' CDX N b a m s'):
            return CdxParser.FORMAT_CDXNbams
        return CdxParser.FORMAT_UNKNOWN

    # all supported formats have the date as a 14 character string in the second place
    if len(fields[1]) != 14:
        return CdxParser.FORMAT_UNKNOWN

    # a line ending with "} is indicative of JSON
    if line.rstrip().endswith('"}'):
        return CdxParser.FORMAT_CDXJ

    # Remaining formats are told apart by their number of fields.
    by_field_count = {
        7: CdxParser.FORMAT_CDX7,
        10: CdxParser.FORMAT_CDXNbams,
    }
    return by_field_count.get(len(fields), CdxParser.FORMAT_UNKNOWN)
def cdx_type_from_args(args):
    """Translate the ``--format`` command line value into a CdxParser format.

    Returns ``CdxParser.FORMAT_UNKNOWN`` when ``args.format`` is not one of
    the known names (for example when the option was not given).
    """
    formats_by_name = {
        'cdxj': CdxParser.FORMAT_CDXJ,
        'cdx7': CdxParser.FORMAT_CDX7,
        'cdxNbams': CdxParser.FORMAT_CDXNbams,
        'cdxNbamskrMSVg': CdxParser.FORMAT_CDXNbamskrMSVg,
    }
    return formats_by_name.get(args.format, CdxParser.FORMAT_UNKNOWN)
def read_cdx_file(args, fil, filename):
    """Summarize all records of one opened CDX file into ``Hosts``.

    args     -- parsed command line options; ``format``, ``monthly``,
                ``fullhost`` and ``assume_unique`` are read here
    fil      -- text stream positioned at the start of the CDX file
    filename -- name of the file, only used in messages on stderr

    The format is the one forced through ``args.format`` or, failing that,
    the one guessed from the first line.  Files in an unsupported format are
    reported on stderr and skipped.  A line that cannot be processed is
    reported on stderr and does not stop the processing of the file.

    With ``args.assume_unique`` the statistics of an aggregation key are
    printed and released as soon as a record with a different key shows up.
    """
    line = fil.readline()
    ftype = cdx_type_from_args(args)
    if ftype == CdxParser.FORMAT_UNKNOWN:
        ftype = determine_cdx_type(line)
    if ftype == CdxParser.FORMAT_UNKNOWN:
        print_to_stderr("Unsupported cdx format: ", filename, line)
        return

    parser = CdxParser.CdxParser(ftype, args.monthly, args.fullhost)
    aggregate_by = ''

    # The first line was already consumed for the format detection.  Handle it
    # under the same per-line error reporting as every other line, so that one
    # bad first record does not discard the whole file.
    try:
        res = parser.parse_line(line)
        if 'agg' in res:
            aggregate_by = res['agg']
        summarize_line(res)
    except Exception as inst:
        print_to_stderr("Unexpected error:", filename, inst, line)

    for line in fil:
        try:
            res = parser.parse_line(line)
            summarize_line(res)
            if args.assume_unique and 'agg' in res and res['agg'] != aggregate_by:
                # The run of the previous key is over: emit it and free it.
                output_partial_results(args, aggregate_by)
                aggregate_by = res['agg']
        except Exception as inst:
            print_to_stderr("Unexpected error:", filename, inst, line)
def dowork(args):
    """Process every file in ``args.file`` and print the collected results.

    A file is read through gzip when ``args.gz`` is set or its name ends with
    ``.gz``, unless ``args.nogz`` is set.  A file that cannot be processed is
    reported on stderr and the remaining files are still handled.
    """
    for path in args.file:
        named_like_gzip = len(path) > 3 and path.endswith('.gz')
        if args.nogz or not (args.gz or named_like_gzip):
            label = "Error (dowork)"
            open_stream = lambda: open(path, 'r', encoding=args.encoding)
        else:
            label = "Error"
            open_stream = lambda: gzip.open(path, mode='rt', encoding=args.encoding)

        try:
            with open_stream() as stream:
                read_cdx_file(args, stream, path)
        except Exception as inst:
            print_to_stderr(label, inst, path)

    output_results(args)
def build_arg_parser():
    """Create the command line parser of the script.

    Returns an ``ArgumentParser`` whose parsed namespace provides ``gz``,
    ``nogz``, ``monthly``, ``compact``, ``fullhost``, ``assume_unique``,
    ``format``, ``encoding`` and ``file``.
    """
    parser = ArgumentParser(description='Summarize CDX file(s) to JSONL, automatically uses gzip filter if file ends with .gz')
    parser.add_argument('--gz', action="store_true", help='force use of gzip filter')
    parser.add_argument('--nogz', action="store_true", help='force not using gzip filter')
    parser.add_argument('--monthly', action="store_true", help='break up statistics into monthly buckets instead of yearly')
    parser.add_argument('--compact', action="store_true", help='do not output fields that are 0')
    parser.add_argument('--fullhost', action="store_true", default=False, help='aggregate by full hostname instead of second level domain')
    parser.add_argument('--assume_unique', action="store_true", default=False, help='assume aggregation entry only appears in a continuous run in the CDX file(s) (OK for single, sorted CDX with --fullhost)')
    parser.add_argument('--format', choices=['cdxj', 'cdx7', 'cdxNbams', 'cdxNbamskrMSVg'], help='force use of cdx format (cdxNbams = N b a m s)')
    # The default is fixed to utf-8 and does not depend on the locale, so the
    # help text states exactly that.
    parser.add_argument('--encoding', action="store", default='utf-8', help='encoding of the CDX files, e.g. iso-8859-1 (default: utf-8). All CDX files have to have the same encoding')
    parser.add_argument('file', nargs='*', help='cdx file (can be several)')
    return parser


if __name__ == '__main__':
    args = build_arg_parser().parse_args()
    dowork(args)