-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathfilename_processing.py
49 lines (40 loc) · 1.29 KB
/
filename_processing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os
import tarfile
import shutil
import re
import csv
import gzip
# Unpack every ``*.gz`` file found in the input directory into ``outdir`` and
# record, per input file, the member names the archive originally contained.
#
# Each ``.gz`` file is first tried as a (compressed) tar archive: all member
# names are recorded and only members whose name ends in ``.tex`` are
# extracted.  If the file cannot be handled as a tar archive, its gzip payload
# is written out as a single ``<name>.tex`` file and an empty member list is
# recorded for it.  Finally the recorded names are written, one row per input
# file, to ``original_contents_of_gz_files.csv``.

# Matches member names that end in ".tex".
reT = re.compile(r'\.tex$')
# The input directory is handled as bytes, so os.listdir() yields bytes names.
directory = os.fsencode("1801")

# Start every run from an empty output directory.  Only "already exists" is
# treated as recoverable; any other failure (permissions, Ctrl-C, ...) must
# surface instead of triggering a recursive delete.
try:
    os.mkdir('outdir')
except FileExistsError:
    shutil.rmtree('outdir')
    os.mkdir('outdir')

# The archives are external data.  Where the interpreter provides extraction
# filters, use the 'data' filter so members cannot escape 'outdir' (absolute
# paths, "..", links pointing outside).  Older interpreters without the
# feature keep the previous unfiltered behaviour.
extract_options = {'filter': 'data'} if hasattr(tarfile, 'data_filter') else {}

# Maps the input file name (without its trailing ".gz") to the list of member
# names found in the archive; the list is empty for non-tar inputs.
file_name_dict = {}

for i, filename in enumerate(os.listdir(directory), start=1):
    # Progress indicator: counts every directory entry, not only .gz files.
    if i % 100 == 0:
        print(i)
    if not filename.endswith(b".gz"):
        continue

    location = os.path.join(directory, filename)
    # Strip the 3-character ".gz" suffix; this is the CSV key and output stem.
    filename = filename.decode('utf-8')[:-3]
    try:
        # The context manager guarantees the archive handle is closed even if
        # listing or extraction fails part-way through.
        with tarfile.open(location, 'r') as t:
            members = t.getmembers()
            file_name_dict[filename] = [m.name for m in members]
            t.extractall(
                'outdir',
                members=[m for m in members if reT.search(m.name)],
                **extract_options
            )
    except tarfile.TarError:
        # Any tar-level failure (most commonly ReadError because the payload
        # is not a tar archive at all) falls back to treating the file as a
        # single gzip-compressed TeX source.  Errors raised by this fallback
        # itself are not caught and will stop the run.
        outfilename = os.path.join('outdir', filename + ".tex")
        file_name_dict[filename] = []
        with gzip.open(location, 'rb') as f_in, open(outfilename, 'wb') as f_out:
            shutil.copyfileobj(f_in, f_out)

# One row per input file: the key followed by its member names.  csv.writer
# quotes names containing commas or quotes, which the previous hand-built rows
# did not; newline='' plus an explicit '\n' terminator keeps LF row endings.
with open("original_contents_of_gz_files.csv", 'w', newline='') as outfile:
    writer = csv.writer(outfile, lineterminator='\n')
    for key, names in file_name_dict.items():
        writer.writerow([key, *names])