-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathtarfiles.py
143 lines (110 loc) · 4.08 KB
/
tarfiles.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
#!/usr/bin/env python
import hashlib
import os
import tarfile

import click
# Root click command group; the sub-commands in this file attach to it via @cli.command.
cli = click.Group()
# ### Make a tarball file
# ```bash
# # Make a tarball file for sharing
# CURRENT_DATE=$(date +%Y%m%d)
# python3 tarfiles.py graph-data biomedgps-graph-data-v${CURRENT_DATE}.tar.gz
# python3 tarfiles.py initial-embeddings biomedgps-initial-embeddings-v${CURRENT_DATE}.tar.gz
# # Upload the tarball file to Google Drive or another shared storage location.
# ```
# Wrap all essential data files into a tarball
# Files and directories bundled by the `graph-data` command.
# Paths are relative to the directory containing this script; the order here
# is the order in which entries are checksummed and added to the archive.
graph_data_files = [
    # Tabular exports
    "graph_data/entities.tsv",
    "graph_data/relations.tsv",
    "graph_data/knowledge_graph.tsv",
    "graph_data/annotated_knowledge_graph.tsv",
    # Directories (added recursively)
    "graph_data/relations",
    "graph_data/formatted_relations",
    "graph_data/formatted_entities",
    "graph_data/extracted_entities",
    "graph_data/entities",
    "graph_data/custom_relations",
    # DB
    "ontology_matcher_cache.sqlite",
]
# Files and directories bundled by the `initial-embeddings` command.
# Paths are relative to the directory containing this script.
initial_embedding_files = [
    "embeddings",
]
def list_files(path):
    """Return the paths of every non-directory entry found beneath ``path``.

    The tree is traversed with ``os.walk``. Each result is the containing
    directory joined with the entry name, so results carry the same prefix
    (absolute or relative) as ``path``. Directories themselves are not listed.
    """
    return [
        os.path.join(dirpath, name)
        for dirpath, _subdirs, names in os.walk(path)
        for name in names
    ]
# Check if all files exist
def check_files(files):
    """Report whether every path in ``files`` exists.

    Stops at the first missing path, prints a message naming it, and returns
    ``False``. Returns ``True`` when nothing is missing, including when
    ``files`` is empty.
    """
    first_missing = next(
        (candidate for candidate in files if not os.path.exists(candidate)),
        None,
    )
    if first_missing is None:
        return True
    print(f"File {first_missing} does not exist!")
    return False
# Create a tarball
def create_tarball(files, destfile):
    """Write a gzip-compressed tar archive containing ``files``.

    Each path is stored under its final path component only, so
    ``/data/graph_data/entities.tsv`` is archived as ``entities.tsv`` and a
    directory ``/data/embeddings`` as ``embeddings/...`` (``tarfile`` adds
    directories recursively).

    Args:
        files: Iterable of file or directory paths to add.
        destfile: Path of the archive to create; it is opened in ``"w:gz"``
            mode, so an existing file at that path is overwritten.
    """
    # Create a new tarball in write and gzip mode
    with tarfile.open(destfile, "w:gz") as tar:
        for file in files:
            print(f"Adding {file} to tarball...")
            # Normalise before taking the basename: for "some/dir/" a plain
            # os.path.basename() returns "", which would store the directory's
            # contents at the archive root instead of under "dir/".
            member_name = os.path.basename(os.path.normpath(file))
            # Add each file/directory to the tarball
            tar.add(file, arcname=member_name)
# Generate md5sum for a file
def compute_md5sum(filepath):
    """Return the MD5 digest of ``filepath`` as a lowercase hex string.

    The digest is computed in-process with ``hashlib`` instead of shelling out
    to an external ``md5sum`` binary. This makes it work for paths containing
    spaces or shell metacharacters and on systems without ``md5sum``, and a
    failure surfaces as an exception rather than an empty checksum.

    Args:
        filepath: Path of the file to hash.

    Returns:
        The 32-character hexadecimal MD5 digest of the file's contents.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    digest = hashlib.md5()
    with open(filepath, "rb") as stream:
        # Read fixed-size chunks so large data files are never held in memory whole.
        for chunk in iter(lambda: stream.read(1024 * 1024), b""):
            digest.update(chunk)
    return digest.hexdigest()
def list_files_md5sum(file_lst):
    """Compute an MD5 checksum for every file named by ``file_lst``.

    Entries that are directories are expanded with ``list_files``; any other
    entry is hashed as given. A progress line is printed before each file is
    hashed.

    Returns:
        A list of ``{"file": path, "md5sum": digest}`` dicts, in input order
        (with directory contents in ``list_files`` order).
    """
    results = []
    for entry in file_lst:
        if os.path.isdir(entry):
            targets = list_files(entry)
        else:
            targets = [entry]
        for target in targets:
            print(f"Computing md5sum for {target}...")
            digest = compute_md5sum(target)
            results.append({"file": target, "md5sum": digest})
    return results
def relative_path(path, dest_dir):
    """Express ``path`` relative to the directory ``dest_dir``."""
    relative = os.path.relpath(path, start=dest_dir)
    return relative
@cli.command(help="Wrap all essential graph data files into a tarball")
@click.argument("destfile", type=click.Path(exists=False))
def graph_data(destfile):
    """Bundle the graph data files plus a checksum manifest into ``destfile``.

    Every entry of ``graph_data_files`` is resolved against the directory
    containing this script. A temporary ``graph_data/md5sum.txt`` manifest
    (one ``<md5> <path relative to the script directory>`` line per file) is
    written, added to the archive together with the data, and removed again.

    Args:
        destfile: Path of the ``.tar.gz`` archive to create.

    Raises:
        click.ClickException: If a required file is missing, so the command
            exits with a non-zero status instead of reporting success.
    """
    current_dir = os.path.abspath(os.path.dirname(__file__))
    md5file = os.path.join(current_dir, "graph_data", "md5sum.txt")
    graph_data_filepaths = [
        os.path.join(current_dir, file) for file in graph_data_files
    ]

    print("Creating tarball...")
    print("Checking if all files exist...")
    if not check_files(graph_data_filepaths):
        # check_files already printed which path is missing; fail loudly so a
        # calling script does not go on to upload a tarball that was never built.
        raise click.ClickException("Required graph data files are missing.")
    print("All files exist!")

    # Hash first, so a failure here never leaves a partial manifest behind.
    checksums = list_files_md5sum(graph_data_filepaths)

    try:
        with open(md5file, "w") as f:
            for item in checksums:
                filename = relative_path(item["file"], current_dir)
                f.write(f"{item['md5sum']} {filename}\n")

        create_tarball(graph_data_filepaths + [md5file], destfile)
    finally:
        # Remove md5sum.txt, even when writing the manifest or archive failed
        if os.path.exists(md5file):
            os.remove(md5file)
@cli.command(help="Wrap all initial embedding files into a tarball")
@click.argument("destfile", type=click.Path(exists=False))
def initial_embeddings(destfile):
    """Bundle the initial embedding files plus a checksum manifest into ``destfile``.

    Every entry of ``initial_embedding_files`` is resolved against the
    directory containing this script. A temporary ``embeddings/md5sum.txt``
    manifest (one ``<md5> <path relative to the script directory>`` line per
    file) is written, added to the archive together with the data, and removed
    again.

    Args:
        destfile: Path of the ``.tar.gz`` archive to create.

    Raises:
        click.ClickException: If a required file is missing, so the command
            exits with a non-zero status instead of reporting success.
    """
    current_dir = os.path.abspath(os.path.dirname(__file__))
    md5file = os.path.join(current_dir, "embeddings", "md5sum.txt")
    initial_embedding_filepaths = [
        os.path.join(current_dir, file) for file in initial_embedding_files
    ]

    print("Creating tarball...")
    print("Checking if all files exist...")
    if not check_files(initial_embedding_filepaths):
        # check_files already printed which path is missing; fail loudly so a
        # calling script does not go on to upload a tarball that was never built.
        raise click.ClickException("Required embedding files are missing.")
    print("All files exist!")

    # The manifest lives inside embeddings/, the very directory being hashed.
    # Hash BEFORE creating it, and skip any stale copy left by an earlier run,
    # otherwise the manifest would list a checksum of its own empty,
    # half-written self that can never verify.
    checksums = [
        item
        for item in list_files_md5sum(initial_embedding_filepaths)
        if item["file"] != md5file
    ]

    try:
        with open(md5file, "w") as f:
            for item in checksums:
                filename = relative_path(item["file"], current_dir)
                f.write(f"{item['md5sum']} {filename}\n")

        # NOTE: the manifest is still archived twice, as before: inside the
        # embeddings/ directory and again at the archive root.
        create_tarball(initial_embedding_filepaths + [md5file], destfile)
    finally:
        # Remove md5sum.txt, even when writing the manifest or archive failed
        if os.path.exists(md5file):
            os.remove(md5file)
# Dispatch to the click command group only when executed as a script.
if __name__ == "__main__":
    cli()