-
Notifications
You must be signed in to change notification settings - Fork 0
/
datafile.py
97 lines (82 loc) · 2.59 KB
/
datafile.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
def count_from_db(cur, start_id):
    """Return the number of datafile rows whose id is greater than ``start_id``.

    Args:
        cur: Database cursor exposing ``execute`` and ``fetchall``. The rows
            it returns must support lookup by column name (``row["total"]``).
        start_id: Exclusive lower bound on ``tardis_portal_datafile.id``.

    Returns:
        The ``total`` column of the first row returned by the count query.
    """
    count_sql = '''
select count(id) as total
from tardis_portal_datafile
where id > %s
'''
    cur.execute(count_sql, [start_id])
    # An aggregate without GROUP BY yields a single row; read it from the
    # front of the fetched result.
    first_row = cur.fetchall()[0]
    return first_row["total"]
def data_from_db(cur, start_id, rows_bulk, content_type_id=15):
    """Load one batch of datafiles with their experiments and object ACLs.

    The query selects up to ``rows_bulk`` datafiles with ``id > start_id``
    (ordered by id) and left-joins dataset, experiment and ACL tables, so a
    datafile can span several result rows: one per (experiment, ACL) pair.
    Those rows are folded into one document per datafile.

    Args:
        cur: Database cursor exposing ``execute`` and ``fetchall``, using
            ``%s`` placeholders and returning rows addressable by column name.
        start_id: Exclusive lower bound on the datafile id for this batch.
        rows_bulk: Maximum number of datafiles in the batch.
        content_type_id: Value matched against
            ``tardis_portal_objectacl.content_type_id`` to pick ACL rows that
            belong to experiments. Defaults to 15, the value that was
            previously hard-coded in the query.
            NOTE(review): this id is presumably installation-specific --
            confirm it against the target database.

    Returns:
        A ``(data, start)`` tuple. ``data`` maps datafile id to a dict with
        ``filename``, ``created_time``, ``modification_time``, ``dataset``
        (a one-element list holding the dataset id) and ``experiments`` (a
        dict keyed by experiment id; each value carries ``id``,
        ``public_access`` and an ``objectacls`` list). ``start`` is the
        largest datafile id seen, or ``start_id`` unchanged when the query
        returned no rows.
    """
    q = '''
select
df.id as df_id,
df.filename,
df.created_time,
df.modification_time,
df.dataset_id,
e.id as experiment_id,
e.public_access,
a."pluginId" as plugin_id,
a."entityId" as entity_id
from tardis_portal_datafile df
left join tardis_portal_dataset ds
on ds.id = df.dataset_id
left join tardis_portal_dataset_experiments dse
on dse.dataset_id = ds.id
left join tardis_portal_experiment e
on e.id = dse.experiment_id
left join tardis_portal_objectacl a
on a.content_type_id = %s and a.object_id = e.id
where df.id in (
select id
from tardis_portal_datafile
where id > %s
order by id
limit %s
)
'''
    # Parameter order follows placeholder order in the query text.
    cur.execute(q, [content_type_id, start_id, rows_bulk])
    rows = cur.fetchall()

    data = {}
    start = start_id
    for row in rows:
        df_id = row["df_id"]
        # The outer query has no ORDER BY, so track the maximum explicitly.
        if df_id > start:
            start = df_id

        doc = data.get(df_id)
        if doc is None:
            doc = data[df_id] = {
                'filename': row["filename"],
                'created_time': row["created_time"],
                'modification_time': row["modification_time"],
                'dataset': [{
                    'id': row["dataset_id"]
                }],
                'experiments': {}
            }

        experiment_id = row["experiment_id"]
        if experiment_id is None:
            # The left joins produce a NULL experiment for a datafile that is
            # not attached to any experiment; keep the datafile but do not
            # invent an experiment entry keyed by None.
            continue

        experiments = doc['experiments']
        if experiment_id not in experiments:
            experiments[experiment_id] = {
                'id': experiment_id,
                'public_access': row["public_access"],
                'objectacls': []
            }

        # An experiment without ACL rows still comes back once, with NULL
        # ACL columns; only real ACL rows are recorded.
        if row["plugin_id"] is not None:
            experiments[experiment_id]['objectacls'].append({
                'pluginId': row["plugin_id"],
                'entityId': row["entity_id"]
            })

    return (data, start)
def data_to_es(index_name, data):
    """Yield one bulk-index action dict per datafile document in ``data``.

    Each document's ``experiments`` mapping (keyed by experiment id) is
    flattened into a list of its values for the emitted ``_source``. The
    flattening is done on a shallow copy, so ``data`` is left untouched and
    can be iterated again.

    Args:
        index_name: Value placed in the ``_index`` field of every action.
        data: Mapping of datafile id to a document dict that has an
            ``experiments`` dict.

    Yields:
        Dicts with ``_index``, ``_type``, ``_id`` (the datafile id) and
        ``_source`` (the flattened document).
    """
    for df_id, doc in data.items():
        # Copy before rewriting 'experiments' so the caller's dict keeps its
        # id-keyed mapping; overwriting it in place made a second pass fail.
        source = dict(doc)
        source["experiments"] = list(doc["experiments"].values())
        yield {
            '_index': index_name,
            # NOTE(review): mapping types are deprecated in newer
            # Elasticsearch releases -- confirm the target cluster version.
            '_type': '_doc',
            '_id': df_id,
            '_source': source
        }