preprocess.py
import argparse
import os

import dgl
import numpy as np
import pandas as pd
import torch
from tqdm import trange

from data_util import _iterate_datasets, load_data, load_split_edges
from utils import timeit

def load_data_var(dataset, task="node"):
if task == "node":
# For node classification, we don't remove unseen nodes from the
# training set.
edges, nodes = load_data(dataset, mode="format")
else:
train_edges, val_edges, test_edges, nodes = \
load_split_edges(dataset=dataset)
edges = pd.concat([train_edges, val_edges, test_edges])
    # Shift every id by one so that index 0 is reserved for the padding node.
    id2idx = {row.node_id: row.id_map + 1 for row in nodes.itertuples()}
edges["from_node_id"] = edges["from_node_id"].map(id2idx)
edges["to_node_id"] = edges["to_node_id"].map(id2idx)
    # Prepend an all-zero row so that edge index 0 is the padding interaction.
    pad = pd.DataFrame(columns=edges.columns)
    pad.loc[0] = [0] * len(edges.columns)
    pad = pad.astype(edges.dtypes)
    edges = pd.concat([pad, edges], axis=0).reset_index(drop=True)
return edges
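
# Note on the assumed frame layout: the first columns of `edges` are
# from_node_id, to_node_id and timestamp; columns from index 4 onward, when
# present, are treated as edge features by edge2dgl below. After
# load_data_var, row 0 is the all-zero padding interaction and every node id
# is shifted by one so that id 0 stays free for padding.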
def edge2dgl(edges):
src = edges["from_node_id"]
dst = edges["to_node_id"]
    etime = torch.tensor(edges["timestamp"].to_numpy())
    if len(edges.columns) > 4:
        efeat = torch.tensor(edges.iloc[:, 4:].to_numpy())
    else:
        efeat = torch.zeros((len(edges), 2))
    # Insert each interaction in both directions, interleaving (src, dst) and
    # (dst, src) pairs so that both copies share a timestamp and feature row.
    u = np.vstack((src, dst)).transpose().flatten()
    v = np.vstack((dst, src)).transpose().flatten()
    src, dst = u, v
    etime = etime.repeat_interleave(2)
    efeat = efeat.repeat_interleave(2, dim=0)
g = dgl.graph((src, dst))
g.edata["timestamp"] = etime
g.edata["efeat"] = efeat
return g
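
# Usage sketch (hedged): g = edge2dgl(load_data_var("JODIE-wikipedia", "node"))
# yields a graph with 2 * len(edges) directed edges, because every interaction
# is inserted in both directions with its timestamp and feature row duplicated.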
def init_adj(edges: pd.DataFrame):
src_l = edges["from_node_id"].to_numpy()
dst_l = edges["to_node_id"].to_numpy()
ts_l = edges["timestamp"].to_numpy()
eidx_l = np.arange(len(src_l))
max_idx = max(src_l.max(), dst_l.max())
adj_list = [[] for _ in range(max_idx + 1)]
for src, dst, eidx, ts in zip(src_l, dst_l, eidx_l, ts_l):
adj_list[src].append((dst, eidx, ts))
adj_list[dst].append((src, eidx, ts))
return adj_list
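
# Worked example (illustrative): for edges (1, 2, t0) and (1, 3, t1), init_adj
# returns adj_list[1] = [(2, 0, t0), (3, 1, t1)], adj_list[2] = [(1, 0, t0)]
# and adj_list[3] = [(1, 1, t1)], i.e. each interaction is stored from both
# endpoints as (neighbor, edge_index, timestamp).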
def init_offset(edges: pd.DataFrame):
adj_list = init_adj(edges)
n_idx_l = []
n_ts_l = []
e_idx_l = []
off_set_l = [0]
for i in range(len(adj_list)):
curr = adj_list[i]
curr = sorted(curr, key=lambda x: x[1])
n_idx_l.extend([x[0] for x in curr])
e_idx_l.extend([x[1] for x in curr])
n_ts_l.extend([x[2] for x in curr])
off_set_l.append(len(n_idx_l))
n_idx_l = np.array(n_idx_l)
n_ts_l = np.array(n_ts_l)
e_idx_l = np.array(e_idx_l)
off_set_l = np.array(off_set_l)
assert len(n_idx_l) == len(n_ts_l)
assert off_set_l[-1] == len(n_ts_l)
return n_idx_l, n_ts_l, e_idx_l, off_set_l
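
# The four arrays form a CSR-style index over temporal neighborhoods: the
# neighbors of node u are n_idx_l[off_set_l[u]:off_set_l[u + 1]], with the
# matching timestamps in n_ts_l and edge indices in e_idx_l, sorted by edge
# index (chronological when the input edges arrive sorted by timestamp,
# which this pipeline assumes).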
def subgraph_np(ngh_node_l, ngh_ts_l, ngh_eidx_l, offset_l, m=20):
"""We transform the latest M interactions into an interaction graph
sequentially, where the node adjacency matrix is `(B, K, K)` and the
node-edge adjacency matrix is `(B, K, M)`. To combine the node features
and edge features, we further provide a `(B, K)` node identity vector NID,
and a `(B, M)` edge identity vector EID. The final mesaage-passing
functions as `h^1 = W_node * A_node * h^0[NID] + W_edge * A_edge * e[EID]`
and `h^{l+1} = W_node * A_node * h^l + W_edge * A_edge * e[EID]`.
We employ an attention layer for cross-layer subgraph pooling as
`h_u = \sum_i a_{ui} h^L_{i}`, where `a_{ui}` is the attention score.
"""
def sequence2graph(ngh_node, ngh_eidx):
"""We often make bugs when writing a lot of codes in a function.
Thus, we move the function of transforming a sequence into a graph
in a internal function. The inputs are all (M,) arrays, and we return
two adjacency matrices, node2node and edge2node, and two identity
vectors, containing original ids of nodes and edges.
"""
        num_nodes = ngh_node.shape[0]  # M interactions touch at most M nodes
        num_edges = ngh_eidx.shape[0]
        node2node = np.eye(num_nodes)  # self-loops on the diagonal
        # node2node = np.zeros((num_nodes, num_nodes))  # no-self-loop variant
edge2node = np.zeros((num_nodes, num_edges))
node_ids = np.unique(ngh_node)
inv_nid = {nid: idx for idx, nid in enumerate(node_ids)}
edge_ids = ngh_eidx
inv_eid = {eid: idx for idx, eid in enumerate(edge_ids)}
        for src, dst in zip(ngh_node[:-1],
                            ngh_node[1:]):  # sequential node-to-node links
            src_idx, dst_idx = inv_nid[src], inv_nid[dst]
            node2node[dst_idx, src_idx] = 1.0
            # node2node[src_idx, dst_idx] = 1.0  # inverse link
for dst, edge in zip(ngh_node, ngh_eidx): # edge to node
dst_idx, eidx = inv_nid[dst], inv_eid[edge]
edge2node[dst_idx, eidx] = 1.0
return node2node, node_ids, edge2node, edge_ids
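
    # Worked example (illustrative): for ngh_node = [5, 7, 5] and
    # ngh_eidx = [2, 9, 11], node_ids = [5, 7] and inv_nid = {5: 0, 7: 1},
    # so node2node is eye(3) plus node2node[1, 0] = node2node[0, 1] = 1
    # (the links 5 -> 7 and 7 -> 5), while edge2node[0, 0] = edge2node[1, 1]
    # = edge2node[0, 2] = 1 maps each edge to its destination node.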
    # Unused slots are filled with padding node 0 and padding edge 0.
    k = m  # m interactions touch at most m distinct neighbors
adj_n2n = [] # (E, K, K)
adj_nids = [] # (E, K)
adj_e2n = [] # (E, K, M)
adj_eids = [] # (E, M)
for nid in trange(len(offset_l) - 1):
start, end = offset_l[nid], offset_l[nid + 1]
for j in range(start, end):
left = max(start, j - m + 1)
right = j + 1
ngh_node = ngh_node_l[left:right]
ngh_eidx = ngh_eidx_l[left:right]
n2n, nids, e2n, eids = sequence2graph(ngh_node, ngh_eidx)
            # np.int was removed in NumPy 1.24; use a concrete integer dtype.
            row_n2n = np.zeros((k, k))
            row_nids = np.zeros((k,), dtype=np.int64)
            row_e2n = np.zeros((k, m))
            row_eids = np.zeros((m,), dtype=np.int64)
row_n2n[:len(n2n), :len(n2n)] = n2n
row_nids[:len(nids)] = nids
row_e2n[:e2n.shape[0], :e2n.shape[1]] = e2n
row_eids[:len(eids)] = eids
adj_n2n.append(row_n2n)
adj_nids.append(row_nids)
adj_e2n.append(row_e2n)
adj_eids.append(row_eids)
mat_n2n = np.stack(adj_n2n, axis=0)
mat_nids = np.stack(adj_nids, axis=0)
mat_e2n = np.stack(adj_e2n, axis=0)
mat_eids = np.stack(adj_eids, axis=0)
assert len(mat_n2n) == len(ngh_node_l)
return mat_n2n, mat_nids, mat_e2n, mat_eids
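
# Output shapes (following the code above): with E = len(ngh_node_l) total
# (node, interaction) pairs and K = M = m, mat_n2n is (E, K, K), mat_nids is
# (E, K), mat_e2n is (E, K, M) and mat_eids is (E, M), all zero-padded.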
def subgraph_dgl(ngh_l, nts_l, eidx_l, offset_l, m=20):
# padding node 0, and padding edge 0
adj_subgraphs = []
for nid in trange(len(offset_l) - 1):
start, end = offset_l[nid], offset_l[nid + 1]
for j in range(start, end):
# We transform these interactions into two adjacency matrices,
# along with two identity vectors.
            # Keep the window inside this node's own neighborhood, mirroring
            # subgraph_np (which uses max(start, j - m + 1)).
            slice_ngh = np.arange(max(start, j - m + 1), j + 1)
row_nid = np.unique(ngh_l[slice_ngh])
# We allow parallel edges in edge2node matrix.
row_eid = eidx_l[slice_ngh][1:]
if len(slice_ngh) > 1: # ensure >= 2 interactions
inv_nid = {ngh_id: idx for idx, ngh_id in enumerate(row_nid)}
src, dst = ngh_l[slice_ngh][:-1], ngh_l[slice_ngh][1:]
map_src = [inv_nid[sid] for sid in src]
map_dst = [inv_nid[did] for did in dst]
sg = dgl.graph((map_src, map_dst))
sg.ndata[dgl.NID] = torch.LongTensor(row_nid)
sg.edata[dgl.EID] = torch.LongTensor(row_eid)
            else:
                # A single interaction yields a one-node graph with a
                # self-loop; dgl.graph needs the local index 0 here, while
                # the raw node id is stored in the NID field.
                sg = dgl.graph(([0], [0]))
                sg.ndata[dgl.NID] = torch.LongTensor(row_nid)
                sg.edata[dgl.EID] = torch.LongTensor(eidx_l[slice_ngh])
adj_subgraphs.append(sg)
return adj_subgraphs
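
# The cached DGL subgraphs can later be restored with dgl.load_graphs, e.g.
#   graphs, _ = dgl.load_graphs("subgraph_cache/edge-<dataset>-20.dgl")
# (the path layout comes from interaction2subgraph below).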
@timeit
def interaction2subgraph(dataset, m=20, task="edge", dgl_type=False):
"""For each (node, timestamp) pair, we construct an interaction graph,
which contains the latest `k` neighbors and `m` interactions, resuling in
a `k*k` adjacency matrix. We store the graph in a adajacency list due to
the potential parallel edge between nodes. The requirement for a fixed
number of nodes is for the convenience of graph neural networks.
The graph is padded with `(0, 0)` interaction at time `0`. For nodes with
less than `k` neighbors, we leave all other nodes as the padding node `0`.
"""
    edges = load_data_var(dataset, task)
    ngh_l, nts_l, eidx_l, offset_l = init_offset(edges)
    # Create the cache directory up front so the save calls cannot fail.
    os.makedirs("subgraph_cache", exist_ok=True)
    if dgl_type:
        adj_subgraphs = subgraph_dgl(ngh_l, nts_l, eidx_l, offset_l, m)
        path = f"subgraph_cache/{task}-{dataset}-{m}.dgl"
        dgl.save_graphs(path, adj_subgraphs)
    else:
        mat_n2n, mat_nids, mat_e2n, mat_eids = subgraph_np(
            ngh_l, nts_l, eidx_l, offset_l, m)
        path = f"subgraph_cache/{task}-{dataset}-{m}.npz"
        np.savez(path, mat_n2n=mat_n2n, mat_nids=mat_nids,
                 mat_e2n=mat_e2n, mat_eids=mat_eids)
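
# The .npz cache reloads with numpy, e.g. (path layout as above):
#   cache = np.load("subgraph_cache/edge-<dataset>-20.npz")
#   mat_n2n, mat_nids = cache["mat_n2n"], cache["mat_nids"]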
if __name__ == "__main__":
parser = argparse.ArgumentParser(
description="Helper for preprocessing interaction subgraphs.")
parser.add_argument("-t", "--task", default="edge", choices=["edge", "node"])
parser.add_argument("-dt", "--dgl-type", action="store_true")
parser.add_argument("-m", "--m", default=20, type=int)
args = parser.parse_args()
if args.task == "node":
datasets = ["JODIE-wikipedia", "JODIE-mooc", "JODIE-reddit"]
else:
datasets = _iterate_datasets()[:13]
for data in datasets:
print(data)
interaction2subgraph(data, args.m, args.task, args.dgl_type)
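
# Example invocation (hedged; available datasets depend on data_util):
#   python preprocess.py --task node --dgl-type -m 20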