data_processing.py

import random

import numpy as np
import pandas as pd


class Data:
  """Container for a set of temporal interactions (edges) plus basic statistics."""

  def __init__(self, sources, destinations, timestamps, edge_idxs, labels):
    self.sources = sources
    self.destinations = destinations
    self.timestamps = timestamps
    self.edge_idxs = edge_idxs
    self.labels = labels
    self.n_interactions = len(sources)
    self.unique_nodes = set(sources) | set(destinations)
    self.n_unique_nodes = len(self.unique_nodes)
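
# A minimal usage sketch of the Data container (illustrative only: the toy
# arrays below are made up; real inputs come from the preprocessed ml_* files):
#
#   toy = Data(sources=np.array([1, 2]), destinations=np.array([3, 3]),
#              timestamps=np.array([0.0, 1.0]), edge_idxs=np.array([0, 1]),
#              labels=np.array([0, 0]))
#   toy.n_interactions  # -> 2
#   toy.n_unique_nodes  # -> 3 (nodes 1, 2 and 3)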


def get_data_node_classification(dataset_name, use_validation=False):
  ### Load data and do a chronological train / val / test split
  graph_df = pd.read_csv('./data/ml_{}.csv'.format(dataset_name))
  edge_features = np.load('./data/ml_{}.npy'.format(dataset_name))
  node_features = np.load('./data/ml_{}_node.npy'.format(dataset_name))

  # Split boundaries: the first 70% of interactions (by timestamp) are used for
  # training, the next 15% for validation and the final 15% for testing.
  val_time, test_time = list(np.quantile(graph_df.ts, [0.70, 0.85]))
sources = graph_df.u.values
destinations = graph_df.i.values
edge_idxs = graph_df.idx.values
labels = graph_df.label.values
timestamps = graph_df.ts.values
  random.seed(2020)

  # Without a validation set, training runs up to test_time and the validation
  # mask simply mirrors the test mask.
  train_mask = timestamps <= val_time if use_validation else timestamps <= test_time
  test_mask = timestamps > test_time
  val_mask = np.logical_and(timestamps <= test_time, timestamps > val_time) if use_validation else test_mask
full_data = Data(sources, destinations, timestamps, edge_idxs, labels)
train_data = Data(sources[train_mask], destinations[train_mask], timestamps[train_mask],
edge_idxs[train_mask], labels[train_mask])
val_data = Data(sources[val_mask], destinations[val_mask], timestamps[val_mask],
edge_idxs[val_mask], labels[val_mask])
test_data = Data(sources[test_mask], destinations[test_mask], timestamps[test_mask],
edge_idxs[test_mask], labels[test_mask])
return full_data, node_features, edge_features, train_data, val_data, test_data
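
# A minimal usage sketch (illustrative only; 'wikipedia' stands in for any
# dataset name whose ml_* files exist under ./data):
#
#   full_data, node_features, edge_features, train_data, val_data, test_data = \
#       get_data_node_classification('wikipedia', use_validation=True)
#   print(train_data.n_interactions, val_data.n_interactions, test_data.n_interactions)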


def get_data(dataset_name, different_new_nodes_between_val_and_test=False, randomize_features=False):
  ### Load data and do a chronological train / val / test split
  graph_df = pd.read_csv('../10-tgn/data/ml_{}.csv'.format(dataset_name))
  edge_features = np.load('../10-tgn/data/ml_{}.npy'.format(dataset_name))
  node_features = np.load('../10-tgn/data/ml_{}_node.npy'.format(dataset_name))

  # Optionally replace the node features with random ones (feature ablation)
  if randomize_features:
    node_features = np.random.rand(node_features.shape[0], node_features.shape[1])

  val_time, test_time = list(np.quantile(graph_df.ts, [0.70, 0.85]))
sources = graph_df.u.values
destinations = graph_df.i.values
edge_idxs = graph_df.idx.values
labels = graph_df.label.values
timestamps = graph_df.ts.values
full_data = Data(sources, destinations, timestamps, edge_idxs, labels)
random.seed(2020)
node_set = set(sources) | set(destinations)
n_total_unique_nodes = len(node_set)
# Compute nodes which appear at test time
test_node_set = set(sources[timestamps > val_time]).union(
set(destinations[timestamps > val_time]))
  # Sample nodes which we keep as new nodes (to test inductiveness), so that we have to remove all
  # their edges from training. Sorting first keeps the sample deterministic and works on
  # Python 3.11+, where random.sample no longer accepts sets.
  new_test_node_set = set(random.sample(sorted(test_node_set), int(0.1 * n_total_unique_nodes)))
# Mask saying for each source and destination whether they are new test nodes
new_test_source_mask = graph_df.u.map(lambda x: x in new_test_node_set).values
new_test_destination_mask = graph_df.i.map(lambda x: x in new_test_node_set).values
  # Mask that is true only for edges whose source and destination are both not new test nodes
  # (we want to remove all edges involving any new test node from training)
  observed_edges_mask = np.logical_and(~new_test_source_mask, ~new_test_destination_mask)
# For train we keep edges happening before the validation time which do not involve any new node
# used for inductiveness
train_mask = np.logical_and(timestamps <= val_time, observed_edges_mask)
train_data = Data(sources[train_mask], destinations[train_mask], timestamps[train_mask],
edge_idxs[train_mask], labels[train_mask])
  # Define the new node sets used for testing the inductiveness of the model
  train_node_set = set(train_data.sources).union(train_data.destinations)
assert len(train_node_set & new_test_node_set) == 0
new_node_set = node_set - train_node_set
val_mask = np.logical_and(timestamps <= test_time, timestamps > val_time)
test_mask = timestamps > test_time
  if different_new_nodes_between_val_and_test:
    # Split the held-out nodes into two disjoint halves: one for validation, one for test
    n_new_nodes = len(new_test_node_set) // 2
    val_new_node_set = set(list(new_test_node_set)[:n_new_nodes])
    test_new_node_set = set(list(new_test_node_set)[n_new_nodes:])
edge_contains_new_val_node_mask = np.array(
[(a in val_new_node_set or b in val_new_node_set) for a, b in zip(sources, destinations)])
edge_contains_new_test_node_mask = np.array(
[(a in test_new_node_set or b in test_new_node_set) for a, b in zip(sources, destinations)])
new_node_val_mask = np.logical_and(val_mask, edge_contains_new_val_node_mask)
new_node_test_mask = np.logical_and(test_mask, edge_contains_new_test_node_mask)
else:
edge_contains_new_node_mask = np.array(
[(a in new_node_set or b in new_node_set) for a, b in zip(sources, destinations)])
new_node_val_mask = np.logical_and(val_mask, edge_contains_new_node_mask)
new_node_test_mask = np.logical_and(test_mask, edge_contains_new_node_mask)
# validation and test with all edges
val_data = Data(sources[val_mask], destinations[val_mask], timestamps[val_mask],
edge_idxs[val_mask], labels[val_mask])
test_data = Data(sources[test_mask], destinations[test_mask], timestamps[test_mask],
edge_idxs[test_mask], labels[test_mask])
  # validation and test restricted to edges that have at least one new node (not seen in training)
new_node_val_data = Data(sources[new_node_val_mask], destinations[new_node_val_mask],
timestamps[new_node_val_mask],
edge_idxs[new_node_val_mask], labels[new_node_val_mask])
new_node_test_data = Data(sources[new_node_test_mask], destinations[new_node_test_mask],
timestamps[new_node_test_mask], edge_idxs[new_node_test_mask],
labels[new_node_test_mask])
print("The dataset has {} interactions, involving {} different nodes".format(full_data.n_interactions,
full_data.n_unique_nodes))
print("The training dataset has {} interactions, involving {} different nodes".format(
train_data.n_interactions, train_data.n_unique_nodes))
print("The validation dataset has {} interactions, involving {} different nodes".format(
val_data.n_interactions, val_data.n_unique_nodes))
print("The test dataset has {} interactions, involving {} different nodes".format(
test_data.n_interactions, test_data.n_unique_nodes))
print("The new node validation dataset has {} interactions, involving {} different nodes".format(
new_node_val_data.n_interactions, new_node_val_data.n_unique_nodes))
print("The new node test dataset has {} interactions, involving {} different nodes".format(
new_node_test_data.n_interactions, new_node_test_data.n_unique_nodes))
print("{} nodes were used for the inductive testing, i.e. are never seen during training".format(
len(new_test_node_set)))
return node_features, edge_features, full_data, train_data, val_data, test_data, \
new_node_val_data, new_node_test_data
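
# A minimal usage sketch (illustrative only; assumes the ml_wikipedia files
# exist under ../10-tgn/data):
#
#   (node_features, edge_features, full_data, train_data, val_data, test_data,
#    new_node_val_data, new_node_test_data) = get_data('wikipedia')
#   print(new_node_test_data.n_interactions, 'inductive test interactions')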


def compute_time_statistics(sources, destinations, timestamps):
  """Compute the mean and standard deviation of inter-event times, separately for
  nodes in the source role and nodes in the destination role."""
  last_timestamp_sources = dict()
  last_timestamp_dst = dict()
  all_timediffs_src = []
  all_timediffs_dst = []
  for k in range(len(sources)):
    source_id = sources[k]
    dest_id = destinations[k]
    c_timestamp = timestamps[k]
    # Nodes seen for the first time are treated as if last seen at time 0
    if source_id not in last_timestamp_sources:
      last_timestamp_sources[source_id] = 0
    if dest_id not in last_timestamp_dst:
      last_timestamp_dst[dest_id] = 0
    all_timediffs_src.append(c_timestamp - last_timestamp_sources[source_id])
    all_timediffs_dst.append(c_timestamp - last_timestamp_dst[dest_id])
    last_timestamp_sources[source_id] = c_timestamp
    last_timestamp_dst[dest_id] = c_timestamp
assert len(all_timediffs_src) == len(sources)
assert len(all_timediffs_dst) == len(sources)
mean_time_shift_src = np.mean(all_timediffs_src)
std_time_shift_src = np.std(all_timediffs_src)
mean_time_shift_dst = np.mean(all_timediffs_dst)
std_time_shift_dst = np.std(all_timediffs_dst)
return mean_time_shift_src, std_time_shift_src, mean_time_shift_dst, std_time_shift_dst
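
# A minimal usage sketch (illustrative only; train_data is a Data instance as
# returned by get_data above):
#
#   mean_src, std_src, mean_dst, std_dst = compute_time_statistics(
#       train_data.sources, train_data.destinations, train_data.timestamps)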