forked from mzz235711/Selectivity-Estimation-for-Queries-Containing-Predicates-over-Set-Valued-Attributes
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathclique_trans_table.py
118 lines (112 loc) · 3.7 KB
/
clique_trans_table.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
import pandas as pd
import numpy as np
import re
import queue
import argparse
parser = argparse.ArgumentParser()
parser.add_argument('--partnum', default=5, type=int)
parser.add_argument('--keepnum', default=500, type=int)
args = parser.parse_args()
partnum = args.partnum
keep_num = args.keepnum
idx_file_path = './idx.csv'
idx_file = open(idx_file_path, 'r')
idx_lines = idx_file.readlines()
idx2word = ["" for _ in range(len(idx_lines))]
word2idx = {}
# Change the folrder path for ST or STH
folder_path = './color_partition_{}'.format(partnum, keep_num)
#folder_path = './graph_color'
#folder_path = './color_partition_{}'.format(partnum)
#part_file_path = folder_path + '/approx_clique_part_{}.txt'.format(partnum)
part_file_path = folder_path + '/part.txt'
#part_file_path = './t_{}/approx_clique_part_{}.txt'.format(partnum, partnum)
partid = np.loadtxt(part_file_path, dtype=np.int)
partnum = max(partid) + 1
pad = 'ZZZ'
for line in idx_lines:
w = line.split('\t')[0]
num = int(line.split('\t')[1])
idx2word[num] = w
word2idx[w] = num
idx2word.append(pad)
word2idx[pad] = len(idx_lines)
df = pd.read_csv('./dataset/geotweet/geotweet.csv', escapechar='\\', quotechar='"')
df['tags'].fillna('', inplace=True)
lines = df['tags']
texts = []
avglenth = 0
for line in lines:
#line = line.lower()
#text = re.findall('[a-zA-Z]+', line)
text = line.split(',')
tmp_text = []
for t in text:
if t not in tmp_text:
tmp_text.append(t)
texts.append(tmp_text)
avglenth += len(tmp_text)
avglenth /= len(lines)
print("AvgLenth: ", avglenth)
set_map_file = open(folder_path + '/dis_set_map.txt', 'r')
set_map_lines = set_map_file.readlines()
set_map = [{} for _ in range(partnum)]
line_id = 0
for i in range(partnum):
line = set_map_lines[line_id]
dis_num = int(line.split(' ')[1])
line_id += 1
print(line_id, i, dis_num)
for line_id in range(line_id, line_id + dis_num):
line = set_map_lines[line_id]
line_split = line.split(' ')
line_split = [int(i) for i in line_split]
key = tuple(line_split[1:-1])
value = line_split[-1]
set_map[i][key] = value
#set_map[i][tuple([word2idx[pad]])] = len(set_map[i])
line_id += 1
new_column_data = []
#frequency = []
#for i in range(partnum):
# frequency.append([0] * len(set_map[i]))
for lineid, text in enumerate(texts):
data = [[] for _ in range(partnum)]
has_lfword = 0
for word in text:
if word in word2idx:
wid = word2idx[word]
pid = partid[wid]
data[pid].append(wid)
else:
has_lfword = 1
for i in range(partnum):
if len(data[i]) == 0:
data[i].append(word2idx[pad])
data[i].sort()
if tuple(data[i]) not in set_map[i]:
print(text)
print(lineid)
print(i)
print(data[i])
exit()
set_value = set_map[i][tuple(data[i])]
data[i] = set_value
#frequency[i][set_value] += 1;
data.append(has_lfword)
new_column_data.append(data)
new_column_data = np.array(new_column_data, dtype=np.int)
df.drop(columns=['tags'], inplace=True)
for i in range(partnum):
col_name = 'tags' + str(i)
df[col_name] = pd.Series(new_column_data[:,i], index=df.index)
col_name = 'tags_lfword'
df[col_name] = pd.Series(new_column_data[:,-1], index=df.index)
df.to_csv( folder_path + '/geotweet_approx_clique.csv'.format(partnum, keep_num), index=False)
#frequency_file = open("./partition_{}_{}/frequency.txt".format(partnum, keep_num), "w")
#for i in range(partnum):
# for freq in frequency[i]:
# frequency_file.write(str(freq) + "\n")
#frequency_file.close()
for i in range(partnum):
print(len(set_map[i]))