-
Notifications
You must be signed in to change notification settings - Fork 0
/
loadWebKBFull.m
120 lines (73 loc) · 2.16 KB
/
loadWebKBFull.m
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
% School names and page-label names. The position of a name in these lists
% is the integer code stored in `school` and `label` below.
schools = {'cornell', 'texas', 'washington', 'wisconsin'};
allLabels = {'course', 'faculty', 'student', 'research.project', 'other'};
clear label school;
I = [];
J = [];

% Read the page table: numeric page id, label name, school name.
pagePath = 'WebKB/webkb_old/pages.data';
pageFile = fopen(pagePath, 'r');
if pageFile < 0
    error('loadWebKBFull:cannotOpen', 'Could not open %s', pagePath);
end
% Output collection is deliberately not enabled: the indexing below
% (tokens{2}, tokens{3}) needs one output cell per format field.
tokens = textscan(pageFile, '%f\t%s\t%s\t');
fclose(pageFile);

id = tokens{1};
% label(k) / school(k) hold the index of page k's label / school in the
% lists above; ismember yields 0 for names that are not in the list.
[~, label(id)] = ismember(tokens{2}, allLabels);
[~, school(id)] = ismember(tokens{3}, schools);
%% Load the per-school dictionaries and build a shared vocabulary
% Reset outputs so stale workspace values from an earlier run cannot leak in.
clear words dicts;
dictPath = 'WebKB/webkb_old/words.uniq.data';
dictFile = fopen(dictPath, 'r');
if dictFile < 0
    error('loadWebKBFull:cannotOpen', 'Could not open %s', dictPath);
end
tokens = textscan(dictFile, '%f\t%s\t%s\t%f');
fclose(dictFile);

word_ids = tokens{1};
schoolDicts = {'wo_cornell', 'wo_texas', 'wo_washington', 'wo_wisconsin'};
% allWords is the sorted unique vocabulary; wordMap(k) is the position in
% allWords of the word on row k of the file.
[allWords, ~, wordMap] = unique(tokens{2});
[~, school_id] = ismember(tokens{3}, schoolDicts);

% dicts(w, i) maps word id w of dictionary i to its index in allWords.
% Preallocate so the table always has one column per dictionary; entries
% that are never assigned stay 0.
dicts = zeros(max([word_ids(:); 0]), numel(schools));
for i = 1:length(schools)
    inds = school_id == i;
    dicts(word_ids(inds), i) = wordMap(inds);
end
%% Load word occurrences and build the page-by-word matrices
wordPath = 'WebKB/webkb_old/wa.data';
wordFile = fopen(wordPath, 'r');
if wordFile < 0
    error('loadWebKBFull:cannotOpen', 'Could not open %s', wordPath);
end
% Output collection is deliberately not enabled: the indexing below
% (tokens{2}, tokens{3}, tokens{5}) needs one output cell per format field.
tokens = textscan(wordFile, '%f\t%f\t%f\t%s\t%s');
fclose(wordFile);

I = tokens{2};
J = tokens{3};
[~, school_id] = ismember(tokens{5}, schoolDicts);

% Translate each dictionary-specific word id into an index into allWords.
% Rows whose dictionary name is not in schoolDicts keep their raw id.
for i = 1:length(schools)
    inds = school_id == i;
    J(inds) = dicts(J(inds), i);
end

% Fix the matrix shape explicitly. The split section further down indexes
% rows with a mask as long as `school`, so every matrix must have at least
% that many rows even when the highest-numbered pages have no entries.
nDocs = max([numel(school); I(:)]);
nWords = max([numel(allWords); J(:)]);

% Xwo{i}: logical occurrence matrix restricted to entries of dictionary i.
clear Xwo;
for i = 1:length(schools)
    inds = school_id == i;
    Xwo{i} = sparse(I(inds), J(inds), true(nnz(inds),1), nDocs, nWords);
end
% X: logical occurrence matrix over all entries (duplicates collapse to true).
X = sparse(I, J, ones(size(I)), nDocs, nWords) > 0;
%% Print the words that occur in the most rows of X
counts = sum(X,1);
[~,inds] = sort(counts, 'descend');
fprintf('most commonly used words:\n');
% Show at most 20 words; a smaller vocabulary must not cause an index error.
nShow = min(20, numel(inds));
allWords(inds(1:nShow))
% Disabled: the same listing for each per-dictionary matrix Xwo{i}.
%
% for i = 1:4
%
% counts = sum(Xwo{i},1);
%
% [~,inds] = sort(counts, 'descend');
%
% fprintf('most commonly used words:\n');
% allWords(wordMap(inds(1:20)))
% end
%% Load the link table and build the adjacency matrix
linkPath = 'WebKB/webkb_old/links.data';
linkFile = fopen(linkPath, 'r');
if linkFile < 0
    error('loadWebKBFull:cannotOpen', 'Could not open %s', linkPath);
end
% Output collection is deliberately not enabled: the indexing below
% (tokens{2}, tokens{3}, tokens{4}) needs one output cell per format field.
tokens = textscan(linkFile, '%f\t%f\t%f\t%s');
fclose(linkFile);

I = tokens{2};
J = tokens{3};
[~, Sc] = ismember(tokens{4}, schools);

% A(p, q) counts the link rows with I == p and J == q. Make it square and
% at least as large as `school`, because the split section indexes it as
% A(inds, inds) with a mask of that length.
nPages = max([numel(school); I(:); J(:)]);
A = sparse(I, J, ones(size(I)), nPages, nPages);
%%
% counter(a, b) counts links whose endpoints carry labels a and b.
% NOTE(review): label is 0 for pages with an unmatched or missing label,
% which sparse rejects as an index - confirm the data never produces that.
nLabels = numel(allLabels);
counter = sparse(label(I), label(J), ones(size(I)), nLabels, nLabels);
%% Partition labels, links and word features by school
% Start from fresh containers so nothing from an earlier run survives.
clear wordsWo;
Y = cell(1, numel(schools));
cites = cell(1, numel(schools));
words = cell(1, numel(schools));
for i = 1:numel(schools)
    % Mask selecting the pages whose school code equals i.
    inds = (school == i);
    % Rows of each per-dictionary matrix j restricted to school i.
    for j = 1:numel(schools)
        wordsWo{j}{i} = Xwo{j}(inds, :);
    end
    words{i} = X(inds, :);      % word occurrences for these pages
    cites{i} = A(inds, inds);   % links with both endpoints in school i
    Y{i} = label(inds);         % label codes for these pages
end