-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpagerank.py
94 lines (77 loc) · 2.69 KB
/
pagerank.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
import os
import random
import re
import sys
DAMPING = 0.85
SAMPLES = 10000
def main():
if len(sys.argv) != 2:
sys.exit("Usage: python pagerank.py corpus")
corpus = crawl(sys.argv[1])
ranks = sample_pagerank(corpus, DAMPING, SAMPLES)
print(f"PageRank Results from Sampling (n = {SAMPLES})")
for page in sorted(ranks):
print(f" {page}: {ranks[page]:.4f}")
ranks = iterate_pagerank(corpus, DAMPING)
print(f"PageRank Results from Iteration")
for page in sorted(ranks):
print(f" {page}: {ranks[page]:.4f}")
def crawl(directory):
pages = dict()
for filename in os.listdir(directory):
if not filename.endswith(".html"):
continue
with open(os.path.join(directory, filename)) as f:
contents = f.read()
links = re.findall(r"<a\s+(?:[^>]*?)href=\"([^\"]*)\"", contents)
pages[filename] = set(links) - {filename}
for filename in pages:
pages[filename] = set(
link for link in pages[filename]
if link in pages
)
return pages
def transition_model(corpus, page, damping_factor):
prob_dist = {}
pages = corpus.keys()
linked_pages = corpus[page]
if linked_pages:
for p in pages:
prob_dist[p] = (1 - damping_factor) / len(pages)
for link in linked_pages:
prob_dist[link] += damping_factor / len(linked_pages)
else:
for p in pages:
prob_dist[p] = 1 / len(pages)
return prob_dist
def sample_pagerank(corpus, damping_factor, n):
pagerank = {page: 0 for page in corpus}
page = random.choice(list(corpus.keys()))
for _ in range(n):
pagerank[page] += 1
model = transition_model(corpus, page, damping_factor)
page = random.choices(list(model.keys()), list(model.values()))[0]
pagerank = {page: rank / n for page, rank in pagerank.items()}
return pagerank
def iterate_pagerank(corpus, damping_factor):
N = len(corpus)
pagerank = {page: 1 / N for page in corpus}
new_pagerank = pagerank.copy()
converged = False
while not converged:
converged = True
for page in corpus:
total = (1 - damping_factor) / N
for p in corpus:
if page in corpus[p]:
total += damping_factor * pagerank[p] / len(corpus[p])
if not corpus[p]:
total += damping_factor * pagerank[p] / N
new_pagerank[page] = total
for page in pagerank:
if abs(new_pagerank[page] - pagerank[page]) > 0.001:
converged = False
pagerank = new_pagerank.copy()
return pagerank
if __name__ == "__main__":
main()