-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy path00_get_wikipedia_words.py
80 lines (61 loc) · 2.41 KB
/
00_get_wikipedia_words.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
import urllib.request
import urllib.parse
import json
import re
import random
import os
from collections import Counter
# Stop sampling articles once this many word tokens have been collected.
WORDCOUNT_GOAL = 10000000
# Token separator: runs of anything that is not a word character, digit,
# apostrophe, or hyphen.  Raw string so the backslashes reach the regex
# engine verbatim instead of triggering invalid-escape warnings.
WORD_REGEX = re.compile(r"[^\w\d\'\-]+")
# Articles fetched earlier are cached on disk under cache/ (one file per
# page, written by getPageText).  Shuffle so repeated runs consume the
# cache in a different order.  NOTE(review): assumes the cache/ directory
# already exists — listdir raises FileNotFoundError otherwise.
cache_listing = os.listdir("cache")
random.shuffle(cache_listing)
# Number of articles consumed so far (cache hits and fresh fetches alike).
pagecount = 0
# Buffer of random article titles fetched in batches by getPage().
pages = []
def getPage():
    """Return the title of one random Wikipedia article.

    Titles come from the MediaWiki random generator, fetched in batches
    of 100 and buffered in the module-level ``pages`` list; each call
    pops one title and the buffer is refilled when it runs empty.
    """
    global pages
    if not pages:
        # Buffer exhausted -- fetch another batch of 100 random main-namespace titles.
        response = json.loads(urllib.request.urlopen("https://en.wikipedia.org/w/api.php?format=json&action=query&generator=random&grnnamespace=0&grnlimit=100").read())
        # The API keys pages by id; only the titles are needed here.
        pages += [page["title"] for page in response["query"]["pages"].values()]
    return pages.pop()
def getPageText(title):
    """Fetch the plain-text extract of the article *title*, lowercased.

    The text is also written to cache/<pageid>.txt so later runs can
    reuse it without hitting the API.  Returns the lowercased text.
    """
    print(title)
    response = json.loads(urllib.request.urlopen("https://en.wikipedia.org/w/api.php?action=query&titles=" + urllib.parse.quote(title) + "&prop=extracts&format=json&explaintext=1&exsectionformat=plain&redirects=1").read())
    # Responses key pages by id; with a single title there is exactly one entry.
    page = next(iter(response["query"]["pages"].values()))
    text = page["extract"].lower()
    # Explicit UTF-8 so the cache round-trips regardless of platform default encoding.
    with open("cache/{}.txt".format(page["pageid"]), "w", encoding="utf-8") as cache:
        cache.write(text)
    return text
def splitIntoWords(text):
    """Tokenize *text* into words using the module-level WORD_REGEX separator."""
    tokens = WORD_REGEX.split(text)
    return tokens
def getWordsInRandomArticle():
    """Return the word tokens of one article, preferring the on-disk cache.

    Pops one file from the shuffled ``cache_listing`` when any remain;
    otherwise fetches a fresh random article from the API (which also
    caches it).  Increments the module-level ``pagecount`` either way.
    """
    global pagecount
    pagecount += 1
    if cache_listing:
        cache_file = cache_listing.pop()
        print(cache_file)
        # Cache files are written as UTF-8 by getPageText; read them back the same way.
        with open("cache/" + cache_file, "r", encoding="utf-8") as f:
            return splitIntoWords(f.read())
    return splitIntoWords(getPageText(getPage()))
# Load the ENABLE dictionary word list; only tokens present in it are
# written to the output CSV.
with open("02_enable.txt", "r", encoding="utf-8") as enable_file:
    enable = enable_file.read().splitlines()
# These words are used abnormally often on Wikipedia, so exclude them
blacklist = {"external", "links", "references", "bibliography"}
# Set membership makes the per-word filter below O(1).
enable = {item for item in enable if item not in blacklist}

# Accumulate tokens from random articles until the sample is big enough.
words = []
while len(words) < WORDCOUNT_GOAL:
    words += getWordsInRandomArticle()

frequencies = Counter(words)
by_frequency = frequencies.most_common()
wordcount_total = len(words)
wordcount = 0
# Rows: absolute count, relative frequency, word -- most frequent first.
# `with` guarantees the file is closed even if a write fails.
with open("01_frequencies_wikipedia.csv", "w", encoding="utf-8") as csv_file:
    for word, freq in by_frequency:
        if word in enable:
            csv_file.write("{},{:.10f},{}\n".format(freq, freq / wordcount_total, word))
            wordcount += 1
print("WORDS: {} (UNIQUE: {}, IN DICTIONARY: {})".format(wordcount_total, len(by_frequency), wordcount))
print("PAGES: {}".format(pagecount))