app.py
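"""Mastodon bot that toots machine-generated roller derby names.

The script scrapes registered skater names from public rosters, keeps track of
the names it has already tooted, generates candidate names with a trained
textgenrnn model, filters out registered, already-used, too-short and plain
dictionary-word candidates, and then toots one randomly chosen name.
"""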
import os
import random
from time import sleep
import json
import string
import pprint
from itertools import chain
from pathlib import Path
from mastodon import Mastodon
from decouple import config
from inscriptis import get_text
import nltk
API_BASE_URL = config('API_BASE_URL', default='https://mastodon.social')
ACCESS_TOKEN = config('ACCESS_TOKEN')
ACTUALLY_TOOT = config('ACTUALLY_TOOT', default=False, cast=bool)
MODEL_NAME = config('MODEL_NAME', default='derbynames')
NAME_BUFFER_SIZE = config('NAME_BUFFER_SIZE', default=100, cast=int)
DEFAULT_TEMP = config('DEFAULT_TEMP', default=1.0, cast=float)
USE_RANDOM_TEMPS = config('USE_RANDOM_TEMPS', default=True, cast=bool)
MIN_TEMP = config('MIN_TEMP', default=0.2, cast=float)
MAX_TEMP = config('MAX_TEMP', default=1.0, cast=float)
DO_WAIT = config('DO_WAIT', default=False, cast=bool)
MIN_WAIT = config('MIN_WAIT', default=10, cast=int)
MAX_WAIT = config('MAX_WAIT', default=300, cast=int)
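
# The settings above are read with python-decouple, so they can come from
# environment variables or from a .env file next to this script. A minimal
# (hypothetical) .env might look like the following -- only ACCESS_TOKEN has
# no default, and the token value below is a placeholder:
#
#   API_BASE_URL=https://mastodon.social
#   ACCESS_TOKEN=your-mastodon-access-token
#   ACTUALLY_TOOT=True
#   DO_WAIT=True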
registered_names_filename = 'data/registered_names.json'
registered_names_file = Path(registered_names_filename)
generated_names_filename = 'data/generated_names.json'
generated_names_file = Path(generated_names_filename)
used_names_filename = 'data/used_names.json'
used_names_file = Path(used_names_filename)

# Word list used to reject generated names that are just plain dictionary words.
nltk.download('words')
dictionary_words = nltk.corpus.words.words()


def is_dictionary_word(word):
    return word in dictionary_words or word.lower().strip() in dictionary_words


def download_registered_names():
    from bs4 import BeautifulSoup
    import requests

    name_set = set()
    session = requests.Session()

    url1 = "https://www.twoevils.org/rollergirls/"
    print("Downloading names from %s" % url1)
    r1 = session.get(url1)
    soup1 = BeautifulSoup(r1.text, "lxml")
    rows1 = soup1.find_all('tr', {'class': ['trc1', 'trc2']})
    for idx, row in enumerate(rows1):
        td = row.find('td')
        name_set.add(td.get_text())

    url2 = "http://www.derbyrollcall.com/everyone"
    print("Downloading names from %s" % url2)
    r2 = session.get(url2)
    soup2 = BeautifulSoup(r2.text, "lxml")
    rows2 = soup2.find_all('td', {'class': 'name'})
    for idx, td in enumerate(rows2):
        name_set.add(td.get_text())

    initial_letters = string.ascii_uppercase
    # Loop through initial letters (A-Z)
    for letter in initial_letters:
        url3 = "https://rollerderbyroster.com/view-names/?ini={}".format(
            letter)
        print("Downloading names from {}".format(url3))
        r3 = session.get(url3)
        d3 = r3.text
        soup3 = BeautifulSoup(d3, "lxml")
        rows3 = soup3.find_all('ul')
        # Use only last unordered list - this is where names are!
        for idx, li in enumerate(rows3[-1]):
            # Name should be the text of the link within the list item
            name = li.find('a').get_text()
            name_set.add(name)

    name_list = list(name_set)
    print("Writing %s names to %s..." %
          (len(name_list), registered_names_file))
    registered_names_file.write_text(json.dumps(name_list))


def download_used_names(mastodon):
    downloaded_names = list()
    account_id = mastodon.account_verify_credentials()['id']
    statuses = mastodon.account_statuses(account_id, exclude_replies=True)
    names = [get_text(s.content).strip() for s in statuses]
    downloaded_names.extend(names)
    print("Downloaded {} used names...".format(len(downloaded_names)))
    # Follow Mastodon.py's pagination until there are no more pages
    next_page = mastodon.fetch_next(statuses)
    while next_page:
        names = [get_text(s.content).strip() for s in next_page]
        downloaded_names.extend(names)
        print("Downloaded {} used names...".format(len(downloaded_names)))
        next_page = mastodon.fetch_next(next_page)
    # Write once after all pages are fetched, so the file exists even when
    # there is only a single page of statuses
    used_names_file.write_text(json.dumps(downloaded_names))
    print("Saved {} used names to {}".format(
        len(downloaded_names), used_names_file))


def generate_new_names(skip_names=[], model_name=MODEL_NAME, batch_size=10, random_temps=True):
    from textgenrnn import textgenrnn
    textgen = textgenrnn(weights_path='model/{}_weights.hdf5'.format(model_name),
                         vocab_path='model/{}_vocab.json'.format(model_name),
                         config_path='model/{}_config.json'.format(model_name))
    temperature = []
    if random_temps is True:
        for i in range(batch_size):
            temp = round(random.uniform(MIN_TEMP, MAX_TEMP), 1)
            temperature.append(temp)
    else:
        temperature.append(DEFAULT_TEMP)
    new_names = textgen.generate(batch_size,
                                 temperature=temperature, return_as_list=True)
    # Discard used, short, and plain dictionary-word names
    unused_names = [n.strip() for n in new_names
                    if len(n.strip()) > 3
                    and n not in skip_names
                    and not is_dictionary_word(n)]
    return unused_names
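
# Example (hypothetical) call, assuming trained model files exist at
# model/derbynames_weights.hdf5, model/derbynames_vocab.json and
# model/derbynames_config.json (the paths built above from MODEL_NAME):
#
#   candidates = generate_new_names(skip_names=used_names + registered_names,
#                                   batch_size=25)
#
# batch_size=25 is an arbitrary illustration; main() below only passes
# skip_names and keeps the default batch size.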


def main():
    registered_names = list()
    if not registered_names_file.is_file():
        download_registered_names()
    registered_names = sorted(json.loads(registered_names_file.read_text()))
    print("Loaded %s existing names from %s" %
          (len(registered_names), registered_names_file))

    mastodon = Mastodon(
        access_token=ACCESS_TOKEN,
        api_base_url=API_BASE_URL,
        ratelimit_method='pace'
    )
    print("Logging on to %s..." % API_BASE_URL)

    used_names = list()
    if not used_names_file.is_file():
        print("Used names not found, downloading...")
        download_used_names(mastodon)
    used_names = json.loads(used_names_file.read_text())
    print("Loaded %s existing names from %s" %
          (len(used_names), used_names_file))

    skip_names = used_names + registered_names

    generated_names = list()
    if generated_names_file.is_file():
        json_data = generated_names_file.open().read()
        # print(json_data)
        generated_names = list(set(json.loads(json_data)))
        print("Loaded %s existing names from %s" %
              (len(generated_names), generated_names_file))
        print("Filtering generated names...")
        generated_names = [
            n for n in generated_names if n not in skip_names]
        generated_names.sort()

    if len(generated_names) < NAME_BUFFER_SIZE:
        new_names = generate_new_names(
            skip_names=skip_names)
        print("Generated names: {}".format(new_names))
        generated_names.extend(new_names)

    print("%s generated names ready!" % len(generated_names))
    chosen_name = random.choice(generated_names)
    print("Chosen name: {}".format(chosen_name))

    if DO_WAIT is True:
        sleep_time = random.randint(MIN_WAIT, MAX_WAIT)
        print("Waiting {} seconds...".format(sleep_time))
        sleep(sleep_time)

    if ACTUALLY_TOOT is True:
        print("Tooting name: {}".format(chosen_name))
        mastodon.toot(chosen_name)

    generated_names.remove(chosen_name)
    generated_names_file.write_text(json.dumps(generated_names))
    used_names.append(chosen_name)
    used_names_file.write_text(json.dumps(used_names))


if __name__ == "__main__":
    main()
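
# Typical invocation (assuming the dependencies below; the repository may pin
# them differently):
#
#   pip install Mastodon.py python-decouple inscriptis nltk textgenrnn
#   pip install requests beautifulsoup4 lxml   # only needed by the scrapers
#   python app.py
#
# A trained textgenrnn model is expected under model/ (see MODEL_NAME above),
# and ACCESS_TOKEN must be provided via the environment or a .env file.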