forked from prust/wikipedia-movie-data
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathindex.js
211 lines (172 loc) · 5.91 KB
/
index.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
let fs = require('fs');
let promisify = require('util').promisify;
let request = require('request');
request = promisify(request);
let cheerio = require('cheerio');
// module-level cheerio handle; toArray() calls it to wrap the raw nodes it iterates over
let $ = cheerio;

// lookup table consulted by cleanGenres(): lowercased genre text -> replacement genre(s)
let genre_replacements = JSON.parse(fs.readFileSync('genre-replacements.json'));

// lowercase-to-proper-case whitelist
let whitelist = {};
JSON.parse(fs.readFileSync('genres.json')).forEach(function(genre) {
  whitelist[genre.toLowerCase()] = genre;
});

// the years whose "List of American films of <year>" pages get scraped
let years = [];
for (let year = 2020; year <= 2025; year++)
  years.push(year);

// genre strings that cleanGenres() could not map to anything.
// Declared explicitly: this used to be an implicit global, which throws a
// ReferenceError as soon as the file runs in strict mode or as an ES module.
let invalid_genres = {};

// main() is async; report a failure and exit non-zero instead of leaving the promise floating
main().catch(function(err) {
  console.error(err);
  process.exitCode = 1;
});
/**
 * Entry point: scrapes the film lists for every year in `years`, then enriches each
 * movie with the extract and thumbnail from Wikipedia's REST page-summary endpoint,
 * and finally writes the whole list to `movies-2020s.json`.
 *
 * @returns {Promise<void>}
 * @throws {Error} if a summary request returns a status other than 200 or 404
 */
async function main() {
  let movies = [];
  for (let year of years) {
    movies = movies.concat(await scrapeMoviesForYear(year));
  }

  for (let movie of movies) {
    // Movies without a usable wikipedia link carry a null/undefined href. Interpolating
    // that into the URL would request the summary of the page literally named
    // "null"/"undefined" and could attach an unrelated extract, so skip them.
    if (!movie.href)
      continue;

    // small pause between requests so we don't hammer the API
    await timeout(50);
    let res = await request(`https://en.wikipedia.org/api/rest_v1/page/summary/${movie.href}`);

    // no summary available for this page: keep the movie, just without extract/thumbnail
    if (res.statusCode === 404) {
      continue;
    }
    if (res.statusCode !== 200) {
      throw new Error('wikipedia returned an error response: ' + res.statusCode);
    }

    let summary_data = JSON.parse(res.body);
    movie.extract = summary_data.extract;
    if (summary_data.thumbnail) {
      movie.thumbnail = summary_data.thumbnail.source;
      movie.thumbnail_width = summary_data.thumbnail.width;
      movie.thumbnail_height = summary_data.thumbnail.height;
    }

    // progress log: first sentence of the extract (a summary may come back without one)
    let first_sentence = (movie.extract || '').split('.')[0];
    console.log(`${movie.title}: "${first_sentence}"`);
  }

  fs.writeFileSync('movies-2020s.json', JSON.stringify(movies, null, 2), { encoding: 'utf8' });
}
/**
 * Downloads Wikipedia's "List of American films of <year>" page and turns every film
 * row of its `table.wikitable` tables into a plain record.
 *
 * @param {number} year - the year whose list page is scraped
 * @returns {Promise<Array<{title: string, year: number, cast: string[], genres: string[], href: (string|null|undefined)}>>}
 *   one record per film row; `href` is the link target with "/wiki/" removed, `null` for
 *   "action=edit" placeholder links, and `undefined` when the title cell has no link
 * @throws {Error} on a non-200 response, when no wikitable is found, or when a row
 *   starts with three date-like cells
 */
async function scrapeMoviesForYear(year) {
  // pause before every list page so wikipedia doesn't hate us for slamming their servers
  await timeout(1000);

  console.log('loading movies from ' + year);
  const list_url = 'https://en.wikipedia.org/wiki/List_of_American_films_of_' + year;
  const response = await request(list_url);
  if (response.statusCode != 200) {
    throw new Error('wikipedia returned an error response: ' + response.statusCode);
  }

  const html = response.body;
  const $ = cheerio.load(html);
  const wikitables = $('table.wikitable');
  if (!wikitables.length) {
    // dump the page to help diagnose what came back instead of the expected tables
    console.log(html);
    throw new Error('Did not find a table w/ class "wikitable" in Wikipedia\'s response');
  }

  const found = [];
  wikitables.each((table_ix, table) => {
    const rows = $(table).find('tr');
    const heading_text = $(rows[0]).text().toLowerCase();

    // 'Top-grossing Films' tables are not film lists (their heading row includes "Gross")
    if (heading_text.includes('gross')) {
      return;
    }

    // the 2003 page lists films twice (an alphabetical full-year table plus opening-date
    // season/quarter tables); for that year, tables whose heading mentions "opening"
    // are dropped to avoid dupes
    if (year == 2003 && heading_text.includes('opening')) {
      return;
    }

    rows.each((row_ix, row) => {
      // row 0 only holds the column headings
      if (row_ix == 0) {
        return;
      }

      const cells = $(row).find('td');

      // the title may be preceded by up to two date cells (month / day rowspans)
      let title_cell = $(cells[0]);
      for (let next_ix = 1; next_ix <= 2 && isDateCell(title_cell); next_ix++) {
        title_cell = $(cells[next_ix]);
      }
      if (isDateCell(title_cell)) {
        throw new Error('Unexpected: a 3 cells in a row with rowspans');
      }

      // rows that carry nothing but rowspan cells have no title; ignore them
      if (!title_cell.text().trim()) {
        return;
      }

      title_cell.find('.sortkey').remove();

      // cast is two cells after the title (one column in between is skipped), genre follows cast
      const cast_cell = title_cell.next().next();
      const genre_cell = cast_cell.next();

      let href = title_cell.find('a').attr('href');
      if (href) {
        if (href.includes('action=edit')) {
          // placeholder link to a wikipedia page that does not exist yet
          href = null;
        }
        else {
          // regular article link: keep only the part after "/wiki/"
          assert(href.includes('/wiki/'), `Expected "${href}" to include "/wiki/"`);
          href = href.replace('/wiki/', '');
        }
      }

      const record = {
        title: title_cell.text().trim(),
        year: year,
        cast: toArray(cast_cell),
        genres: cleanGenres(toArray(genre_cell), year),
        href: href
      };

      console.log(`${record.year} ${record.genres.join(',')} (${record.cast.join(', ')}) "${record.title}"`);
      found.push(record);
    });
  });

  return found;
}
/**
 * Heuristic for the month/day cells that precede a film's title cell: a cell counts as a
 * date cell when it has a `rowspan` attribute, or when its `style` or `align` attribute
 * contains "center".
 *
 * @param cell - object exposing `attr(name)` (a cheerio selection in this file)
 * @returns a truthy value for date cells, a falsy one otherwise (not necessarily a boolean)
 */
function isDateCell(cell) {
  // falsy when the attribute is missing/empty, otherwise whether it contains "center"
  const mentionsCenter = (attr_name) => {
    const value = cell.attr(attr_name);
    return value && value.includes('center');
  };

  return cell.attr('rowspan') || mentionsCenter('style') || mentionsCenter('align');
}
/**
 * Normalizes raw genre strings scraped from a table cell.
 *
 * Each genre is lowercased and then resolved in this order:
 *   1. `whitelist` hit -> the properly-cased genre is used
 *   2. `genre_replacements` hit -> the replacement genre(s) are used
 *   3. otherwise the text is split on space, "-", "–", "/" and "." and every piece is
 *      cleaned recursively; a piece that cannot be split any further is recorded in
 *      `invalid_genres` and dropped from the result
 *
 * @param {string[]} genres - raw genre strings
 * @param {number} year - only passed along to the recursive calls
 * @returns {Array} the cleaned genres, in input order (duplicates are not removed)
 */
function cleanGenres(genres, year) {
  // The lookup tables are plain objects and the keys are scraped text, so only own
  // properties may count: otherwise a "genre" such as "constructor" or "__proto__"
  // would match an inherited member and put a non-string into the result.
  const lookup = function(table, key) {
    return Object.prototype.hasOwnProperty.call(table, key) ? table[key] : undefined;
  };

  let cleaned_genres = [];
  genres.forEach(function(raw_genre) {
    const genre = raw_genre.toLowerCase();

    const proper_name = lookup(whitelist, genre);
    if (proper_name) {
      cleaned_genres.push(proper_name);
      return;
    }

    const replacement = lookup(genre_replacements, genre);
    if (replacement) {
      cleaned_genres = cleaned_genres.concat(replacement);
      return;
    }

    // compound genres ("action-comedy", "crime/drama"): try each part on its own
    const parts = genre.split(/ |-|–|\/|\./);
    if (parts.length > 1) {
      cleaned_genres = cleaned_genres.concat(cleanGenres(parts, year));
      return;
    }

    // nothing matched and nothing left to split: remember it as unrecognized
    invalid_genres[genre] = true;
  });
  return cleaned_genres;
}
/**
 * Flattens the contents of a table cell into a list of trimmed text entries.
 *
 * The text of every child node is split on newlines, ",", ";" and "/". Empty pieces and
 * lone "-" pieces are dropped, and a "Jr." piece is glued back onto the previous entry
 * (the comma in "Name, Jr." would otherwise split it off). Whenever a piece mentions
 * "director" or "screenplay", everything collected so far is thrown away.
 *
 * @param cell - object exposing `contents().each(fn)` (a cheerio selection in this file);
 *   a falsy value yields an empty array
 * @returns {string[]} the collected entries
 */
function toArray(cell) {
  let arr = [];
  if (!cell)
    return arr;

  cell.contents().each(function(ix, el) {
    let text_parts = $(el).text().trim().split(/\n|,|;|\//);
    for (let part of text_parts) {
      let text = part.trim();
      if (!text || text === '-')
        continue;

      if (text === 'Jr.') {
        // only append when there is a previous entry; with an empty list the old code
        // wrote "undefined, Jr." to arr[-1], leaving a stray property on the array
        if (arr.length)
          arr[arr.length - 1] += ', Jr.';
        continue;
      }

      // don't include directors or screenwriters (it gets complicated to parse the
      // different ways they're listed; they're always at the top of the list)
      if (text.includes('director') || text.includes('screenplay')) {
        arr = [];
        continue;
      }

      arr.push(text);
    }
  });
  return arr;
}
function assert(val, msg) {
if (!val) {
throw new Error(msg || 'Assertion failed');
}
}
/**
 * Promise-based sleep.
 *
 * @param {number} ms - delay handed to setTimeout
 * @returns {Promise<undefined>} resolves (with no value) once the timer fires
 */
async function timeout(ms) {
  await new Promise((done) => setTimeout(done, ms));
}