-
Notifications
You must be signed in to change notification settings - Fork 0
/
scraper.js
190 lines (164 loc) · 5.13 KB
/
scraper.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
const { appendFile, mkdir, readFile, writeFile } = require("fs/promises");
const path = require("path");
/**
 * Finds the first `<tag ...>...</tag>` element in `text`.
 *
 * The tag name is interpolated into the pattern verbatim, and `.` does not
 * cross line breaks, so only single-line elements are found.
 *
 * @param {string} text - Markup to search.
 * @param {string} tag - Tag name, e.g. `h3`.
 * @returns {RegExpMatchArray|null} Match whose group 1 is the raw attribute
 *   text and group 2 the inner content, or `null` when nothing matches.
 */
function matchTag(text, tag) {
  const tagPattern = new RegExp(`<${tag}([^>]*)>(.*?)</${tag}>`);
  return text.match(tagPattern);
}
// Matches one "<name> (<type>) – <description><br/>" entry. The separator is
// accepted both as a literal en dash and as the `&ndash;` entity, in case the
// markup encodes the dash.
const argRegex = /<span class='[^']+name'>([^<]+)<\/span> <span class='argtype'>\(([^<]+)\)<\/span> (?:–|&ndash;) (.*?)<br\/>/g;

/**
 * Extracts argument / return-value entries from one paragraph of markup.
 *
 * @param {string} [paragraph=''] - Markup containing zero or more entries.
 * @returns {{name: string, desc: string, optional: boolean, type: string}[]}
 *   One record per entry; `type` has any `, optional` suffix removed and
 *   `optional` records whether that suffix was present.
 */
function parseArgs(paragraph = '') {
  // Decode escaped angle brackets. The previous code replaced '<' with '<'
  // and '>' with '>', which left the text unchanged.
  const decodedParagraph = paragraph.replaceAll('&lt;', '<').replaceAll('&gt;', '>');
  return [...decodedParagraph.matchAll(argRegex)].map(([, name, type, desc]) => {
    const optionalAt = type.indexOf(', optional');
    const isOptional = optionalAt > -1;
    return {
      name,
      desc,
      optional: isOptional,
      type: isOptional ? type.substring(0, optionalAt) : type,
    };
  });
}
/**
 * Extracts the code of the first `<pre class='example'>` element.
 *
 * @param {string} [paragraph=''] - Markup that may contain an example block.
 * @returns {string|null} The trimmed example source with `&lt;` / `&gt;`
 *   decoded, or `null` when the paragraph holds no example.
 */
function parseExample(paragraph = '') {
  // Decode escaped angle brackets. The previous code replaced '<' with '<'
  // and '>' with '>', which left the text unchanged.
  const decodedParagraph = paragraph.replaceAll('&lt;', '<').replaceAll('&gt;', '>');
  const m = decodedParagraph.match(/<pre class='example'>([\s\S]*?)<\/pre>/);
  return m ? m[1].trim() : null;
}
/**
 * Pulls `<table>` elements out of a piece of markup.
 *
 * Each extracted table is replaced in the text by a `${table:NAME}`
 * placeholder, where NAME is the first cell of the table's first row. The
 * remaining rows are stored under that name.
 *
 * @param {string} textInput - Markup that may contain tables.
 * @returns {{textPart: string, tablesPart: Object<string, string[][]>}}
 *   `textPart` is the trimmed text with placeholders; `tablesPart` maps each
 *   table name to its rows (arrays of trimmed cell strings, header row removed).
 */
function extractTables(textInput) {
  // Decode escaped angle brackets. The previous code replaced '<' with '<'
  // and '>' with '>', which left the text unchanged.
  const decodedTextInput = textInput.replaceAll('&lt;', '<').replaceAll('&gt;', '>');
  const tablesPart = {};
  const textResult = [];
  let pos = 0;
  // The closing pattern is deliberately lenient: it accepts `</table>`,
  // `<table>`, `<table/>` and `</table/>` as the end of a table.
  for (const match of decodedTextInput.matchAll(/<table[^>]*>([\s\S]*?)<\/?table\/?>/g)) {
    const rows = [...match[1].matchAll(/<tr>(.*?)<\/tr>/g)].map(([, row]) =>
      [...row.matchAll(/<td[^>]*>(.*?)<\/td>/g)].map(([, column]) =>
        // Normalise non-breaking spaces, as an entity or as the raw U+00A0
        // character. The previous pattern replaced a space with a space.
        column.replace(/&nbsp;|\u00a0/g, ' ').trim()
      )
    );
    const [header, ...body] = rows;
    const name = header?.[0];
    if (name === undefined) {
      // No header cell to name the table by. Leave the markup in the text
      // instead of throwing on `table[0][0]`.
      continue;
    }
    textResult.push(decodedTextInput.substring(pos, match.index));
    pos = match.index + match[0].length;
    textResult.push(`\${table:${name}}`);
    tablesPart[name] = body;
  }
  textResult.push(decodedTextInput.substring(pos));
  return {
    textPart: textResult.join('').trim(),
    tablesPart,
  };
}
/**
 * Re-flows a description so that lines are at most 80 characters where
 * possible.
 *
 * The text is split on any run of whitespace, so existing line and paragraph
 * breaks are discarded. A single word longer than the limit is kept whole on
 * its own line.
 *
 * @param {string} description - Text to wrap.
 * @returns {string} The wrapped text, lines joined with `\n`.
 */
function formatDescription(description) {
  const MAX_WIDTH = 80;
  const lines = [];
  let line = "";

  // An empty line is only recorded once some text has already been emitted;
  // this keeps an over-long first word from producing a leading blank line.
  const flush = () => {
    if (lines.length > 0 || line !== "") {
      lines.push(line);
    }
  };

  description.split(/\s+/).forEach((word) => {
    // The +1 budgets for the separating space, even on an empty line.
    const fits = line.length + word.length + 1 <= MAX_WIDTH;
    if (!fits) {
      flush();
      line = word;
      return;
    }
    line = line === "" ? word : `${line} ${word}`;
  });

  if (line !== "") {
    flush();
  }
  return lines.join("\n");
}
/**
 * Parses one function section of the API page.
 *
 * The section is split on `<p>`: chunk 1 is read as the arguments, chunk 2 as
 * the return values, chunk 3 as the description (chunk 4 is appended when the
 * section has more than five chunks), and the last chunk is searched for the
 * example.
 *
 * @param {string} text - Markup of a single section.
 * @returns {{name: string, arguments: object[], returns: object[],
 *   examples: Array<string|null>, description: string, tables: object}|undefined}
 *   The parsed function, or `undefined` when the section has no `<h3>` heading.
 */
function parseFunction(text) {
  const name = matchTag(text, 'h3');
  if (!name) return;
  const paragraphs = text.split('<p>');
  // A short section would otherwise throw on `undefined.trim()` and abort the
  // whole scrape, so missing chunks are treated as empty.
  const [, argsPart = '', returnsPart = '', descriptionPart = ''] = paragraphs;
  let rawDescription = descriptionPart.trim();
  if (paragraphs.length > 5) {
    rawDescription += `\n\n${paragraphs[4].trim()}`;
  }
  const { textPart, tablesPart } = extractTables(rawDescription);
  return {
    name: name[2],
    arguments: parseArgs(argsPart),
    returns: parseArgs(returnsPart),
    // Always a one-element array; the element is null when no example is found.
    examples: [parseExample(paragraphs[paragraphs.length - 1])],
    description: formatDescription(textPart),
    tables: tablesPart,
  };
}
/**
 * Parses one category section of the API page.
 *
 * The section is split on `<p>`. Chunks containing an `<a>` element contribute
 * the targets of their `href='#...'` links as entries; every other chunk
 * contributes description text and tables.
 *
 * @param {string} text - Markup of a single section.
 * @returns {{name: string, description: string, tables: object,
 *   entries: string[]}|undefined} The parsed category, or `undefined` when
 *   the section has no `<h2>` heading.
 */
function parseCategory(text) {
  const heading = matchTag(text, 'h2');
  if (!heading) return;

  const descriptionParts = [];
  const entries = [];
  let tables = {};

  // The chunk before the first `<p>` is skipped.
  const sections = text.split('<p>').slice(1);
  sections.forEach((section) => {
    if (matchTag(section, 'a')) {
      const anchors = section.matchAll(/<a href='#([^']*)'/g);
      entries.push(...Array.from(anchors, (hit) => hit[1]));
      return;
    }
    const extracted = extractTables(section);
    tables = { ...tables, ...extracted.tablesPart };
    if (extracted.textPart !== '') {
      descriptionParts.push(extracted.textPart);
    }
  });

  return {
    name: heading[2],
    description: descriptionParts.join('\n\n'),
    tables,
    entries,
  };
}
/**
 * Downloads the API page and parses it into categories and functions.
 *
 * The page is split on `<hr/>`; each part is tried as a category first and as
 * a function otherwise. Parts that are neither are ignored.
 *
 * @param {string} url - Address of the API page.
 * @returns {Promise<{version: string, categories: object[], functions: object[]}>}
 * @throws {Error} When the request does not succeed or the page has no
 *   `<h1>...(x.y.z)</h1>` version heading.
 */
async function scrapeAPI(url) {
  const response = await fetch(url);
  // fetch() resolves for HTTP error statuses too; without this check an error
  // page would be parsed as if it were the API documentation.
  if (!response.ok) {
    throw new Error(`Request for ${url} failed with HTTP ${response.status} ${response.statusText}`);
  }
  const data = await response.text();

  const versionMatch = data.match(/<h1>.*?\(([\d\.]+)\)<\/h1>/);
  if (!versionMatch) {
    throw new Error(`Could not find the API version heading in the page fetched from ${url}`);
  }
  const version = versionMatch[1];

  const categories = [];
  const functions = [];
  for (const part of data.split("<hr/>")) {
    const category = parseCategory(part);
    if (category) {
      category.entries.sort();
      categories.push(category);
      continue;
    }
    const func = parseFunction(part);
    if (func) {
      functions.push(func);
    }
  }
  return {
    version,
    categories,
    functions,
  };
}
/**
 * Writes the scraped data below `root`.
 *
 * Produces a `version` file and `teardown_api2.json` (version plus functions).
 * When `GITHUB_OUTPUT` is set, a `version=<version>` line is appended to that
 * file. If the stored version already equals `data.version`, the process
 * logs "Up to date" and exits with status 1 without writing anything.
 *
 * @param {string} root - Output directory; created if it does not exist.
 * @param {{version: string, functions: object[], name?: string}} data - Scrape result.
 * @returns {Promise<void>}
 */
async function outputData(root, data) {
  const versionFile = path.join(root, "version");
  // A missing or unreadable version file is treated as "no local version".
  const localVersion = (await readFile(versionFile).catch(() => "")).toString();
  if (localVersion === data.version) {
    console.log("Up to date");
    // NOTE(review): a non-zero status for "nothing to do" looks intentional
    // (presumably so a CI job stops here) — confirm before changing.
    process.exit(1);
  }
  if (process.env.GITHUB_OUTPUT) {
    // Entries in this file are line-based; without the trailing newline the
    // next appended entry would be glued onto this one.
    await appendFile(process.env.GITHUB_OUTPUT, `version=${data.version}\n`);
  }
  // The first run would otherwise fail with ENOENT when `root` is missing.
  await mkdir(root, { recursive: true });
  await writeFile(versionFile, data.version || data.name);
  // Save only the functions in a single JSON file
  const combinedData = {
    version: data.version,
    functions: data.functions,
  };
  const fileName = path.join(root, 'teardown_api2.json');
  await writeFile(fileName, JSON.stringify(combinedData, null, 2));
}
exports.scrapeAPI = scrapeAPI;
exports.outputData = outputData;

const root = './output'; // Change this to the desired output directory
const url = 'https://teardowngame.com/modding/api.html'; // The URL of the Teardown API site

// Entry point: scrape the page and write the results. This runs whenever the
// file is loaded, including when it is loaded through `require`.
(async () => {
  try {
    const data = await scrapeAPI(url);
    await outputData(root, data);
    console.log('Scraping completed successfully.');
  } catch (error) {
    console.error('Error while scraping:', error);
    // Report the failure to the caller; previously a failed scrape still
    // ended with exit status 0.
    process.exitCode = 1;
  }
})();