forked from wesbos/wesbos
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathscrape-tips.js
132 lines (119 loc) · 4.08 KB
/
scrape-tips.js
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
import fetch from 'isomorphic-fetch';
import { promises as fs } from 'fs';
import { Html5Entities } from 'html-entities';
import replaceAll from 'string.prototype.replaceall';
import jsdom, { JSDOM } from 'jsdom';
import FileType from 'file-type';
import getSlug from 'speakingurl';
import tips from './tips';
import urlExpander from 'expand-url';
import { promisify } from 'util';
// Promise-returning wrapper around the callback-style `urlExpander.expand`.
const expand = promisify(urlExpander.expand);
// NOTE(review): this flag is not read anywhere in the code visible in this
// file; presumably it was meant to toggle media downloads — TODO confirm.
const DLIMAGES = false;
// Earlier, looser pattern kept for reference (it did not require a scheme or `www.` prefix):
// const expression = /[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-zA-Z0-9()]{1,6}\b([-a-zA-Z0-9()@:%_\+.~#?&//=]*)?/gi;
// Matches links that start with an http(s)/ftp/file scheme or a `www.` / `ftp.`
// prefix. Flags: case-insensitive, global, multiline.
const expression = /(?:(?:https?|ftp|file):\/\/|www\.|ftp\.)(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[-A-Z0-9+&@#\/%=~_|$?!:,.])*(?:\([-A-Z0-9+&@#\/%=~_|$?!:,.]*\)|[A-Z0-9+&@#\/%=~_|$])/igm;
// Copy of `expression` (same source and flags). Because it is global,
// `String.prototype.match` returns every link found in a text.
const linkRegex = new RegExp(expression);
/**
 * Returns the last `/`-separated segment of a path or URL.
 *
 * @param {string} path - Path or URL of a media file.
 * @returns {string} Everything after the final `/` (the whole input when it
 *   contains no `/`, an empty string when it ends with `/`).
 */
function getImageName(path) {
  const segments = path.split('/');
  const lastIndex = segments.length - 1;
  return segments[lastIndex];
}
/**
 * Extracts media URLs from the HTML of a tweet page.
 *
 * Images are the `src` of every `<img>` inside `.AdaptiveMedia-container`.
 * Videos are derived from the inline `background-image` thumbnail of every
 * element carrying `data-playable-media-url`: the `url(...)` wrapper (quoted
 * or not) is removed, `.jpg` becomes `.mp4` and `tweet_video_thumb` becomes
 * `tweet_video`.
 *
 * Elements without a usable source are skipped so callers never receive an
 * empty or malformed path to download.
 *
 * @param {string} content - HTML markup of the page.
 * @returns {{ images: string[], videos: string[] }} Media URLs found in the page.
 */
function findImagePaths(content) {
  const { document } = new JSDOM(content).window;
  const imgs = document.querySelectorAll('.AdaptiveMedia-container img');
  const videos = document.querySelectorAll('[data-playable-media-url]');
  // Captures the URL inside `url(...)`, tolerating optional matching quotes.
  const cssUrl = /^url\(\s*(['"]?)(.*?)\1\s*\)$/;

  return {
    images: Array.from(imgs, img => img.src).filter(Boolean),
    videos: Array.from(videos, video =>
      cssUrl.exec(String(video.style['background-image']).trim())
    )
      // Drop elements whose background is missing or not a single url(...).
      .filter(Boolean)
      .map(([, , thumbnailUrl]) =>
        // Example result: https://video.twimg.com/tweet_video/EIodxyoXYAI8fLE.mp4
        thumbnailUrl
          .replace('.jpg', '.mp4')
          .replace('tweet_video_thumb', 'tweet_video')
      ),
  };
}
/**
 * Downloads a remote media file and writes it into `localFolder`.
 *
 * The file keeps the last path segment of `remotePath` as its name. When that
 * name has no extension, one is appended from the detected file type of the
 * downloaded bytes, falling back to `png` when detection yields nothing.
 *
 * @param {string} remotePath - URL of the image or video to download.
 * @param {string} localFolder - Existing folder the file is written into.
 * @returns {Promise<void>} Resolves once the file has been written.
 * @throws {Error} When the server answers with a non-2xx status, so an error
 *   page is never saved to disk as if it were media.
 */
async function downloadImage(remotePath, localFolder) {
  console.log(`Downloading ${remotePath} to ${localFolder}`);
  const response = await fetch(remotePath);
  if (!response.ok) {
    throw new Error(
      `Failed to download ${remotePath}: ${response.status} ${response.statusText}`
    );
  }
  const imageData = await response.buffer();
  const imageName = getImageName(remotePath);
  const [, extension] = imageName.split('.');

  let fileName = imageName;
  if (!extension) {
    // Only sniff the bytes when the URL gives no extension to go by.
    const { ext = 'png' } = (await FileType.fromBuffer(imageData)) || {};
    fileName = `${imageName}.${ext}`;
  }
  await fs.writeFile(`${localFolder}/${fileName}`, imageData);
}
// Tips processed by `getTweets`: everything from index 85 of the imported
// `tips` list onward. The commented-out declarations are alternative
// selections (a small slice, or a single hand-written test tip).
// const selectedTips = tips.slice(35, 40);
const selectedTips = tips.slice(85);
// const selectedTips = [
// {
// url: 'https://twitter.com/wesbos/status/1191797964429283331',
// time: 12345,
// text: 'Test tip',
// },
// ];
/**
 * Resolves a (possibly shortened) link to its final URL.
 * Links without a scheme get `http://` prepended. If expansion fails, the
 * error is logged and the unexpanded URL is returned instead.
 */
async function expandLink(link) {
  const hasScheme = /^[a-z][a-z\d+.-]*:\/\//i.test(link);
  const url = hasScheme ? link : `http://${link}`;
  try {
    return (await expand(url)) || url;
  } catch (err) {
    console.error(err);
    return url;
  }
}

/**
 * Removes every given link from `text`, then any bare `http://` / `https://`
 * left behind. Links are removed first so that scheme clean-up can never
 * damage a link that still has to be matched.
 */
function stripLinks(text, links) {
  return links
    .reduce((remaining, link) => remaining.replace(link, ''), text)
    .replace(/https?:\/\//g, '');
}

/** Formats values as the items of a YAML block sequence, one `- item` per line. */
function toYamlList(items) {
  return items.map(item => `- ${item}`).join('\n');
}

/**
 * Scrapes every tip in `selectedTips` into `./src/tips/<slug>/`.
 *
 * For each tip it:
 *  1. derives a slug from the first eight words of the tip text,
 *  2. fetches the tweet page and extracts its image and video URLs,
 *  3. expands the links found in the text (all but the last one),
 *  4. downloads the media into the tip folder,
 *  5. writes `<slug>.mdx` (YAML frontmatter followed by the link-free text)
 *     and `<slug>.html` (the raw page).
 *
 * The tip objects themselves are not modified.
 *
 * @returns {Promise<void>} Resolves when every selected tip has been written.
 */
async function getTweets() {
  await fs.mkdir(`./src/tips/`, { recursive: true });

  for (const tip of selectedTips) {
    const slug = getSlug(
      tip.text
        .split(' ')
        .slice(0, 8)
        .join(' ')
    );
    const folderPath = `./src/tips/${slug}`;
    await fs.mkdir(folderPath, { recursive: true });
    console.log(`Scraping ${tip.url} into ${folderPath}`);

    const html = await fetch(tip.url).then(res => res.text());
    const { images, videos } = findImagePaths(html);

    const links = tip.text.match(linkRegex) || [];
    // The last link is left out of the resource list; presumably it is the
    // tweet's own media link — TODO confirm against the tips data.
    const resourceLinks = links.slice(0, -1);
    const expandedLinks = await Promise.all(resourceLinks.map(expandLink));

    await Promise.all(
      [...images, ...videos].map(path => downloadImage(path, folderPath))
    );

    const text = stripLinks(tip.text, links).trim();
    const content = [
      '---',
      `date: ${tip.time}`,
      `tweetURL: ${tip.url}`,
      // A JSON string is a valid YAML double-quoted scalar, so newlines,
      // colons and quotes in the tweet cannot break the frontmatter.
      `text: ${JSON.stringify(text)}`,
      `slug: ${slug}`,
      'images:',
      toYamlList(images),
      'videos:',
      toYamlList(videos),
      'links:',
      toYamlList(expandedLinks),
      '---',
      text,
      '',
    ].join('\n');

    await fs.writeFile(`${folderPath}/${slug}.mdx`, content, {
      encoding: 'utf-8',
    });
    await fs.writeFile(`${folderPath}/${slug}.html`, html, {
      encoding: 'utf-8',
    });
  }
}
// Run the scrape. A failure is reported and reflected in the exit code
// instead of being left as an unhandled promise rejection.
getTweets().catch(err => {
  console.error(err);
  process.exitCode = 1;
});