-
Notifications
You must be signed in to change notification settings - Fork 340
/
Copy pathextract-page-content.test.ts
61 lines (55 loc) · 2.14 KB
/
extract-page-content.test.ts
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
/* eslint-env jest */
import { JSDOM } from 'jsdom'
import { extractRawPageContent } from '@worldbrain/memex-common/lib/page-indexing/content-extraction/extract-page-content'
import { transformPageHTML } from '@worldbrain/memex-stemmer/lib/transform-page-html.service-worker'
/**
 * Suite for page content extraction.
 *
 * Neither case currently executes:
 *  - the PDF case is an unfinished draft whose body is entirely commented out;
 *  - the HTML case is disabled pending a fix (see its TODO).
 * Both are registered with `test.skip` so the Jest reporter lists them as
 * skipped instead of counting them as passing coverage.
 */
describe('Extract page content', () => {
    // Disabled setup/teardown kept from an earlier draft.
    // beforeAll(() => {
    //     browser.extension = {
    //         getURL: rel => path.resolve('extension/lib', rel.substr(1)),
    //     }
    // })
    // afterAll(() => {
    // })

    // Placeholder only: with a plain `test(...)` this empty body would be
    // reported as a pass even though nothing is asserted, so it is skipped
    // until the draft below is completed.
    // NOTE(review): the draft references `path`, `fs`, `fetch.mockResponseOnce`
    // and `extractPageContent`, none of which are imported in the visible part
    // of this file — they need wiring up before the draft can be enabled.
    test.skip('TO FINISH: extract content from PDF', async () => {
        // const pdfUrl =
        //     'http://cdn.linkdetox.com/wp-content/uploads/noindex/111-things-to-know-about-links.pdf'
        // const pdfPath = path.resolve(
        //     'test-content', 'pdf',
        //     '111-things-to-know-about-links.pdf',
        // )
        // const data = new Uint8Array(fs.readFileSync(pdfPath))
        // fetch.mockResponseOnce(new Blob([data], { type: 'application/pdf' }))
        // const result = await extractPageContent(null, pdfUrl)
    })

    // TODO: Fix this test
    // Builds a minimal HTML document with jsdom, runs it through
    // `extractRawPageContent`, converts the returned body to text with
    // `transformPageHTML`, and compares metadata + text against fixed values.
    // NOTE(review): the fixture's <html> element carries no `lang` attribute,
    // yet the expectation requires `lang: 'en'` — possibly related to why this
    // case is disabled; confirm against the extractor's behaviour.
    test.skip('extract content from an HTML page', () => {
        // The template literal is kept flush-left on purpose: every character
        // between the backticks, whitespace included, is part of the HTML
        // string handed to JSDOM.
        // eslint-disable-next-line new-cap
        const dom = new JSDOM(`
<!DOCTYPE html>
<html>
<head>
<title>My title</title>
<meta name="keywords" content="key words for all">
<meta name="description" content="some kind of description">
</head>
<body>
<p>Hello world</p>
</body>
</html>
`)
        const rawContent = extractRawPageContent(
            dom.window.document,
            'https://test.com',
        )
        const metadata = rawContent.metadata
        const fullText = transformPageHTML({ html: rawContent.body }).text

        // Metadata fields and the derived text are checked in one assertion.
        expect({ ...metadata, fullText }).toEqual({
            fullText: ' Hello world ',
            lang: 'en',
            canonicalUrl: undefined,
            title: 'My title',
            keywords: ['key words for all'],
            description: 'some kind of description',
        })
    })
})