Skip to content

Commit

Permalink
fix: Fix wk parsing
Browse files Browse the repository at this point in the history
  • Loading branch information
Coobaha committed Nov 4, 2023
1 parent 754da92 commit aab8412
Show file tree
Hide file tree
Showing 4 changed files with 49 additions and 126 deletions.
5 changes: 1 addition & 4 deletions server/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
"version": "1.0.0",
"author": "",
"dependencies": {
"@coobaha/typed-fastify": "1.2.0",
"@coobaha/typed-fastify": "^2.0.1",
"@fastify/autoload": "5.8.0",
"@fastify/caching": "8.3.0",
"@fastify/cors": "8.4.0",
Expand Down Expand Up @@ -54,9 +54,6 @@
"main": "app.mts",
"private": true,
"type": "module",
"resolutions": {
"ts-json-schema-generator": "0.82.0"
},
"scripts": {
"build:ts": "tsc",
"dev": "tsc && concurrently -k -p \"[{name}]\" -n \"TypeScript,App\" -c \"yellow.bold,cyan.bold\" \"tsc -w\" \"fastify start --esm -o -w --ignore-watch 'client node_modules .idea dist' --ignore-watch=.ts$ -l info -P dist/app.js\"",
Expand Down
50 changes: 0 additions & 50 deletions server/scratch.ts

This file was deleted.

100 changes: 32 additions & 68 deletions server/src/shared/search.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@ import * as qs from 'querystring';

import * as NodeURL from 'node:url';
import logger from './logger.js';
import type { Data, ImageT, SanakirjaData, WiktionaryData } from './types.js';
import type { ImageT, SanakirjaData, WiktionaryData } from './types.js';

import JSON5 from 'json5';

Expand Down Expand Up @@ -152,24 +152,35 @@ async function wiktionary(opts: {
term: string;
}): Promise<WiktionaryData | null> {
// Example query
// https://en.wiktionary.org/w/api.php?action=query&prop=extracts&titles=pomology&format=json
const url = NodeURL.format({
protocol: 'https',
hostname: `en.wiktionary.org`,
pathname: '/w/api.php',
query: {
action: 'parse',
format: 'json',
page: opts.term,
converttitles: true,
redirects: true,
},
});
// https://en.wiktionary.org/w/api.php?action=parse&page=l%C3%A4mmet%C3%A4&format=json&converttitles=true&redirects=true
const url = new NodeURL.URL('https://en.wiktionary.org/w/api.php');
url.searchParams.append('action', 'parse');
url.searchParams.append('format', 'json');
url.searchParams.append('page', opts.term);
url.searchParams.append('converttitles', 'true');
url.searchParams.append('redirects', 'true');

url.searchParams.append('prop', 'sections');
const sections: Response = await get(url).json();
if (sections.error) {
const logged = {
term: opts.term,
code: sections.error?.code,
info: sections.error?.info,
message: 'Error fetching WK Sections',
};
logger.error(logged);
throw Error('Error fetching WK sections ' + opts.term);
}

// console.time(url);
// console.time(`fetch_${url}`);
const finnishSection = sections.parse.sections.find((section) =>
section.line.includes('Finnish'),
);
url.searchParams.delete('prop');
if (finnishSection) {
url.searchParams.append('section', finnishSection.index);
}
const body: Response = await get(url).json();
// console.timeEnd(`fetch_${url}`);

if (body.error) {
const logged = {
Expand All @@ -181,15 +192,8 @@ async function wiktionary(opts: {
logger.error(logged);
throw Error('Error fetching WK term ' + opts.term);
}
const html = body.parse.text['*'];

const finnish = html
.split('<hr />')
.find((section) =>
section.includes('<span class="mw-headline" id="Finnish">Finnish</span>'),
);

if (!finnish) {
if (!finnishSection) {
const finnishLinks = body.parse.iwlinks
.filter((link) => link.prefix === 'fi')
.map((link) => {
Expand All @@ -210,7 +214,8 @@ async function wiktionary(opts: {
return null;
}

const $$ = load(DOMPurify.sanitize(finnish));
const html = body.parse.text['*'];
const $$ = load(DOMPurify.sanitize(html));
const $html = $$.root();
$html.find('#toc').remove();
$html.find('.mw-editsection').remove();
Expand Down Expand Up @@ -295,7 +300,7 @@ async function wiktionary(opts: {
.nextUntil('ol')
.next('ol');
plainTranslation.find(':empty').remove();
translations = `111111111111111111112222<ol>${plainTranslation.html()}</ol>`;
translations = `<ol>${plainTranslation.html()}</ol>`;
}

const etymology = $html
Expand Down Expand Up @@ -595,45 +600,6 @@ async function wiktionary(opts: {
};
}

/**
 * Looks up `term` in three sources at once — `sk` with `lang: 'en'`, `sk`
 * with `lang: 'ru'`, and `wiktionary` — and merges the results into a single
 * `Data` record.
 *
 * The returned promise rejects as soon as any of the three lookups rejects
 * (`Promise.all` semantics). `wiktionary` may resolve to `null`; in that case
 * every field read from the Wiktionary result is `undefined` and `meta`
 * falls back to `{}`.
 *
 * @param term - The word to look up; it is also stored as the `Finnish` field
 *   and interpolated into `wk_url`.
 * @returns A promise for the combined lookup result.
 */
const fetch: (term: string) => Promise<Data> = (term: string) => {
  // Start all three requests before waiting on any of them.
  const lookups = Promise.all([
    sk({ term: term, lang: 'en', swap: false }),
    sk({ term: term, lang: 'ru', swap: false }),
    wiktionary({ term: term }),
  ]);

  return lookups.then((results) => {
    const [english, russian, wiki] = results;

    return {
      Finnish: term,
      sk_en_url: english.url,
      sk_ru_url: russian.url,
      wk_url: `https://en.wiktionary.org/wiki/${term}#Finnish`,
      en_translation: english.translations,
      en_synonyms: english.synonyms,

      ru_translation: russian.translations,
      ru_synonyms: russian.synonyms,
      // `wiki` is null when no Wiktionary data was produced, hence `?.` below.
      wk_translation: wiki?.wk_translation,
      wk_synonyms: wiki?.wk_synonyms,
      wk_antonyms: wiki?.wk_antonyms,
      wk_decl: wiki?.wk_decl,
      wk_notes: wiki?.wk_notes,
      wk_derived: wiki?.wk_derived,
      etymology: wiki?.etymology,
      suffix: wiki?.suffix,
      wordtype: wiki?.wordtype,
      wk_possessive: wiki?.wk_possessive,
      compounds: wiki?.compounds,
      meta: wiki?.meta ?? {},
    };
  });
};

export const fetchWiktionary: (term: string) => Promise<WiktionaryData> = (
term,
) =>
Expand Down Expand Up @@ -675,8 +641,6 @@ export const fetchSk: (
sk_translation_strings: sk.translationsStrings,
}));

export default fetch;

export const googleImages = async (search: string): Promise<ImageT[]> => {
const url = `https://www.google.com/search?q=${encodeURIComponent(
search,
Expand Down
20 changes: 16 additions & 4 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1262,16 +1262,18 @@
"@babel/helper-validator-identifier" "^7.19.1"
to-fast-properties "^2.0.0"

"@coobaha/typed-fastify@1.2.0":
version "1.2.0"
resolved "https://registry.yarnpkg.com/@coobaha/typed-fastify/-/typed-fastify-1.2.0.tgz#c1697f64e4ab42afcea2c66da72a9731c39fbdd2"
integrity sha512-Ie05vJOyAKTpLMNzZAovcqXMOlmEzl4YvBR/2//GnuxTc1mNUR3d4hImKAwQ2ZDQSQhHG7xydYCxd+Mpkq6Mhg==
"@coobaha/typed-fastify@^2.0.1":
version "2.0.1"
resolved "https://registry.yarnpkg.com/@coobaha/typed-fastify/-/typed-fastify-2.0.1.tgz#ae9bd0859fdde563476c26203de6c90981b724c4"
integrity sha512-XrzdeFuShww0xN0an+u0ryOpDOMWP/SPQeXZ9djoL/MEYws4yP442/CzTGvTsMF5Oe+zv5FOPC/XJaAROF0R8Q==
dependencies:
"@types/json-schema" "^7.0.13"
crypto-js "^4.1.1"
glob "^10.3.4"
json-schema-merge-allof "^0.8.1"
json-schema-traverse "^1.0.0"
std-env "3.4.3"
type-fest "^4.6.0"
typescript-json-schema "^0.61.0"
yargs "^17.7.2"

Expand Down Expand Up @@ -10456,6 +10458,11 @@ [email protected]:
resolved "https://registry.yarnpkg.com/statuses/-/statuses-1.5.0.tgz#161c7dac177659fd9811f43771fa99381478628c"
integrity sha1-Fhx9rBd2Wf2YEfQ3cfqZOBR4Yow=

std-env@3.4.3:
version "3.4.3"
resolved "https://registry.yarnpkg.com/std-env/-/std-env-3.4.3.tgz#326f11db518db751c83fd58574f449b7c3060910"
integrity sha512-f9aPhy8fYBuMN+sNfakZV18U39PbalgjXG3lLB9WkaYTxijru61wb57V9wxxNthXM5Sd88ETBWi29qLAsHO52Q==

stop-iteration-iterator@^1.0.0:
version "1.0.0"
resolved "https://registry.yarnpkg.com/stop-iteration-iterator/-/stop-iteration-iterator-1.0.0.tgz#6a60be0b4ee757d1ed5254858ec66b10c49285e4"
Expand Down Expand Up @@ -11283,6 +11290,11 @@ type-fest@^0.8.0:
resolved "https://registry.yarnpkg.com/type-fest/-/type-fest-0.8.1.tgz#09e249ebde851d3b1e48d27c105444667f17b83d"
integrity sha512-4dbzIzqvjtgiM5rw1k5rEHtBANKmdudhGyBEajN01fEyhaAIhsoKNy6y7+IN93IfpFtwY9iqi7kD+xwKhQsNJA==

type-fest@^4.6.0:
version "4.6.0"
resolved "https://registry.yarnpkg.com/type-fest/-/type-fest-4.6.0.tgz#9c575f7e20530defef4f9cdc5e2c85d6e4ea0fc9"
integrity sha512-rLjWJzQFOq4xw7MgJrCZ6T1jIOvvYElXT12r+y0CC6u67hegDHaxcPqb2fZHOGlqxugGQPNB1EnTezjBetkwkw==

type-is@^1.6.16, type-is@^1.6.18:
version "1.6.18"
resolved "https://registry.yarnpkg.com/type-is/-/type-is-1.6.18.tgz#4e552cd05df09467dcbc4ef739de89f2cf37c131"
Expand Down

0 comments on commit aab8412

Please sign in to comment.