Skip to content
This repository has been archived by the owner on Mar 15, 2024. It is now read-only.

Commit

Permalink
Better validation for XML schema types
Browse files Browse the repository at this point in the history
  • Loading branch information
lionel-rowe committed Mar 13, 2024
1 parent 844ab65 commit 2db763e
Show file tree
Hide file tree
Showing 2 changed files with 106 additions and 26 deletions.
46 changes: 32 additions & 14 deletions src/core/imageMap.ts
Original file line number Diff line number Diff line change
Expand Up @@ -111,12 +111,8 @@ function getAllTexts($: CheerioAPI, rectType: RectType): PositionedText[] {
}
}

/** Unicode script names treated as "non-space-delimited" when deciding how to join lines. */
const nonSpaceDelimitedScripts = ['Han', 'Hiragana', 'Katakana', 'Thai', 'Khmer', 'Lao', 'Myanmar', 'Javanese'] as const
/** One `\p{Script_Extensions=…}` character-class fragment per script listed above. */
const nonSpaceDelimitedExts = nonSpaceDelimitedScripts.map((x) => `\\p{Script_Extensions=${x}}` as const)
// Fullwidth punctuation (U+FF01–U+FF5E, excluding digits, letters, and `＠`). These must be the fullwidth forms:
// their ASCII look-alikes include `'` and `` ` ``, which would terminate this string literal, and would also make
// the resulting character class match ordinary Latin punctuation.
const fullWidthPunct = '！＂＃＄％＆＇（）＊＋，－．／：；＜＝＞？［＼］＾＿｀｛｜｝～'
// ASCII hyphen (escaped so it is never read as a range inside the class) and slash.
const nonSpaceDelimitedPunct = '\\-/'
/** Source of a single character class combining all of the fragments above. */
const nonSpaceDelimited = `[${[...nonSpaceDelimitedExts, fullWidthPunct, nonSpaceDelimitedPunct].join('')}]` as const
/** Matches one character belonging to any of the scripts or punctuation sets above (`u` flag enables `\p{…}`). */
const nonSpaceDelimitedRe = new RegExp(nonSpaceDelimited, 'u')
/**
 * Matches a single character next to which no joining space should be inserted: an ASCII hyphen or slash, any
 * Unicode white space, any character whose script extensions include Han, Hiragana, Katakana, Thai, Khmer, Lao,
 * Myanmar, or Javanese, or a fullwidth punctuation mark.
 *
 * The trailing punctuation run must be the fullwidth forms (U+FF01–U+FF5E), not their ASCII look-alikes; with
 * ASCII characters the class would also match ordinary Latin punctuation such as `.` and `,`.
 */
export const NON_SPACE_DELIMITED =
	/[\-/\p{White_Space}\p{scx=Han}\p{scx=Hiragana}\p{scx=Katakana}\p{scx=Thai}\p{scx=Khmer}\p{scx=Lao}\p{scx=Myanmar}\p{scx=Javanese}！＂＃＄％＆＇（）＊＋，－．／：；＜＝＞？［＼］＾＿｀｛｜｝～]/u

/**
* Merge the text from multiple hard-wrapped lines, joining with or without a space, depending on whether the preceding
Expand All @@ -139,7 +135,7 @@ export function mergeLineTexts(lines: string[]): string {
else {
const prevChar = text.match(/.$/u)?.[0] ?? ''
const nextChar = line.match(/^./u)?.[0] ?? ''
if (!nonSpaceDelimitedRe.test(prevChar) && !nonSpaceDelimitedRe.test(nextChar)) {
if (!NON_SPACE_DELIMITED.test(prevChar) && !NON_SPACE_DELIMITED.test(nextChar)) {
text += ' '
}
text += line
Expand All @@ -160,10 +156,10 @@ const mergeLines = mergeByStrategy(mergeLineTexts)
// Merge strategy whose combined text is the individual texts joined with a tab character between them.
const mergeTabs = mergeByStrategy((texts) => texts.join('\t'))

/**
 * Build a `Rect` from the `l`, `t`, `r`, and `b` attributes of a positioned element.
 *
 * Each coordinate is read through `xs.integer` rather than `Number(...)`, so a missing attribute throws instead of
 * silently producing `NaN`, and each coordinate is declared exactly once.
 *
 * @param $el Element carrying the four coordinate attributes.
 * @returns A new `Rect` constructed from the four parsed coordinates.
 */
function getRect($el: Cheerio<PositionedElement>): Rect {
	const l = xs.integer($el, 'l')
	const t = xs.integer($el, 't')
	const r = xs.integer($el, 'r')
	const b = xs.integer($el, 'b')

	return new Rect({ l, t, r, b })
}
Expand All @@ -173,10 +169,32 @@ function getAspectRatio($el: Cheerio<PositionedElement>): number {
return width / height
}

/**
 * Always throws, reporting that a required attribute is absent from an element.
 *
 * @param $el Element that was expected to carry the attribute; its `tagName` property is included in the message.
 * @param attrName Name of the absent attribute.
 * @throws {TypeError} Unconditionally.
 */
function attrMissing($el: Cheerio<Element>, attrName: string): never {
	const tagName = $el.prop('tagName')
	const message = `Attribute ${attrName} missing on ${tagName}`

	throw new TypeError(message)
}
/** Parse attributes conforming to XML schema datatypes */
const xs = {
	/**
	 * [`xs:boolean`](https://www.w3.org/TR/xmlschema-2/#boolean) datatype
	 *
	 * Only the four lexical forms `true`, `false`, `1`, and `0` are recognized (surrounding white space is ignored).
	 * Matching is done on the whole value, so strings that merely contain one of those forms (e.g. `"10"` or
	 * `"falsey"`) are not mistaken for booleans.
	 *
	 * @param $el Element to read the attribute from.
	 * @param attr Attribute name.
	 * @param defaultVal Value to return when the attribute is absent or is not a recognized boolean form.
	 * @returns The parsed boolean, or `defaultVal`.
	 * @throws {TypeError} If there is no recognized value and no `defaultVal` was supplied.
	 */
	boolean($el: Cheerio<Element>, attr: string, defaultVal?: boolean): boolean {
		switch ($el.attr(attr)?.trim()) {
			case 'true':
			case '1':
				return true
			case 'false':
			case '0':
				return false
			default:
				return defaultVal ?? attrMissing($el, attr)
		}
	},
	/**
	 * [`xs:integer`](https://www.w3.org/TR/xmlschema-2/#integer) datatype
	 *
	 * The attribute value must consist of an optional sign followed by decimal digits (surrounding white space is
	 * ignored). Values such as `"12px"` or `"1.5"`, which `parseInt` alone would silently truncate, are rejected.
	 *
	 * @param $el Element to read the attribute from.
	 * @param attr Attribute name.
	 * @param defaultVal Value to use when the attribute is absent.
	 * @returns The parsed integer, or `defaultVal`.
	 * @throws {TypeError} If the attribute is absent and no `defaultVal` was supplied.
	 * @throws An assertion error if the value is not a valid integer literal or is not a safe integer.
	 */
	integer($el: Cheerio<Element>, attr: string, defaultVal?: number): number {
		const val = $el.attr(attr)
		let int: number

		if (val == null) {
			int = defaultVal ?? attrMissing($el, attr)
		} else {
			const lexical = val.trim()
			// Validate the whole lexical form first: `parseInt` stops at the first non-digit instead of failing.
			assert(/^[+-]?\d+$/.test(lexical), `Attribute ${attr}="${val}" is not a valid xs:integer`)
			int = parseInt(lexical, 10)
		}

		assert(Number.isSafeInteger(int), `${int} is not a safe integer`)

		return int
	},
}

function getTextContent($char: Cheerio<PositionedElement>): string {
if ($char.attr('isTab')) return '\t'
if (xs.boolean($char, 'isTab', false)) return '\t'

const text = $char.text()

// if aspect ratio > 1 (width > height), we assume it's a tab
const isSoftTab = text.trim() === '' && getAspectRatio($char) > 1

return isSoftTab ? '\t' : text
Expand All @@ -198,8 +216,8 @@ function getTabDelimitedTexts($: CheerioAPI, $line: Cheerio<PositionedElement>):
const text = getTextContent($char)

if (text === '\t') {
positionedText.rect.r = Number($char.attr('l'))
texts.push({ text: '', rect: new Rect({ l: Number($char.attr('r')), t, r, b }) })
positionedText.rect.r = xs.integer($char, 'l')
texts.push({ text: '', rect: new Rect({ l: xs.integer($char, 'r'), t, r, b }) })
} else {
positionedText.text += text
}
Expand Down
86 changes: 74 additions & 12 deletions tests/core/imageMap.test.ts
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
import { imageMap, mergeLineTexts, Rect } from '../../src/core/imageMap.ts'
import { imageMap, mergeLineTexts, NON_SPACE_DELIMITED, Rect } from '../../src/core/imageMap.ts'
import { assertEquals } from 'std/assert/mod.ts'

// Template tag for the XML fixtures below: `String.raw` returns the template's raw text, without processing
// escape sequences.
const xml = String.raw

Deno.test(imageMap.name, async (t) => {
// simplified xml format
const xml = `<page>
const xmlContent = xml`<page>
<par>
<line l="0" t="0" r="15" b="10">
<charParams l="0" t="0" r="5" b="10">f</charParams>
Expand All @@ -19,22 +21,44 @@ Deno.test(imageMap.name, async (t) => {
</page>`

await t.step('line', () => {
const map = imageMap(xml, { rectType: 'line' })
const map = imageMap(xmlContent, { rectType: 'line' })
assertEquals(map.texts, [
{ text: 'foo', rect: new Rect({ l: 0, t: 0, r: 15, b: 10 }) },
{ text: 'bar', rect: new Rect({ l: 0, t: 10, r: 15, b: 20 }) },
])
})

await t.step('paragraph', () => {
const map = imageMap(xml, { rectType: 'paragraph' })
const map = imageMap(xmlContent, { rectType: 'paragraph' })
assertEquals(map.texts, [
{ text: 'foo bar', rect: new Rect({ l: 0, t: 0, r: 15, b: 20 }) },
])
})

await t.step('paragraph (Chinese)', () => {
const xmlContent = xml`<page>
<par>
<line l="0" t="0" r="15" b="10">
<charParams l="0" t="0" r="5" b="10">福</charParams>
<charParams l="5" t="0" r="10" b="10">呜</charParams>
<charParams l="10" t="0" r="15" b="10">呜</charParams>
</line>
<line l="0" t="10" r="15" b="20">
<charParams l="0" t="10" r="5" b="20">巴</charParams>
<charParams l="5" t="10" r="10" b="20">啊</charParams>
<charParams l="10" t="10" r="15" b="20">啊</charParams>
</line>
</par>
</page>`

const map = imageMap(xmlContent, { rectType: 'paragraph' })
assertEquals(map.texts, [
{ text: '福呜呜巴啊啊', rect: new Rect({ l: 0, t: 0, r: 15, b: 20 }) },
])
})

await t.step('non-tab space', () => {
const xml = `<page>
const xmlContent = xml`<page>
<par>
<line l="0" t="0" r="35" b="10">
<charParams l="0" t="0" r="5" b="10">f</charParams>
Expand All @@ -48,14 +72,14 @@ Deno.test(imageMap.name, async (t) => {
</par>
</page>`

const map = imageMap(xml)
const map = imageMap(xmlContent)
assertEquals(map.texts, [
{ text: 'foo bar', rect: new Rect({ l: 0, t: 0, r: 35, b: 10 }) },
])
})

await t.step('hard tab within single line splits rects', () => {
const xml = `<page>
const xmlContent = xml`<page>
<par>
<line l="0" t="0" r="35" b="10">
<charParams l="0" t="0" r="5" b="10">f</charParams>
Expand All @@ -69,15 +93,15 @@ Deno.test(imageMap.name, async (t) => {
</par>
</page>`

const map = imageMap(xml)
const map = imageMap(xmlContent)
assertEquals(map.texts, [
{ text: 'foo', rect: new Rect({ l: 0, t: 0, r: 15, b: 10 }) },
{ text: 'bar', rect: new Rect({ l: 20, t: 0, r: 35, b: 10 }) },
])
})

await t.step('soft tab (space with width > height) within single line splits rects', () => {
const xml = `<page>
const xmlContent = xml`<page>
<par>
<line l="0" t="0" r="45" b="10">
<charParams l="0" t="0" r="5" b="10">f</charParams>
Expand All @@ -91,15 +115,15 @@ Deno.test(imageMap.name, async (t) => {
</par>
</page>`

const map = imageMap(xml)
const map = imageMap(xmlContent)
assertEquals(map.texts, [
{ text: 'foo', rect: new Rect({ l: 0, t: 0, r: 15, b: 10 }) },
{ text: 'bar', rect: new Rect({ l: 30, t: 0, r: 45, b: 10 }) },
])
})

await t.step('soft tab within multi-line paragraph adds literal tab char', () => {
const xml = `<page>
const xmlContent = xml`<page>
<par>
<line l="0" t="0" r="45" b="20">
<charParams l="0" t="0" r="5" b="10">f</charParams>
Expand All @@ -118,7 +142,7 @@ Deno.test(imageMap.name, async (t) => {
</par>
</page>`

const map = imageMap(xml)
const map = imageMap(xmlContent)
assertEquals(map.texts, [
{ text: 'foo\tbar baz', rect: new Rect({ l: 0, t: 0, r: 45, b: 20 }) },
])
Expand Down Expand Up @@ -170,3 +194,41 @@ Deno.test(mergeLineTexts.name, async (t) => {
})
})
})

// Tests for the exported `NON_SPACE_DELIMITED` regex: one step pins its literal source, the rest count matches
// in representative strings.
Deno.test('NON_SPACE_DELIMITED regex', async (t) => {
	await t.step('literal', () => {
		// Logic to re-create the regular expression dynamically. We keep it under `tests`, rather than building the
		// regex each time at runtime, for performance reasons.
		const scripts = ['Han', 'Hiragana', 'Katakana', 'Thai', 'Khmer', 'Lao', 'Myanmar', 'Javanese'] as const
		const exts = scripts.map((x) => `\\p{scx=${x}}` as const)
		// Fullwidth forms (U+FF01–U+FF5E), not ASCII: the ASCII look-alikes include `'` and `` ` ``, which would
		// terminate this string literal.
		const fullWidthPunct = '！＂＃＄％＆＇（）＊＋，－．／：；＜＝＞？［＼］＾＿｀｛｜｝～'
		const nonSpaceDelimitedPunct = '\\-/'
		const alreadySpace = '\\p{White_Space}'
		// Fragment order must mirror the regex literal exactly for the comparison below to hold.
		const source = `[${[nonSpaceDelimitedPunct, alreadySpace, ...exts, fullWidthPunct].join('')}]` as const

		assertEquals(NON_SPACE_DELIMITED, new RegExp(source, 'u'))
	})

	/** Assert that `NON_SPACE_DELIMITED`, applied globally, matches exactly `expected` times within `str`. */
	function assertNumNonSpaceDelimitedMatches(str: string, expected: number) {
		// The exported regex has no `g` flag, which `matchAll` requires, so build a global copy.
		const globalRe = new RegExp(NON_SPACE_DELIMITED.source, NON_SPACE_DELIMITED.flags + 'g')
		const numMatches = [...str.matchAll(globalRe)].length
		assertEquals(numMatches, expected)
	}

	await t.step('matches Chinese', () => {
		assertNumNonSpaceDelimitedMatches('文字、标点符号', 7)
	})

	await t.step('doesn’t match empty string', () => {
		assertNumNonSpaceDelimitedMatches('', 0)
	})

	await t.step('doesn’t match Latin alphabet', () => {
		assertNumNonSpaceDelimitedMatches('abcdefg', 0)
	})

	await t.step('matches text that’s already whitespace itself', () => {
		const nbsp = '\xa0'
		assertNumNonSpaceDelimitedMatches(` \t\n${nbsp}`, 4)
	})
})

0 comments on commit 2db763e

Please sign in to comment.