Better validation for XML schema types

clearlylocal · Mar 13, 2024 · 2db763e · 2db763e
1 parent 844ab65
commit 2db763e
Show file tree

Hide file tree

Showing 2 changed files with 106 additions and 26 deletions.
diff --git a/src/core/imageMap.ts b/src/core/imageMap.ts
@@ -111,12 +111,8 @@ function getAllTexts($: CheerioAPI, rectType: RectType): PositionedText[] {
 	}
 }
 
-const nonSpaceDelimitedScripts = ['Han', 'Hiragana', 'Katakana', 'Thai', 'Khmer', 'Lao', 'Myanmar', 'Javanese'] as const
-const nonSpaceDelimitedExts = nonSpaceDelimitedScripts.map((x) => `\\p{Script_Extensions=${x}}` as const)
-const fullWidthPunct = '！＂＃＄％＆＇（）＊＋，－．／：；＜＝＞？［＼］＾＿｀｛｜｝～'
-const nonSpaceDelimitedPunct = '\\-/'
-const nonSpaceDelimited = `[${[...nonSpaceDelimitedExts, fullWidthPunct, nonSpaceDelimitedPunct].join('')}]` as const
-const nonSpaceDelimitedRe = new RegExp(nonSpaceDelimited, 'u')
+export const NON_SPACE_DELIMITED =
+	/[\-/\p{White_Space}\p{scx=Han}\p{scx=Hiragana}\p{scx=Katakana}\p{scx=Thai}\p{scx=Khmer}\p{scx=Lao}\p{scx=Myanmar}\p{scx=Javanese}！＂＃＄％＆＇（）＊＋，－．／：；＜＝＞？［＼］＾＿｀｛｜｝～]/u
 
 /**
  * Merge the text from multiple hard-wrapped lines, joining with or without a space, depending on whether the preceding
@@ -139,7 +135,7 @@ export function mergeLineTexts(lines: string[]): string {
 		else {
 			const prevChar = text.match(/.$/u)?.[0] ?? ''
 			const nextChar = line.match(/^./u)?.[0] ?? ''
-			if (!nonSpaceDelimitedRe.test(prevChar) && !nonSpaceDelimitedRe.test(nextChar)) {
+			if (!NON_SPACE_DELIMITED.test(prevChar) && !NON_SPACE_DELIMITED.test(nextChar)) {
 				text += ' '
 			}
 			text += line
@@ -160,10 +156,10 @@ const mergeLines = mergeByStrategy(mergeLineTexts)
 const mergeTabs = mergeByStrategy((texts) => texts.join('\t'))
 
 function getRect($el: Cheerio<PositionedElement>): Rect {
-	const l = Number($el.attr('l'))
-	const t = Number($el.attr('t'))
-	const r = Number($el.attr('r'))
-	const b = Number($el.attr('b'))
+	const l = xs.integer($el, 'l')
+	const t = xs.integer($el, 't')
+	const r = xs.integer($el, 'r')
+	const b = xs.integer($el, 'b')
 
 	return new Rect({ l, t, r, b })
 }
@@ -173,10 +169,32 @@ function getAspectRatio($el: Cheerio<PositionedElement>): number {
 	return width / height
 }
 
+function attrMissing($el: Cheerio<Element>, attrName: string): never {
+	throw new TypeError(`Attribute ${attrName} missing on ${$el.prop('tagName')}`)
+}
+/** Parse attributes conforming to XML schema datatypes; an absent attribute yields `defaultVal`, or throws if none */
+const xs = {
+	/** [`xs:boolean`](https://www.w3.org/TR/xmlschema-2/#boolean) datatype: exactly `true`, `false`, `1` or `0` */
+	boolean($el: Cheerio<Element>, attr: string, defaultVal?: boolean): boolean {
+		const val = $el.attr(attr)
+		// The group makes `^`/`$` anchor every alternative; ungrouped, "10" or "falsey" would be accepted as booleans.
+		return val && /^(?:true|false|1|0)$/.test(val) ? val === 'true' || val === '1' : (defaultVal ?? attrMissing($el, attr))
+	},
+	/** [`xs:integer`](https://www.w3.org/TR/xmlschema-2/#integer) datatype; asserts the parsed value is a safe integer */
+	integer($el: Cheerio<Element>, attr: string, defaultVal?: number): number {
+		const val = $el.attr(attr)
+		const int = val == null ? (defaultVal ?? attrMissing($el, attr)) : parseInt(val, 10)
+		assert(Number.isSafeInteger(int), `${int} is not a safe integer`)
+		return int
+	},
+}
+
 function getTextContent($char: Cheerio<PositionedElement>): string {
-	if ($char.attr('isTab')) return '\t'
+	if (xs.boolean($char, 'isTab', false)) return '\t'
 
 	const text = $char.text()
+
+	// if aspect ratio > 1 (width > height), we assume it's a tab
 	const isSoftTab = text.trim() === '' && getAspectRatio($char) > 1
 
 	return isSoftTab ? '\t' : text
@@ -198,8 +216,8 @@ function getTabDelimitedTexts($: CheerioAPI, $line: Cheerio<PositionedElement>):
 		const text = getTextContent($char)
 
 		if (text === '\t') {
-			positionedText.rect.r = Number($char.attr('l'))
-			texts.push({ text: '', rect: new Rect({ l: Number($char.attr('r')), t, r, b }) })
+			positionedText.rect.r = xs.integer($char, 'l')
+			texts.push({ text: '', rect: new Rect({ l: xs.integer($char, 'r'), t, r, b }) })
 		} else {
 			positionedText.text += text
 		}

diff --git a/tests/core/imageMap.test.ts b/tests/core/imageMap.test.ts
@@ -1,9 +1,11 @@
-import { imageMap, mergeLineTexts, Rect } from '../../src/core/imageMap.ts'
+import { imageMap, mergeLineTexts, NON_SPACE_DELIMITED, Rect } from '../../src/core/imageMap.ts'
 import { assertEquals } from 'std/assert/mod.ts'
 
+const xml = String.raw
+
 Deno.test(imageMap.name, async (t) => {
 	// simplified xml format
-	const xml = `<page>
+	const xmlContent = xml`<page>
 		<par>
 			<line l="0" t="0" r="15" b="10">
 				<charParams l="0" t="0" r="5" b="10">f</charParams>
@@ -19,22 +21,44 @@ Deno.test(imageMap.name, async (t) => {
 	</page>`
 
 	await t.step('line', () => {
-		const map = imageMap(xml, { rectType: 'line' })
+		const map = imageMap(xmlContent, { rectType: 'line' })
 		assertEquals(map.texts, [
 			{ text: 'foo', rect: new Rect({ l: 0, t: 0, r: 15, b: 10 }) },
 			{ text: 'bar', rect: new Rect({ l: 0, t: 10, r: 15, b: 20 }) },
 		])
 	})
 
 	await t.step('paragraph', () => {
-		const map = imageMap(xml, { rectType: 'paragraph' })
+		const map = imageMap(xmlContent, { rectType: 'paragraph' })
 		assertEquals(map.texts, [
 			{ text: 'foo bar', rect: new Rect({ l: 0, t: 0, r: 15, b: 20 }) },
 		])
 	})
 
+	await t.step('paragraph (Chinese)', () => {
+		const xmlContent = xml`<page>
+			<par>
+				<line l="0" t="0" r="15" b="10">
+					<charParams l="0" t="0" r="5" b="10">福</charParams>
+					<charParams l="5" t="0" r="10" b="10">呜</charParams>
+					<charParams l="10" t="0" r="15" b="10">呜</charParams>
+				</line>
+				<line l="0" t="10" r="15" b="20">
+					<charParams l="0" t="10" r="5" b="20">巴</charParams>
+					<charParams l="5" t="10" r="10" b="20">啊</charParams>
+					<charParams l="10" t="10" r="15" b="20">啊</charParams>
+				</line>
+			</par>
+		</page>`
+
+		const map = imageMap(xmlContent, { rectType: 'paragraph' })
+		assertEquals(map.texts, [
+			{ text: '福呜呜巴啊啊', rect: new Rect({ l: 0, t: 0, r: 15, b: 20 }) },
+		])
+	})
+
 	await t.step('non-tab space', () => {
-		const xml = `<page>
+		const xmlContent = xml`<page>
 			<par>
 				<line l="0" t="0" r="35" b="10">
 					<charParams l="0" t="0" r="5" b="10">f</charParams>
@@ -48,14 +72,14 @@ Deno.test(imageMap.name, async (t) => {
 			</par>
 		</page>`
 
-		const map = imageMap(xml)
+		const map = imageMap(xmlContent)
 		assertEquals(map.texts, [
 			{ text: 'foo bar', rect: new Rect({ l: 0, t: 0, r: 35, b: 10 }) },
 		])
 	})
 
 	await t.step('hard tab within single line splits rects', () => {
-		const xml = `<page>
+		const xmlContent = xml`<page>
 			<par>
 				<line l="0" t="0" r="35" b="10">
 					<charParams l="0" t="0" r="5" b="10">f</charParams>
@@ -69,15 +93,15 @@ Deno.test(imageMap.name, async (t) => {
 			</par>
 		</page>`
 
-		const map = imageMap(xml)
+		const map = imageMap(xmlContent)
 		assertEquals(map.texts, [
 			{ text: 'foo', rect: new Rect({ l: 0, t: 0, r: 15, b: 10 }) },
 			{ text: 'bar', rect: new Rect({ l: 20, t: 0, r: 35, b: 10 }) },
 		])
 	})
 
 	await t.step('soft tab (space with width > height) within single line splits rects', () => {
-		const xml = `<page>
+		const xmlContent = xml`<page>
 			<par>
 				<line l="0" t="0" r="45" b="10">
 					<charParams l="0" t="0" r="5" b="10">f</charParams>
@@ -91,15 +115,15 @@ Deno.test(imageMap.name, async (t) => {
 			</par>
 		</page>`
 
-		const map = imageMap(xml)
+		const map = imageMap(xmlContent)
 		assertEquals(map.texts, [
 			{ text: 'foo', rect: new Rect({ l: 0, t: 0, r: 15, b: 10 }) },
 			{ text: 'bar', rect: new Rect({ l: 30, t: 0, r: 45, b: 10 }) },
 		])
 	})
 
 	await t.step('soft tab within multi-line paragraph adds literal tab char', () => {
-		const xml = `<page>
+		const xmlContent = xml`<page>
 			<par>
 				<line l="0" t="0" r="45" b="20">
 					<charParams l="0" t="0" r="5" b="10">f</charParams>
@@ -118,7 +142,7 @@ Deno.test(imageMap.name, async (t) => {
 			</par>
 		</page>`
 
-		const map = imageMap(xml)
+		const map = imageMap(xmlContent)
 		assertEquals(map.texts, [
 			{ text: 'foo\tbar baz', rect: new Rect({ l: 0, t: 0, r: 45, b: 20 }) },
 		])
@@ -170,3 +194,41 @@ Deno.test(mergeLineTexts.name, async (t) => {
 		})
 	})
 })
+
+Deno.test('NON_SPACE_DELIMITED regex', async (t) => {
+	await t.step('literal', () => {
+		// Logic to re-create the regular expression dynamically. We keep it under `tests`, rather than building the
+		// regex each time at runtime, for performance reasons.
+		const scripts = ['Han', 'Hiragana', 'Katakana', 'Thai', 'Khmer', 'Lao', 'Myanmar', 'Javanese'] as const
+		const exts = scripts.map((x) => `\\p{scx=${x}}` as const)
+		const fullWidthPunct = '！＂＃＄％＆＇（）＊＋，－．／：；＜＝＞？［＼］＾＿｀｛｜｝～'
+		const nonSpaceDelimitedPunct = '\\-/'
+		const alreadySpace = '\\p{White_Space}'
+		const source = `[${[nonSpaceDelimitedPunct, alreadySpace, ...exts, fullWidthPunct].join('')}]` as const
+
+		assertEquals(NON_SPACE_DELIMITED, new RegExp(source, 'u'))
+	})
+
+	function assertNumNonSpaceDelimitedMatches(str: string, expected: number) {
+		const globalRe = new RegExp(NON_SPACE_DELIMITED.source, NON_SPACE_DELIMITED.flags + 'g')
+		const numMatches = [...str.matchAll(globalRe)].length
+		assertEquals(numMatches, expected)
+	}
+
+	await t.step('matches Chinese', () => {
+		assertNumNonSpaceDelimitedMatches('文字、标点符号', 7)
+	})
+
+	await t.step('doesn’t match empty string', () => {
+		assertNumNonSpaceDelimitedMatches('', 0)
+	})
+
+	await t.step('doesn’t match Latin alphabet', () => {
+		assertNumNonSpaceDelimitedMatches('abcdefg', 0)
+	})
+
+	await t.step('matches text that’s already whitespace itself', () => {
+		const nbsp = '\xa0'
+		assertNumNonSpaceDelimitedMatches(` \t\n${nbsp}`, 4)
+	})
+})