diff --git a/.changeset/rotten-timers-worry.md b/.changeset/rotten-timers-worry.md new file mode 100644 index 000000000..0d50492e5 --- /dev/null +++ b/.changeset/rotten-timers-worry.md @@ -0,0 +1,6 @@ +--- +'myst-transforms': patch +'myst-cli': patch +--- + +Improve HTML transforms for grouping and processing diff --git a/packages/myst-cli/src/process/mdast.ts b/packages/myst-cli/src/process/mdast.ts index 933f57f2c..f60a512b4 100644 --- a/packages/myst-cli/src/process/mdast.ts +++ b/packages/myst-cli/src/process/mdast.ts @@ -24,6 +24,7 @@ import { joinGatesPlugin, glossaryPlugin, abbreviationPlugin, + reconstructHtmlPlugin, } from 'myst-transforms'; import { unified } from 'unified'; import { VFile } from 'vfile'; @@ -164,9 +165,10 @@ export async function transformMdast( liftCodeMetadataToBlock(session, vfile, mdast); const pipe = unified() + .use(reconstructHtmlPlugin) // We need to group and link the HTML first + .use(htmlPlugin, { htmlHandlers }) // Some of the HTML plugins need to operate on the transformed html, e.g. figure caption transforms .use(basicTransformationsPlugin) .use(inlineExpressionsPlugin) // Happens before math and images! - .use(htmlPlugin, { htmlHandlers }) .use(mathPlugin, { macros: frontmatter.math }) .use(glossaryPlugin, { state }) // This should be before the enumerate plugins .use(abbreviationPlugin, { abbreviations: frontmatter.abbreviations }) diff --git a/packages/myst-transforms/src/html.spec.ts b/packages/myst-transforms/src/html.spec.ts index 25ece935f..20f52e366 100644 --- a/packages/myst-transforms/src/html.spec.ts +++ b/packages/myst-transforms/src/html.spec.ts @@ -1,5 +1,5 @@ import { describe, expect, test } from 'vitest'; -import { reconstructHtmlTransform } from './html'; +import { htmlTransform, reconstructHtmlTransform } from './html'; describe('Test reconstructHtmlTransform', () => { test('tree without html returns self', async () => { @@ -162,4 +162,178 @@ describe('Test reconstructHtmlTransform', () => { children: [{ type: 'html', value: '' }], }); }); + test('self-closing tags', async () => { + const mdast = { + type: 'root', + children: [ + { + type: 'html', + value: '', + }, + { + type: 'html', + value: '', + }, + { + type: 'html', + value: '
', + }, + { + type: 'html', + value: '
', + }, + { type: 'html', value: '
' }, + ], + }; + reconstructHtmlTransform(mdast); + expect(mdast).toEqual({ + type: 'root', + children: [ + { + type: 'html', + value: + '\n
\n
', + }, + ], + }); + }); + test('figure captions', async () => { + const mdast = { + type: 'root', + children: [ + { + type: 'html', + value: '
', + }, + { + type: 'html', + value: '', + }, + { + type: 'html', + value: '
', + }, + { + type: 'text', + value: 'my caption', + }, + { + type: 'html', + value: '
', + }, + { + type: 'html', + value: '
', + }, + ], + }; + reconstructHtmlTransform(mdast); + expect(mdast).toEqual({ + type: 'root', + children: [ + { + type: 'html', + value: + '
\n
my caption
', + }, + ], + }); + htmlTransform(mdast); + expect(mdast).toEqual({ + type: 'root', + children: [ + { + type: 'container', + children: [ + { type: 'image', url: 'img.png', class: 'big', identifier: 'my-img', label: 'my-img' }, + { type: 'caption', children: [{ type: 'text', value: 'my caption' }] }, + ], + }, + ], + }); + }); + test('no paragraph when in a paragraph', async () => { + const mdast = { + type: 'root', + children: [ + { + type: 'paragraph', + children: [ + { + type: 'text', + value: 'See ', + }, + { + type: 'html', + value: '', + }, + { + type: 'text', + value: 'here', + }, + { + type: 'html', + value: '', + }, + { + type: 'text', + value: '.', + }, + ], + }, + ], + }; + reconstructHtmlTransform(mdast); + expect(mdast).toEqual({ + type: 'root', + children: [ + { + type: 'paragraph', + children: [ + { + type: 'text', + value: 'See ', + }, + { + type: 'html', + value: 'here', + }, + { + type: 'text', + value: '.', + }, + ], + }, + ], + }); + htmlTransform(mdast); + expect(mdast).toEqual({ + type: 'root', + children: [ + { + type: 'paragraph', + children: [ + { + type: 'text', + value: 'See ', + }, + { + type: 'link', + url: 'link.html', + children: [ + { + type: 'text', + value: 'here', + }, + ], + }, + { + type: 'text', + value: '.', + }, + ], + }, + ], + }); + }); }); diff --git a/packages/myst-transforms/src/html.ts b/packages/myst-transforms/src/html.ts index f3093d018..fe35655e8 100644 --- a/packages/myst-transforms/src/html.ts +++ b/packages/myst-transforms/src/html.ts @@ -1,6 +1,6 @@ import { unified } from 'unified'; import type { Plugin } from 'unified'; -import { liftChildren } from 'myst-common'; +import { liftChildren, normalizeLabel } from 'myst-common'; import type { GenericNode, GenericParent } from 'myst-common'; import type { Parent } from 'myst-spec'; import { mystToHtml } from 'myst-to-html'; @@ -20,6 +20,17 @@ export type HtmlTransformOptions = { htmlHandlers?: { [x: string]: Handle }; }; +function addClassAndIdentifier(node: GenericNode, attrs: Record = {}) { + const props = node.properties ?? {}; + if (props.id) { + const normalized = normalizeLabel(props.id); + if (normalized?.identifier) attrs.identifier = normalized.identifier; + if (normalized?.label) attrs.label = normalized.label; + } + if (props.className) attrs.class = props.className.join(' '); + return attrs; +} + const defaultHtmlToMdastOptions: Record = { keepBreaks: true, htmlHandlers: { @@ -34,6 +45,27 @@ const defaultHtmlToMdastOptions: Record = { _brKeep(h: H, node: any) { return h(node, '_break'); }, + a(h: H, node: any) { + const attrs = addClassAndIdentifier(node); + attrs.url = String(node.properties.href || ''); + if (node.properties.title) attrs.title = node.properties.title; + if (node.properties.className) attrs.class = node.properties.className.join(' '); + return h(node, 'link', attrs, all(h, node)); + }, + img(h: H, node: any) { + const attrs = addClassAndIdentifier(node); + attrs.url = String(node.properties.src || ''); + if (node.properties.title) attrs.title = node.properties.title; + if (node.properties.alt) attrs.alt = node.properties.alt; + return h(node, 'image', attrs); + }, + figure(h: H, node: any) { + const attrs = addClassAndIdentifier(node); + return h(node, 'container', attrs, all(h, node)); + }, + figcaption(h: H, node: any) { + return h(node, 'caption', all(h, node)); + }, comment(h: any, node: any) { // Prevents HTML comments from showing up as text in web const result = h(node, 'comment'); @@ -66,6 +98,12 @@ export function htmlTransform(tree: GenericParent, opts?: HtmlTransformOptions) node.children = mdast.children as Parent[]; visit(node, (n: any) => delete n.position); }); + selectAll('paragraph > htmlParsed', tree).forEach((parsed) => { + const node = parsed as GenericParent; + if (node?.children?.length === 1 && node.children[0].type === 'paragraph') { + node.children = node.children[0].children as GenericNode[]; + } + }); liftChildren(tree, 'htmlParsed'); selectAll('_break', tree).forEach((node: any) => { node.type = 'break'; @@ -103,17 +141,41 @@ function finalizeNode(htmlOpenNodeWithChildren: GenericParent, htmlCloseNode: Ge delete (htmlOpenNodeWithChildren as GenericNode).children; } +// https://html.spec.whatwg.org/multipage/syntax.html#elements-2 +const HTML_EMPTY_ELEMENTS = [ + 'area', + 'base', + 'br', + 'col', + 'embed', + 'hr', + 'img', + 'input', + 'keygen', + 'link', + 'meta', + 'param', + 'source', + 'track', + 'wbr', +]; + function reconstructHtml(tree: GenericParent) { const htmlOpenNodes: GenericParent[] = []; tree.children.forEach((child: GenericNode) => { if (child.type === 'html') { const value = child.value?.trim(); - if (value?.startsWith('')) || + value?.match(new RegExp(`<(${HTML_EMPTY_ELEMENTS.join('|')})([^>]*)?/?>`)); + if (selfClosing) { + if (htmlOpenNodes.length) { + htmlOpenNodes[htmlOpenNodes.length - 1].children.push(child); + } + } else if (value?.startsWith(' = () => (tree) => { + reconstructHtmlTransform(tree); +}; + export const htmlPlugin: Plugin<[HtmlTransformOptions?], GenericParent, GenericParent> = (opts) => (tree) => { htmlTransform(tree, opts); diff --git a/packages/myst-transforms/src/index.ts b/packages/myst-transforms/src/index.ts index 212b397ad..a388926e8 100644 --- a/packages/myst-transforms/src/index.ts +++ b/packages/myst-transforms/src/index.ts @@ -6,7 +6,12 @@ export { } from './admonitions.js'; export { captionParagraphPlugin, captionParagraphTransform } from './caption.js'; export { footnotesPlugin, footnotesTransform } from './footnotes.js'; -export { htmlPlugin, htmlTransform, reconstructHtmlTransform } from './html.js'; +export { + htmlPlugin, + htmlTransform, + reconstructHtmlTransform, + reconstructHtmlPlugin, +} from './html.js'; export { htmlIdsPlugin, htmlIdsTransform } from './htmlIds.js'; export { keysPlugin, keysTransform } from './keys.js'; export { diff --git a/packages/myst-transforms/tests/html.yml b/packages/myst-transforms/tests/html.yml index 2a02aa8cf..ffab869f7 100644 --- a/packages/myst-transforms/tests/html.yml +++ b/packages/myst-transforms/tests/html.yml @@ -41,7 +41,6 @@ cases: - type: image url: example.jpg title: example - alt: '' - title: table before: type: root