diff --git a/.changeset/rotten-timers-worry.md b/.changeset/rotten-timers-worry.md
new file mode 100644
index 000000000..0d50492e5
--- /dev/null
+++ b/.changeset/rotten-timers-worry.md
@@ -0,0 +1,6 @@
+---
+'myst-transforms': patch
+'myst-cli': patch
+---
+
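+Improve the HTML transforms: group sibling HTML nodes (including self-closing tags) before processing, and convert links, images, and figures to MyST nodes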
diff --git a/packages/myst-cli/src/process/mdast.ts b/packages/myst-cli/src/process/mdast.ts
index 933f57f2c..f60a512b4 100644
--- a/packages/myst-cli/src/process/mdast.ts
+++ b/packages/myst-cli/src/process/mdast.ts
@@ -24,6 +24,7 @@ import {
joinGatesPlugin,
glossaryPlugin,
abbreviationPlugin,
+ reconstructHtmlPlugin,
} from 'myst-transforms';
import { unified } from 'unified';
import { VFile } from 'vfile';
@@ -164,9 +165,10 @@ export async function transformMdast(
liftCodeMetadataToBlock(session, vfile, mdast);
const pipe = unified()
+ .use(reconstructHtmlPlugin) // We need to group and link the HTML first
+ .use(htmlPlugin, { htmlHandlers }) // Some of the HTML plugins need to operate on the transformed html, e.g. figure caption transforms
.use(basicTransformationsPlugin)
.use(inlineExpressionsPlugin) // Happens before math and images!
- .use(htmlPlugin, { htmlHandlers })
.use(mathPlugin, { macros: frontmatter.math })
.use(glossaryPlugin, { state }) // This should be before the enumerate plugins
.use(abbreviationPlugin, { abbreviations: frontmatter.abbreviations })
diff --git a/packages/myst-transforms/src/html.spec.ts b/packages/myst-transforms/src/html.spec.ts
index 25ece935f..20f52e366 100644
--- a/packages/myst-transforms/src/html.spec.ts
+++ b/packages/myst-transforms/src/html.spec.ts
@@ -1,5 +1,5 @@
import { describe, expect, test } from 'vitest';
-import { reconstructHtmlTransform } from './html';
+import { htmlTransform, reconstructHtmlTransform } from './html';
describe('Test reconstructHtmlTransform', () => {
test('tree without html returns self', async () => {
@@ -162,4 +162,178 @@ describe('Test reconstructHtmlTransform', () => {
children: [{ type: 'html', value: '' }],
});
});
+ test('self-closing tags', async () => { // five sibling html nodes go in; a single merged html node is expected out
+ const mdast = {
+ type: 'root',
+ children: [
+ {
+ type: 'html',
+ value: '',
+ },
+ {
+ type: 'html',
+ value: '
',
+ },
+ {
+ type: 'html',
+ value: '
',
+ },
+ {
+ type: 'html',
+ value: '
',
+ },
+ { type: 'html', value: '' },
+ ],
+ };
+ reconstructHtmlTransform(mdast); // the tree is modified in place; the assertion below inspects the same object
+ expect(mdast).toEqual({
+ type: 'root',
+ children: [
+ {
+ type: 'html',
+ value:
+ '
\n
\n
',
+ },
+ ],
+ });
+ });
+ test('figure captions', async () => { // html and text siblings are first merged, then parsed into a container node
+ const mdast = {
+ type: 'root',
+ children: [
+ {
+ type: 'html',
+ value: '',
+ },
+ {
+ type: 'html',
+ value: '
',
+ },
+ {
+ type: 'html',
+ value: '',
+ },
+ {
+ type: 'text',
+ value: 'my caption',
+ },
+ {
+ type: 'html',
+ value: '',
+ },
+ {
+ type: 'html',
+ value: '',
+ },
+ ],
+ };
+ reconstructHtmlTransform(mdast); // step 1: the html and text siblings are expected to collapse into one html node
+ expect(mdast).toEqual({
+ type: 'root',
+ children: [
+ {
+ type: 'html',
+ value:
+ '
\nmy caption',
+ },
+ ],
+ });
+ htmlTransform(mdast); // step 2: the merged html is expected to become a container holding an image and a caption
+ expect(mdast).toEqual({
+ type: 'root',
+ children: [
+ {
+ type: 'container',
+ children: [
+ { type: 'image', url: 'img.png', class: 'big', identifier: 'my-img', label: 'my-img' },
+ { type: 'caption', children: [{ type: 'text', value: 'my caption' }] },
+ ],
+ },
+ ],
+ });
+ });
+ test('no paragraph when in a paragraph', async () => { // inline html inside a paragraph must not produce a nested paragraph
+ const mdast = {
+ type: 'root',
+ children: [
+ {
+ type: 'paragraph',
+ children: [
+ {
+ type: 'text',
+ value: 'See ',
+ },
+ {
+ type: 'html',
+ value: '',
+ },
+ {
+ type: 'text',
+ value: 'here',
+ },
+ {
+ type: 'html',
+ value: '',
+ },
+ {
+ type: 'text',
+ value: '.',
+ },
+ ],
+ },
+ ],
+ };
+ reconstructHtmlTransform(mdast); // step 1: the html/text/html run is expected to merge into one html node between the outer text nodes
+ expect(mdast).toEqual({
+ type: 'root',
+ children: [
+ {
+ type: 'paragraph',
+ children: [
+ {
+ type: 'text',
+ value: 'See ',
+ },
+ {
+ type: 'html',
+ value: 'here',
+ },
+ {
+ type: 'text',
+ value: '.',
+ },
+ ],
+ },
+ ],
+ });
+ htmlTransform(mdast); // step 2: the html node is expected to become a link that sits directly in the existing paragraph
+ expect(mdast).toEqual({
+ type: 'root',
+ children: [
+ {
+ type: 'paragraph',
+ children: [
+ {
+ type: 'text',
+ value: 'See ',
+ },
+ {
+ type: 'link',
+ url: 'link.html',
+ children: [
+ {
+ type: 'text',
+ value: 'here',
+ },
+ ],
+ },
+ {
+ type: 'text',
+ value: '.',
+ },
+ ],
+ },
+ ],
+ });
+ });
});
diff --git a/packages/myst-transforms/src/html.ts b/packages/myst-transforms/src/html.ts
index f3093d018..fe35655e8 100644
--- a/packages/myst-transforms/src/html.ts
+++ b/packages/myst-transforms/src/html.ts
@@ -1,6 +1,6 @@
import { unified } from 'unified';
import type { Plugin } from 'unified';
-import { liftChildren } from 'myst-common';
+import { liftChildren, normalizeLabel } from 'myst-common';
import type { GenericNode, GenericParent } from 'myst-common';
import type { Parent } from 'myst-spec';
import { mystToHtml } from 'myst-to-html';
@@ -20,6 +20,17 @@ export type HtmlTransformOptions = {
htmlHandlers?: { [x: string]: Handle };
};
+/**
+ * Copy the `id` and `className` of a parsed HTML element onto mdast attributes.
+ *
+ * @param node - element whose `properties` are read; a missing `properties` is treated as empty
+ * @param attrs - attribute object to extend; it is mutated and returned
+ * @returns `attrs`, with `identifier`/`label` taken from the normalized `id` and
+ *   `class` taken from `className`
+ */
+function addClassAndIdentifier(
+  node: GenericNode,
+  attrs: Record<string, any> = {},
+): Record<string, any> {
+  const { id, className } = node.properties ?? {};
+  if (id) {
+    const normalized = normalizeLabel(id);
+    if (normalized?.identifier) attrs.identifier = normalized.identifier;
+    if (normalized?.label) attrs.label = normalized.label;
+  }
+  if (className) {
+    // Class names normally arrive as an array; tolerate a plain string instead of throwing on `.join`.
+    attrs.class = Array.isArray(className) ? className.join(' ') : String(className);
+  }
+  return attrs;
+}
+
const defaultHtmlToMdastOptions: Record = {
keepBreaks: true,
htmlHandlers: {
@@ -34,6 +45,27 @@ const defaultHtmlToMdastOptions: Record = {
_brKeep(h: H, node: any) {
return h(node, '_break');
},
+ /**
+  * Convert an `<a>` element into a `link` node.
+  * `identifier`, `label` and `class` are filled in by `addClassAndIdentifier`, so they are not set again here.
+  */
+ a(h: H, node: any) {
+   const attrs = addClassAndIdentifier(node);
+   // A missing `href` becomes an empty url rather than the string "undefined".
+   attrs.url = String(node.properties.href || '');
+   if (node.properties.title) attrs.title = node.properties.title;
+   return h(node, 'link', attrs, all(h, node));
+ },
+ /**
+  * Convert an `<img>` element into an `image` node.
+  * `title` and `alt` are copied only when they are truthy, so an empty `alt` is omitted.
+  */
+ img(h: H, node: any) {
+   const imageAttrs = addClassAndIdentifier(node);
+   const { src, title, alt } = node.properties;
+   imageAttrs.url = String(src || '');
+   if (title) {
+     imageAttrs.title = title;
+   }
+   if (alt) {
+     imageAttrs.alt = alt;
+   }
+   return h(node, 'image', imageAttrs);
+ },
+ /** Convert a `<figure>` element into a `container` node that keeps the element's id and classes. */
+ figure(h: H, node: any) {
+   return h(node, 'container', addClassAndIdentifier(node), all(h, node));
+ },
+ /** Convert a `<figcaption>` element into a `caption` node wrapping its converted children. */
+ figcaption(h: H, node: any) {
+   const captionChildren = all(h, node);
+   return h(node, 'caption', captionChildren);
+ },
comment(h: any, node: any) {
// Prevents HTML comments from showing up as text in web
const result = h(node, 'comment');
@@ -66,6 +98,12 @@ export function htmlTransform(tree: GenericParent, opts?: HtmlTransformOptions)
node.children = mdast.children as Parent[];
visit(node, (n: any) => delete n.position);
});
+ selectAll('paragraph > htmlParsed', tree).forEach((parsed) => {
+ const node = parsed as GenericParent;
+ if (node?.children?.length === 1 && node.children[0].type === 'paragraph') {
+ node.children = node.children[0].children as GenericNode[];
+ }
+ });
liftChildren(tree, 'htmlParsed');
selectAll('_break', tree).forEach((node: any) => {
node.type = 'break';
@@ -103,17 +141,41 @@ function finalizeNode(htmlOpenNodeWithChildren: GenericParent, htmlCloseNode: Ge
delete (htmlOpenNodeWithChildren as GenericNode).children;
}
+// Tag names that are matched as self-closing when sibling HTML nodes are grouped.
+// https://html.spec.whatwg.org/multipage/syntax.html#elements-2
+const HTML_EMPTY_ELEMENTS =
+  'area base br col embed hr img input keygen link meta param source track wbr'.split(' ');
+
function reconstructHtml(tree: GenericParent) {
const htmlOpenNodes: GenericParent[] = [];
tree.children.forEach((child: GenericNode) => {
if (child.type === 'html') {
const value = child.value?.trim();
- if (value?.startsWith('')) {
+ const selfClosing =
+ (value?.startsWith('<') && value?.endsWith('/>')) ||
+ value?.match(new RegExp(`<(${HTML_EMPTY_ELEMENTS.join('|')})([^>]*)?/?>`));
+ if (selfClosing) {
+ if (htmlOpenNodes.length) {
+ htmlOpenNodes[htmlOpenNodes.length - 1].children.push(child);
+ }
+ } else if (value?.startsWith('')) {
// In this case, child is a standalone closing html node
const htmlOpenNode = htmlOpenNodes.pop();
- if (!htmlOpenNode) {
- return;
- }
+ if (!htmlOpenNode) return;
finalizeNode(htmlOpenNode, child);
if (htmlOpenNodes.length) {
htmlOpenNodes[htmlOpenNodes.length - 1].children.push(htmlOpenNode);
@@ -152,6 +214,10 @@ export function reconstructHtmlTransform(tree: GenericParent) {
return tree;
}
+/** Unified plugin wrapper that runs `reconstructHtmlTransform` on the tree; it takes no options. */
+export const reconstructHtmlPlugin: Plugin<[], GenericParent, GenericParent> = () => {
+  return (tree) => {
+    reconstructHtmlTransform(tree);
+  };
+};
+
export const htmlPlugin: Plugin<[HtmlTransformOptions?], GenericParent, GenericParent> =
(opts) => (tree) => {
htmlTransform(tree, opts);
diff --git a/packages/myst-transforms/src/index.ts b/packages/myst-transforms/src/index.ts
index 212b397ad..a388926e8 100644
--- a/packages/myst-transforms/src/index.ts
+++ b/packages/myst-transforms/src/index.ts
@@ -6,7 +6,12 @@ export {
} from './admonitions.js';
export { captionParagraphPlugin, captionParagraphTransform } from './caption.js';
export { footnotesPlugin, footnotesTransform } from './footnotes.js';
-export { htmlPlugin, htmlTransform, reconstructHtmlTransform } from './html.js';
+export {
+ htmlPlugin,
+ htmlTransform,
+ reconstructHtmlTransform,
+ reconstructHtmlPlugin,
+} from './html.js';
export { htmlIdsPlugin, htmlIdsTransform } from './htmlIds.js';
export { keysPlugin, keysTransform } from './keys.js';
export {
diff --git a/packages/myst-transforms/tests/html.yml b/packages/myst-transforms/tests/html.yml
index 2a02aa8cf..ffab869f7 100644
--- a/packages/myst-transforms/tests/html.yml
+++ b/packages/myst-transforms/tests/html.yml
@@ -41,7 +41,6 @@ cases:
- type: image
url: example.jpg
title: example
- alt: ''
- title: table
before:
type: root