Skip to content

Commit

Permalink
πŸ‘©β€πŸ’» Improve HTML processing
Browse files Browse the repository at this point in the history
  • Loading branch information
rowanc1 committed Oct 16, 2023
1 parent 45e79dd commit 93cf5ae
Show file tree
Hide file tree
Showing 6 changed files with 261 additions and 9 deletions.
6 changes: 6 additions & 0 deletions .changeset/rotten-timers-worry.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
---
'myst-transforms': patch
'myst-cli': patch
---

Improve HTML transforms for grouping and processing
4 changes: 3 additions & 1 deletion packages/myst-cli/src/process/mdast.ts
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ import {
joinGatesPlugin,
glossaryPlugin,
abbreviationPlugin,
reconstructHtmlPlugin,
} from 'myst-transforms';
import { unified } from 'unified';
import { VFile } from 'vfile';
Expand Down Expand Up @@ -164,9 +165,10 @@ export async function transformMdast(
liftCodeMetadataToBlock(session, vfile, mdast);

const pipe = unified()
.use(reconstructHtmlPlugin) // We need to group and link the HTML first
.use(htmlPlugin, { htmlHandlers }) // Some of the HTML plugins need to operate on the transformed html, e.g. figure caption transforms
.use(basicTransformationsPlugin)
.use(inlineExpressionsPlugin) // Happens before math and images!
.use(htmlPlugin, { htmlHandlers })
.use(mathPlugin, { macros: frontmatter.math })
.use(glossaryPlugin, { state }) // This should be before the enumerate plugins
.use(abbreviationPlugin, { abbreviations: frontmatter.abbreviations })
Expand Down
176 changes: 175 additions & 1 deletion packages/myst-transforms/src/html.spec.ts
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import { describe, expect, test } from 'vitest';
import { reconstructHtmlTransform } from './html';
import { htmlTransform, reconstructHtmlTransform } from './html';

describe('Test reconstructHtmlTransform', () => {
test('tree without html returns self', async () => {
Expand Down Expand Up @@ -162,4 +162,178 @@ describe('Test reconstructHtmlTransform', () => {
children: [{ type: 'html', value: '<script>alert("error")</script>' }],
});
});
// Void / self-closing tags that sit between an opening and a closing tag
// should be folded into the enclosing element's reconstructed html.
test('self-closing tags', async () => {
  const htmlNode = (value: string) => ({ type: 'html', value });
  const tree = {
    type: 'root',
    children: [
      htmlNode('<a href="https://mystmd.org">'),
      htmlNode('<img src="https://mystmd.org/logo.png" />'),
      htmlNode('<hr>'),
      htmlNode('<br>'),
      htmlNode('</a>'),
    ],
  };

  // The transform mutates the tree in place.
  reconstructHtmlTransform(tree);

  // All five fragments collapse into one html node; the explicit ` />` on the
  // image is not preserved and the void elements are separated by newlines.
  const expected = {
    type: 'root',
    children: [
      htmlNode(
        '<a href="https://mystmd.org"><img src="https://mystmd.org/logo.png">\n<hr>\n<br></a>',
      ),
    ],
  };
  expect(tree).toEqual(expected);
});
// A <figure> split over several html/text nodes is first stitched back into a
// single html node and then converted to a container with image + caption.
test('figure captions', async () => {
  const leaf = (type: string, value: string) => ({ type, value });
  const tree = {
    type: 'root',
    children: [
      leaf('html', '<figure>'),
      leaf('html', '<img src="img.png" class="big" id="my-img">'),
      leaf('html', '<figcaption>'),
      leaf('text', 'my caption'),
      leaf('html', '</figcaption>'),
      leaf('html', '</figure>'),
    ],
  };

  // Step 1: grouping. Everything from <figure> to </figure> becomes one html node.
  reconstructHtmlTransform(tree);
  const afterReconstruct = {
    type: 'root',
    children: [
      leaf(
        'html',
        '<figure><img src="img.png" class="big" id="my-img">\n<figcaption>my caption</figcaption></figure>',
      ),
    ],
  };
  expect(tree).toEqual(afterReconstruct);

  // Step 2: conversion. The `class` and `id` of the image are carried over as
  // `class` and `identifier`/`label` on the resulting image node.
  htmlTransform(tree);
  const afterHtml = {
    type: 'root',
    children: [
      {
        type: 'container',
        children: [
          { type: 'image', url: 'img.png', class: 'big', identifier: 'my-img', label: 'my-img' },
          { type: 'caption', children: [leaf('text', 'my caption')] },
        ],
      },
    ],
  };
  expect(tree).toEqual(afterHtml);
});
// Inline html inside a paragraph must be converted in place: the resulting
// link stays a direct child of the paragraph, with no extra paragraph wrapper.
test('no paragraph when in a paragraph', async () => {
  const text = (value: string) => ({ type: 'text', value });
  const html = (value: string) => ({ type: 'html', value });
  const tree = {
    type: 'root',
    children: [
      {
        type: 'paragraph',
        children: [
          text('See '),
          html('<a href="link.html">'),
          text('here'),
          html('</a>'),
          text('.'),
        ],
      },
    ],
  };

  // Step 1: the open tag, its text content and the close tag merge into one html node.
  reconstructHtmlTransform(tree);
  const afterReconstruct = {
    type: 'root',
    children: [
      {
        type: 'paragraph',
        children: [text('See '), html('<a href="link.html">here</a>'), text('.')],
      },
    ],
  };
  expect(tree).toEqual(afterReconstruct);

  // Step 2: the html node becomes a link that sits directly between the text nodes.
  htmlTransform(tree);
  const afterHtml = {
    type: 'root',
    children: [
      {
        type: 'paragraph',
        children: [
          text('See '),
          { type: 'link', url: 'link.html', children: [text('here')] },
          text('.'),
        ],
      },
    ],
  };
  expect(tree).toEqual(afterHtml);
});
});
76 changes: 71 additions & 5 deletions packages/myst-transforms/src/html.ts
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import { unified } from 'unified';
import type { Plugin } from 'unified';
import { liftChildren } from 'myst-common';
import { liftChildren, normalizeLabel } from 'myst-common';
import type { GenericNode, GenericParent } from 'myst-common';
import type { Parent } from 'myst-spec';
import { mystToHtml } from 'myst-to-html';
Expand All @@ -20,6 +20,17 @@ export type HtmlTransformOptions = {
htmlHandlers?: { [x: string]: Handle };
};

/**
 * Copy the `id` and `class` of a parsed HTML element onto a set of mdast attributes.
 *
 * @param node - element to inspect; a missing `properties` object is treated as empty
 * @param attrs - attribute bag to fill in; a new object is created when omitted
 * @returns the same `attrs` object. `identifier` and `label` are set from the
 *   result of `normalizeLabel(id)` when they are truthy, and `class` is the
 *   space-joined `className` list.
 */
function addClassAndIdentifier(node: GenericNode, attrs: Record<string, string> = {}) {
  const { id, className } = node.properties ?? {};
  if (id) {
    const labelInfo = normalizeLabel(id);
    const identifier = labelInfo?.identifier;
    const label = labelInfo?.label;
    if (identifier) {
      attrs.identifier = identifier;
    }
    if (label) {
      attrs.label = label;
    }
  }
  if (className) {
    attrs.class = className.join(' ');
  }
  return attrs;
}

const defaultHtmlToMdastOptions: Record<keyof HtmlTransformOptions, any> = {
keepBreaks: true,
htmlHandlers: {
Expand All @@ -34,6 +45,27 @@ const defaultHtmlToMdastOptions: Record<keyof HtmlTransformOptions, any> = {
_brKeep(h: H, node: any) {
return h(node, '_break');
},
/**
 * Convert an HTML `<a>` element into a `link` node.
 *
 * `href` becomes `url` (an empty string when absent) and a truthy `title`
 * is copied over; the element's children are converted with `all`.
 */
a(h: H, node: any) {
  // `addClassAndIdentifier` already maps `id` and `className` onto the
  // attributes, so `class` does not need to be assigned a second time here.
  const attrs = addClassAndIdentifier(node);
  attrs.url = String(node.properties.href || '');
  if (node.properties.title) attrs.title = node.properties.title;
  return h(node, 'link', attrs, all(h, node));
},
/**
 * Convert an HTML `<img>` element into an `image` node.
 *
 * `src` becomes `url` (an empty string when absent); `title` and `alt` are
 * only copied when truthy, so an empty `alt` is omitted from the result.
 */
img(h: H, node: any) {
  const imageProps = addClassAndIdentifier(node);
  const { src, title, alt } = node.properties;
  imageProps.url = String(src || '');
  if (title) {
    imageProps.title = title;
  }
  if (alt) {
    imageProps.alt = alt;
  }
  return h(node, 'image', imageProps);
},
/**
 * Convert an HTML `<figure>` element into a `container` node, keeping its
 * class/identifier attributes and converting its children with `all`.
 */
figure(h: H, node: any) {
  const containerProps = addClassAndIdentifier(node);
  const children = all(h, node);
  return h(node, 'container', containerProps, children);
},
/** Convert an HTML `<figcaption>` element into a `caption` node wrapping its converted children. */
figcaption(h: H, node: any) {
  const children = all(h, node);
  return h(node, 'caption', children);
},
comment(h: any, node: any) {
// Prevents HTML comments from showing up as text in web
const result = h(node, 'comment');
Expand Down Expand Up @@ -66,6 +98,12 @@ export function htmlTransform(tree: GenericParent, opts?: HtmlTransformOptions)
node.children = mdast.children as Parent[];
visit(node, (n: any) => delete n.position);
});
selectAll('paragraph > htmlParsed', tree).forEach((parsed) => {
const node = parsed as GenericParent;
if (node?.children?.length === 1 && node.children[0].type === 'paragraph') {
node.children = node.children[0].children as GenericNode[];
}
});
liftChildren(tree, 'htmlParsed');
selectAll('_break', tree).forEach((node: any) => {
node.type = 'break';
Expand Down Expand Up @@ -103,17 +141,41 @@ function finalizeNode(htmlOpenNodeWithChildren: GenericParent, htmlCloseNode: Ge
delete (htmlOpenNodeWithChildren as GenericNode).children;
}

/**
 * Tag names treated as "empty" (void) elements, i.e. tags that stand alone
 * without a matching closing tag.
 *
 * `reconstructHtml` joins these names with `|` to build the pattern that
 * recognises such tags, so entries must be plain lowercase tag names.
 *
 * Reference: https://html.spec.whatwg.org/multipage/syntax.html#elements-2
 * NOTE(review): `keygen` and `param` may no longer be in the current version
 * of that list; presumably kept for legacy markup, confirm before removing.
 */
// prettier-ignore
const HTML_EMPTY_ELEMENTS = [
  'area', 'base', 'br', 'col', 'embed',
  'hr', 'img', 'input', 'keygen', 'link',
  'meta', 'param', 'source', 'track', 'wbr',
];

function reconstructHtml(tree: GenericParent) {
const htmlOpenNodes: GenericParent[] = [];
tree.children.forEach((child: GenericNode) => {
if (child.type === 'html') {
const value = child.value?.trim();
if (value?.startsWith('</')) {
const selfClosing =
(value?.startsWith('<') && value?.endsWith('/>')) ||
value?.match(new RegExp(`<(${HTML_EMPTY_ELEMENTS.join('|')})([^>]*)?/?>`));
if (selfClosing) {
if (htmlOpenNodes.length) {
htmlOpenNodes[htmlOpenNodes.length - 1].children.push(child);
}
} else if (value?.startsWith('</')) {
// In this case, child is a standalone closing html node
const htmlOpenNode = htmlOpenNodes.pop();
if (!htmlOpenNode) {
return;
}
if (!htmlOpenNode) return;
finalizeNode(htmlOpenNode, child);
if (htmlOpenNodes.length) {
htmlOpenNodes[htmlOpenNodes.length - 1].children.push(htmlOpenNode);
Expand Down Expand Up @@ -152,6 +214,10 @@ export function reconstructHtmlTransform(tree: GenericParent) {
return tree;
}

/**
 * unified plugin wrapper around `reconstructHtmlTransform`.
 *
 * Takes no options. The returned transformer runs the transform on the tree
 * and returns nothing.
 */
export const reconstructHtmlPlugin: Plugin<[], GenericParent, GenericParent> = () => {
  return (tree) => {
    reconstructHtmlTransform(tree);
  };
};

export const htmlPlugin: Plugin<[HtmlTransformOptions?], GenericParent, GenericParent> =
(opts) => (tree) => {
htmlTransform(tree, opts);
Expand Down
7 changes: 6 additions & 1 deletion packages/myst-transforms/src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,12 @@ export {
} from './admonitions.js';
export { captionParagraphPlugin, captionParagraphTransform } from './caption.js';
export { footnotesPlugin, footnotesTransform } from './footnotes.js';
export { htmlPlugin, htmlTransform, reconstructHtmlTransform } from './html.js';
export {
htmlPlugin,
htmlTransform,
reconstructHtmlTransform,
reconstructHtmlPlugin,
} from './html.js';
export { htmlIdsPlugin, htmlIdsTransform } from './htmlIds.js';
export { keysPlugin, keysTransform } from './keys.js';
export {
Expand Down
1 change: 0 additions & 1 deletion packages/myst-transforms/tests/html.yml
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,6 @@ cases:
- type: image
url: example.jpg
title: example
alt: ''
- title: table
before:
type: root
Expand Down

0 comments on commit 93cf5ae

Please sign in to comment.