Skip to content

Commit

Permalink
Improve bench
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 31, 2025
2 parents 720f09a + 4ec1467 commit d487f46
Show file tree
Hide file tree
Showing 10 changed files with 484 additions and 185 deletions.
125 changes: 125 additions & 0 deletions benchmarks/overall/clean.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import re
import subprocess
import tempfile
from pathlib import Path

import latex2mathml.converter

from marker.renderers.markdown import MarkdownRenderer

class MarkdownCleaner:
    """Normalize markdown produced by different conversion methods so the
    benchmark can compare them fairly.

    Pipeline: pandoc round-trip normalization, math standardization,
    stray-HTML removal, whitespace/character cleanup.  The result is
    stripped and lowercased for case-insensitive scoring.
    """

    def __init__(self):
        pass

    def __call__(self, markdown):
        """Return a cleaned, lowercased version of ``markdown``.

        Requires the ``pandoc`` binary on PATH (see ``normalize_markdown``).
        """
        markdown = self.normalize_markdown(markdown)  # Use pandoc to normalize

        # Replace math expressions with latexml
        pattern = r'(?<!\\)\$(?:\$([^$]+)\$\$|\s*([^$\n]+?)\s*\$)'
        markdown = re.sub(pattern, self.standardize_math, markdown)

        # Replace image urls with a generic tag
        pattern = r'!\[(.*?)\]\((https?://[^\s\)]+)\)'
        markdown = re.sub(pattern, r'![link]', markdown)

        # Clean up stray html tags
        markdown = markdown.replace("<br>", "\n")
        markdown = re.sub(r"<sub>(.*?)</sub>", r"\1", markdown)
        markdown = re.sub(r"<sup>(.*?)</sup>", r"\1", markdown)
        markdown = re.sub(r"<span.*?>(.*?)</span>", r"\1", markdown)  # Remove span tags and keep content

        # Clean up markdown formatting.
        # FIX: collapse horizontal whitespace ([^\S\n] = any whitespace except
        # newline) before collapsing newline runs.  The previous order
        # (r"\s+" -> " " first) deleted every newline, which made the
        # subsequent r"\n+" substitution dead code.
        markdown = re.sub(r"[^\S\n]+", " ", markdown)
        markdown = re.sub(r"\n+", "\n", markdown)
        markdown = re.sub("\\.+", ".",
                          markdown)  # Replace repeated periods with a single period, like in table of contents
        markdown = re.sub("#+", "#", markdown)  # Replace repeated headers with a single header
        # FIX: decode literal escape sequences (e.g. the six characters
        # "\u00e9") without corrupting real non-ASCII text.  latin-1 +
        # backslashreplace turns non-latin-1 chars into \uXXXX escapes that
        # 'unicode-escape' restores; the previous plain utf-8 round-trip
        # produced mojibake ("é" -> "Ã©").
        markdown = markdown.encode('latin-1', errors='backslashreplace').decode('unicode-escape', errors="ignore")
        return markdown.strip().lower()

    @staticmethod
    def normalize_markdown(md_text: str) -> str:
        """Round-trip ``md_text`` through pandoc (md -> html -> md) to obtain
        a canonical markdown form.

        Raises ``subprocess.CalledProcessError`` if pandoc fails; requires
        the ``pandoc`` binary on PATH.
        """
        with tempfile.TemporaryDirectory() as tmp_dir:
            dirpath = Path(tmp_dir)
            input_file = dirpath / 'input.md'
            input_file.write_text(md_text, encoding='utf-8')

            # Markdown to HTML
            html_file = dirpath / 'temp.html'
            subprocess.run(
                [
                    'pandoc',
                    str(input_file),
                    '-f', 'markdown+tex_math_dollars',
                    '-t', 'html',
                    '-o', str(html_file),
                    '--quiet'
                ],
                check=True
            )

            # HTML to Markdown
            output_file = dirpath / 'output.md'
            subprocess.run(
                [
                    'pandoc',
                    str(html_file),
                    '-f', 'html',
                    '-t', 'markdown+tex_math_dollars',
                    '-o', str(output_file),
                    '--quiet'
                ],
                check=True
            )

            # Read back the normalized Markdown
            normalized_md = output_file.read_text(encoding='utf-8')

        return normalized_md

    def standardize_math(self, match):
        """``re.sub`` callback: convert display ($$...$$) math to MathML and
        clean inline ($...$) latex.  Falls back to the raw matched text on
        any conversion error so cleaning never hard-fails.
        """
        try:
            delim = "$$" if match.group(0).startswith('$$') else "$"
            math_content = match.group(1) or match.group(2)
            if delim == "$$":
                math_content = latex2mathml.converter.convert(math_content)
            else:
                math_content = self.clean_latex(math_content)
            return f'{delim}{math_content}{delim}'
        except Exception as e:
            print(f"Failed to standardize math expression: {match.group(0)} with error: {e}")
            return match.group(0)

    @staticmethod
    def clean_latex(latex_str):
        """Normalize simple inline latex: collapse whitespace, unwrap
        formatting commands (\\text, \\mathrm, ...), and map common
        operators to plain-text equivalents.
        """
        latex_str = re.sub(r'\s+', ' ', latex_str.strip())
        for tag in [r'\\text', r'\\mathrm', r'\\mathbf', r'\\textbf']:
            latex_str = re.sub(tag + r'\{([^}]+)\}', r'\1', latex_str)

        replacements = {
            '\\times': '*',
            '\\cdot': '*',
            '\\div': '/',
            '\\le': '<=',
            '\\ge': '>=',
            '\\neq': '!=',
            '\\to': '\\rightarrow',
        }

        for old, new in replacements.items():
            latex_str = latex_str.replace(old, new)

        return latex_str


def convert_to_md(html):
    """Render an HTML string to markdown using marker's MarkdownRenderer."""
    renderer = MarkdownRenderer()
    return renderer.md_cls.convert(html)

def clean_input(markdown):
    """Run the full MarkdownCleaner pipeline over ``markdown`` and return the result."""
    return MarkdownCleaner()(markdown)



32 changes: 16 additions & 16 deletions benchmarks/overall/inference.py
Original file line number Diff line number Diff line change
@@ -1,38 +1,37 @@
import tempfile
import time

from bs4 import BeautifulSoup

from benchmarks.overall.scoring import score_blocks
from benchmarks.overall.clean import clean_input
from benchmarks.overall.schema import BlockScores
from benchmarks.overall.scoring import score_blocks
from marker.converters.pdf import PdfConverter

def get_marker_html(marker_models: dict, pdf_bytes: bytes, use_llm: bool):
def get_marker_markdown(marker_models: dict, pdf_bytes: bytes, use_llm: bool):
block_converter = PdfConverter(
artifact_dict=marker_models,
config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm},
renderer="marker.renderers.html.HTMLRenderer"
config={"page_range": [0], "disable_tqdm": True, "use_llm": use_llm}
)

with tempfile.NamedTemporaryFile(suffix=".pdf", mode="wb") as f:
f.write(pdf_bytes)
rendered = block_converter(f.name)
html = rendered.html
soup = BeautifulSoup(html, "html.parser")
inner_html = str(soup.find("body").decode_contents())
return inner_html

return rendered.markdown


def marker_scoring_func(model_dict, sample, gt_html, use_llm=False, **kwargs) -> BlockScores:
def marker_scoring_func(model_dict, sample, gt_markdown, use_llm=False, **kwargs) -> BlockScores:
pdf_bytes = sample["pdf"] # This is a single page PDF
start = time.time()
marker_html = get_marker_html(model_dict, pdf_bytes, use_llm)
marker_md = get_marker_markdown(model_dict, pdf_bytes, use_llm)
marker_md = clean_input(marker_md)
total = time.time() - start
scores = score_blocks(gt_html, marker_html)
scores = score_blocks(gt_markdown, marker_md)
scores["time"] = total
scores["markdown"] = marker_md
return scores


def mathpix_scoring_func(model_dict, sample, gt_html, mathpix_ds=None, **kwargs) -> BlockScores:
def mathpix_scoring_func(model_dict, sample, gt_markdown, mathpix_ds=None, **kwargs) -> BlockScores:
uuid = sample["uuid"]
data = None
for row in mathpix_ds:
Expand All @@ -42,7 +41,8 @@ def mathpix_scoring_func(model_dict, sample, gt_html, mathpix_ds=None, **kwargs)
if not data:
raise ValueError(f"Could not find data for uuid {uuid}")

mathpix_md = data["md"]
scores = score_blocks(gt_html, mathpix_md, convert=False)
mathpix_md = clean_input(data["md"])
scores = score_blocks(gt_markdown, mathpix_md)
scores["time"] = data["time"]
scores["markdown"] = mathpix_md
return scores
12 changes: 11 additions & 1 deletion benchmarks/overall/overall.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,11 @@
import click
import datasets
import tabulate
from benchmarks.overall.render import build_dataset
from tqdm import tqdm
import pypdfium2 as pdfium

from benchmarks.overall.clean import convert_to_md, clean_input
from benchmarks.overall.inference import marker_scoring_func, mathpix_scoring_func
from benchmarks.overall.schema import FullResult
from marker.logger import configure_logging
Expand All @@ -32,7 +34,8 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f

try:
gt_html = [block["html"] for block in gt_blocks if len(block["html"]) > 0]
scores = score_func(model_dict, sample, gt_html, **kwargs)
gt_markdown = [clean_input(convert_to_md(block)) for block in gt_html]
scores = score_func(model_dict, sample, gt_markdown, **kwargs)
except ValueError as e:
print(f"Error with sample {idx}: {e}")
continue
Expand Down Expand Up @@ -101,12 +104,14 @@ def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="

@click.command(help="Benchmark PDF to MD conversion.")
@click.option("--dataset", type=str, help="Path to the benchmark dataset", default="datalab-to/marker_benchmark")
@click.option("--out_dataset", type=str, help="Path to the output dataset", default=None)
@click.option("--other_methods", type=str, help="Comma separated list of other methods to compare against. Possible values: mathpix", default="")
@click.option("--result_path", type=str, default=os.path.join(settings.OUTPUT_DIR, "benchmark", "overall"), help="Output path for results.")
@click.option("--max_rows", type=int, default=None, help="Maximum number of rows to process.")
@click.option("--use_llm", is_flag=True, help="Use the LLM model for better marker quality.")
def main(
dataset: str,
out_dataset: str,
other_methods: str,
result_path: str,
max_rows: int,
Expand Down Expand Up @@ -142,6 +147,11 @@ def main(

print(f"Results saved to {out_path}.")

# Push up comparison dataset
if out_dataset is not None:
out_ds = build_dataset(ds, all_scores)
out_ds.push_to_hub(out_dataset)

if __name__ == "__main__":
main()

109 changes: 109 additions & 0 deletions benchmarks/overall/render.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
import subprocess
import tempfile
import pypdfium2 as pdfium
from typing import Dict
from collections import defaultdict
import re
import io
import json

from PIL import Image
import datasets
import markdown2
from playwright.sync_api import sync_playwright

from benchmarks.overall.schema import FullResult

def convert_to_html(md: str):
    """Convert markdown to HTML while shielding $...$/$$...$$ math spans.

    markdown2 would mangle raw LaTeX, so each math span is swapped for an
    alphanumeric placeholder before rendering and restored afterwards
    (KaTeX later renders the restored delimiters client-side).
    """
    block_math = []
    inline_math = []

    def _stash(store, tag, inner, delim):
        # Placeholder is digits+letters only so markdown2 passes it through.
        token = f"1{tag}{len(store)}1"
        store.append((token, f"{delim}{inner}{delim}"))
        return token

    md = re.sub(
        r'\${2}(.*?)\${2}',
        lambda m: _stash(block_math, "BLOCKMATH", m.group(1), "$$"),
        md,
        flags=re.DOTALL,
    )
    md = re.sub(
        r'\$(.*?)\$',
        lambda m: _stash(inline_math, "INLINEMATH", m.group(1), "$"),
        md,
    )

    html = markdown2.markdown(md, extras=['tables'])

    # Restore the original math spans in place of their placeholders.
    for token, math_str in block_math + inline_math:
        html = html.replace(token, math_str)

    return html


def markdown_to_image(md: str) -> Image.Image:
    """Render a markdown string to a PIL image via headless Chromium.

    Math stays in $/$$ delimiters and is rendered client-side by KaTeX.
    NOTE(review): requires playwright's chromium to be installed and
    network access for the KaTeX CDN assets -- confirm in CI.
    """
    html = convert_to_html(md)
    with sync_playwright() as p:
        browser = p.chromium.launch()
        page = browser.new_page()
        # Minimal page shell: KaTeX CSS/JS plus the auto-render extension,
        # then the converted HTML body.
        page.set_content(f"""
        <head>
        <link rel="stylesheet" href="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.css" integrity="sha384-zh0CIslj+VczCZtlzBcjt5ppRcsAmDnRem7ESsYwWwg3m/OaJ2l4x7YBZl9Kxxib" crossorigin="anonymous">
        <!-- The loading of KaTeX is deferred to speed up page rendering -->
        <script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/katex.min.js" integrity="sha384-Rma6DA2IPUwhNxmrB/7S3Tno0YY7sFu9WSYMCuulLhIqYSGZ2gKCJWIqhBWqMQfh" crossorigin="anonymous"></script>
        <!-- To automatically render math in text elements, include the auto-render extension: -->
        <script defer src="https://cdn.jsdelivr.net/npm/[email protected]/dist/contrib/auto-render.min.js" integrity="sha384-hCXGrW6PitJEwbkoStFjeJxv+fSOOQKOPbJxSfM6G5sWZjAyWhXiTIIAmQqnlLlh" crossorigin="anonymous"></script>
        </head>
        <body>
        {html}
        <script>
        renderMathInElement(document.body, {{
            delimiters: [
                {{left: '$$', right: '$$', display: true}},
                {{left: '$', right: '$', display: false}}
            ]
        }});
        </script>
        </body>
        """)
        page.set_viewport_size({"width": 1200, "height": 800})
        page.wait_for_timeout(500)  # Wait for KaTeX to render
        screenshot_bytes = page.screenshot(full_page=True)
        browser.close()

    return Image.open(io.BytesIO(screenshot_bytes))


def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> datasets.Dataset:
    """Assemble a side-by-side comparison dataset from benchmark results.

    For every sample index that *all* methods scored, the output row gets
    per-method ``{method}_score/_markdown/_image/_time`` columns plus
    ``gt_markdown``/``gt_image`` rendered from the ground-truth blocks.

    :param ds: source benchmark dataset (rows carry ``gt_blocks`` JSON).
    :param all_scores: per-method results keyed by method name; each carries
        ``raw_scores`` keyed by sample index.
    :return: a ``datasets.Dataset`` with one row per common sample index.
    """
    # FIX: these helpers live in benchmarks.overall.clean but were never
    # imported in this module, so the gt_md line below raised NameError.
    from benchmarks.overall.clean import clean_input, convert_to_md

    # Get all the dataset indices that went through inference for every method.
    full_idxs = None
    for method in all_scores:
        result_idxs = list(all_scores[method]["raw_scores"].keys())
        if full_idxs is None:
            full_idxs = sorted(result_idxs)
        else:
            full_idxs = [f for f in full_idxs if f in result_idxs]
    if full_idxs is None:
        # FIX: no methods at all -> empty dataset instead of iterating None.
        full_idxs = []

    ds_rows = defaultdict(dict)
    for idx in full_idxs:
        row = ds[idx]  # img, gt_blocks, classification, language, uuid
        for method in all_scores:
            method_row = all_scores[method]["raw_scores"][idx]
            ds_rows[idx].update({
                f"{method}_score": method_row["overall_score"],
                f"{method}_markdown": method_row["markdown"],
                f"{method}_image": markdown_to_image(method_row["markdown"]),
                f"{method}_time": method_row["time"]
            })
        # NOTE(review): gt_blocks entries are passed straight to
        # convert_to_md here, whereas overall.py converts block["html"] --
        # confirm the stored JSON items are html strings, not dicts.
        gt_md = "\n\n".join([clean_input(convert_to_md(block)) for block in json.loads(row["gt_blocks"])])
        ds_rows[idx].update({
            "gt_markdown": gt_md,
            "gt_image": markdown_to_image(gt_md)
        })
    out_dataset = datasets.Dataset.from_list([ds_rows[k] for k in full_idxs])
    return out_dataset

3 changes: 1 addition & 2 deletions benchmarks/overall/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@
class BlockScores(TypedDict):
scores: List[float]
order_score: float
gt: List[str]
method: str
overall_score: float
time: Optional[float]
markdown: str


class FullResult(TypedDict):
Expand Down
Loading

0 comments on commit d487f46

Please sign in to comment.