From 01626db2c412a5d8d848aca75ec224986eb2c697 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20Aristar=C3=A1n?=
Date: Thu, 23 Jan 2025 12:57:14 -0300
Subject: [PATCH 01/11] [streamlit_app] Visualize extracted blocks

---
 marker/scripts/streamlit_app.py              |  59 ++++-
 marker/scripts/streamlit_app_blocks_viz.html | 234 +++++++++++++++++++
 pyproject.toml                               |   3 +-
 3 files changed, 293 insertions(+), 3 deletions(-)
 create mode 100644 marker/scripts/streamlit_app_blocks_viz.html

diff --git a/marker/scripts/streamlit_app.py b/marker/scripts/streamlit_app.py
index 7d7fe555..15d9e229 100644
--- a/marker/scripts/streamlit_app.py
+++ b/marker/scripts/streamlit_app.py
@@ -7,18 +7,41 @@
 import base64
 import io
+import json
 import re
+import string
 import tempfile
 from typing import Any, Dict

 import pypdfium2
 import streamlit as st
+import streamlit.components.v1 as components
 from PIL import Image

 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
 from marker.config.parser import ConfigParser
 from marker.output import text_from_rendered
+from marker.schema import BlockTypes
+
+COLORS = [
+    "#4e79a7",
+    "#f28e2c",
+    "#e15759",
+    "#76b7b2",
+    "#59a14f",
+    "#edc949",
+    "#af7aa1",
+    "#ff9da7",
+    "#9c755f",
+    "#bab0ab"
+]
+
+with open(
+    os.path.join(os.path.dirname(__file__), "streamlit_app_blocks_viz.html")
+) as f:
+    BLOCKS_VIZ_TMPL = string.Template(f.read())
+

 @st.cache_resource()
 def load_models():
@@ -83,6 +106,31 @@ def page_count(pdf_file):
     return len(doc) - 1


+def pillow_image_to_base64_string(img: Image) -> str:
+    buffered = io.BytesIO()
+    img.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode("utf-8")
+
+
+def block_display(image: Image, blocks: dict = {}, dpi=96):
+    image_data_url = (
+        'data:image/jpeg;base64,' + pillow_image_to_base64_string(image)
+    )
+
+    template_values = {
+        "image_data_url": image_data_url,
+        "image_width": image.width, "image_height": image.height,
+        "blocks_json": blocks, "colors_json": json.dumps(COLORS),
+        "block_types_json": json.dumps({
+            bt.name: i for i, bt in enumerate(BlockTypes)
+        })
+    }
+    return components.html(
+        BLOCKS_VIZ_TMPL.substitute(**template_values),
+        height=image.height, width=image.width
+    )
+
+
 st.set_page_config(layout="wide")
 col1, col2 = st.columns([.5, .5])

@@ -108,14 +156,18 @@ def page_count(pdf_file):
     page_count = page_count(in_file)
     page_number = st.number_input(f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count)
     pil_image = get_page_image(in_file, page_number)
+    image_placeholder = st.empty()
+
+    with image_placeholder:
+        block_display(pil_image)

-    st.image(pil_image, caption="PDF file (preview)", use_container_width=True)

 page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}")
 output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
 run_marker = st.sidebar.button("Run Marker")

 use_llm = st.sidebar.checkbox("Use LLM", help="Use LLM for higher quality processing", value=False)
+show_blocks = st.sidebar.checkbox("Show Blocks", help="Display detected blocks, only when output is JSON", value=False, disabled=output_format != "json")
 force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
 strip_existing_ocr = st.sidebar.checkbox("Strip existing OCR", help="Strip existing OCR text from the PDF and re-OCR.", value=False)
 debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
@@ -155,6 +207,10 @@ def page_count(pdf_file):
     elif output_format == "html":
         st.html(text)

+if output_format == "json" and show_blocks:
+    with image_placeholder:
+        block_display(pil_image, text)
+
 if debug:
     with col1:
         debug_data_path = rendered.metadata.get("debug_data_path")
@@ -165,4 +221,3 @@ def page_count(pdf_file):
             layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png")
             img = Image.open(layout_image_path)
             st.image(img, caption="Layout debug image", use_container_width=True)
-
diff --git a/marker/scripts/streamlit_app_blocks_viz.html b/marker/scripts/streamlit_app_blocks_viz.html
new file mode 100644
index 00000000..b31ee0a8
--- /dev/null
+++ b/marker/scripts/streamlit_app_blocks_viz.html
@@ -0,0 +1,234 @@
+<!-- 234-line HTML/JS block-viewer template; the markup was stripped when this patch was extracted, so only fragments survive (container divs, an image placeholder, the viewer script) and the file body is elided here -->
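[Reviewer note, not part of the patch: since the template body is elided above, here is a minimal, stdlib-only sketch of the flow block_display() relies on. It JPEG-encodes the page image into a base64 data URL, then substitutes values into the HTML via string.Template. The one-line template below is a stand-in, not the real file.]

    import base64
    import io
    import string

    from PIL import Image

    # Stand-in for BLOCKS_VIZ_TMPL; the real template also takes block/color JSON.
    TMPL = string.Template('<img src="$image_data_url" width="$image_width">')

    img = Image.new("RGB", (200, 100), "white")  # stand-in for a rendered PDF page
    buf = io.BytesIO()
    img.save(buf, format="JPEG")
    data_url = "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")

    html = TMPL.substitute(image_data_url=data_url, image_width=img.width)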
diff --git a/pyproject.toml b/pyproject.toml
index 4250b072..48437f9f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,8 @@ packages = [
     {include = "marker"}
 ]
 include = [
-    "marker/scripts/*.sh"
+    "marker/scripts/*.sh",
+    "marker/scripts/marker/scripts/streamlit_app_blocks_viz.html",
 ]

 [tool.poetry.dependencies]

From 71189217c2276ada85a92ef95ecb05d3f9501959 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Tue, 28 Jan 2025 11:25:18 -0500
Subject: [PATCH 02/11] Small bugfix

---
 README.md                 |  2 +-
 marker/output.py          | 20 +++++++++++++++++++-
 marker/processors/list.py |  4 ++++
 pyproject.toml            |  2 +-
 4 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 880dba9e..e6834b8a 100644
--- a/README.md
+++ b/README.md
@@ -256,7 +256,7 @@ Pages have the keys:

 - `id` - unique id for the block.
 - `block_type` - the type of block. The possible block types can be seen in `marker/schema/__init__.py`. As of this writing, they are ["Line", "Span", "FigureGroup", "TableGroup", "ListGroup", "PictureGroup", "Page", "Caption", "Code", "Figure", "Footnote", "Form", "Equation", "Handwriting", "TextInlineMath", "ListItem", "PageFooter", "PageHeader", "Picture", "SectionHeader", "Table", "Text", "TableOfContents", "Document"]
-- `html` - the HTML for the page.  Note that this will have recursive references to children.  The `content-ref` tags must be replaced with the child content if you want the full html.  You can see an example of this at `marker/renderers/__init__.py:BaseRender.extract_block_html`.
+- `html` - the HTML for the page.  Note that this will have recursive references to children.  The `content-ref` tags must be replaced with the child content if you want the full html.  You can see an example of this at `marker/output.py:json_to_html`.  That function will take in a single block from the json output, and turn it into HTML.
 - `polygon` - the 4-corner polygon of the page, in (x1,y1), (x2,y2), (x3, y3), (x4, y4) format.  (x1,y1) is the top left, and coordinates go clockwise.
 - `children` - the child blocks.
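[Reviewer note, not part of the patch: a usage sketch for the new json_to_html helper, assuming the converter setup documented in marker's README; "example.pdf" is a placeholder path.]

    from marker.config.parser import ConfigParser
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict
    from marker.output import json_to_html

    config_parser = ConfigParser({"output_format": "json"})
    converter = PdfConverter(
        config=config_parser.generate_config_dict(),
        artifact_dict=create_model_dict(),
        renderer=config_parser.get_renderer(),
    )
    rendered = converter("example.pdf")  # JSONOutput: a tree of JSONBlockOutput nodes
    page = rendered.children[0]          # first page block
    page_html = json_to_html(page)       # recursively inlines <content-ref> children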
diff --git a/marker/output.py b/marker/output.py
index 2a6ee126..1c4d92d4 100644
--- a/marker/output.py
+++ b/marker/output.py
@@ -1,13 +1,31 @@
 import json
 import os

+from bs4 import BeautifulSoup
 from pydantic import BaseModel

 from marker.renderers.html import HTMLOutput
-from marker.renderers.json import JSONOutput
+from marker.renderers.json import JSONOutput, JSONBlockOutput
 from marker.renderers.markdown import MarkdownOutput
 from marker.settings import settings


+def json_to_html(block: JSONBlockOutput):
+    # Utility function to take in json block output and give html for the block.
+    if not getattr(block, "children", None):
+        return block.html
+    else:
+        child_html = [json_to_html(child) for child in block.children]
+        child_ids = [child.id for child in block.children]
+
+        soup = BeautifulSoup(block.html, "html.parser")
+        content_refs = soup.find_all("content-ref")
+        for ref in content_refs:
+            src_id = ref.attrs["src"]
+            if src_id in child_ids:
+                child_soup = BeautifulSoup(child_html[child_ids.index(src_id)], "html.parser")
+                ref.replace_with(child_soup)
+        return str(soup)
+
+
 def output_exists(output_dir: str, fname_base: str):
     exts = ["md", "html", "json"]
diff --git a/marker/processors/list.py b/marker/processors/list.py
index 9d7105ee..4e137987 100644
--- a/marker/processors/list.py
+++ b/marker/processors/list.py
@@ -66,6 +66,10 @@ def list_group_indentation(self, document: Document):
             for list_item_id in block.structure:
                 list_item_block: ListItem = page.get_block(list_item_id)

+                # This can be a line sometimes
+                if list_item_block.block_type != BlockTypes.ListItem:
+                    continue
+
                 while stack and list_item_block.polygon.x_start <= stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width):
                     stack.pop()

diff --git a/pyproject.toml b/pyproject.toml
index 0377a77a..67f77313 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ texify = "^0.2.1"
 rapidfuzz = "^3.8.1"
 surya-ocr = "~0.9.3"
 regex = "^2024.4.28"
-pdftext = "~0.5.0"
+pdftext = "~0.5.1"
 markdownify = "^0.13.1"
 click = "^8.1.7"
 google-generativeai = "^0.8.3"

From 98bdbbb6ee3e24479a0586db4ee826df4d118108 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Tue, 28 Jan 2025 16:37:19 -0500
Subject: [PATCH 03/11] Fix html path

---
 poetry.lock    | 76 +++++++++++++++++++++++++-------------------------
 pyproject.toml |  2 +-
 2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index c371d07c..1ddc9a0f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -320,13 +320,13 @@ files = [ [[package]] name = "attrs" -version = "24.3.0" +version = "25.1.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=3.8" files = [ - {file = "attrs-24.3.0-py3-none-any.whl", hash = "sha256:ac96cd038792094f438ad1f6ff80837353805ac950cd2aa0e0625ef19850c308"}, - {file = "attrs-24.3.0.tar.gz", hash = "sha256:8f5c07333d543103541ba7be0e2ce16eeee8130cb0b3f9238ab904ce1e85baff"}, + {file = "attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a"}, + {file = "attrs-25.1.0.tar.gz", hash = "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e"}, ] [package.extras] [[package]] name = "google-api-core" -version = "2.24.0" +version = "2.24.1" description = "Google API client core library" optional = false python-versions = ">=3.7" files = [ - {file = "google_api_core-2.24.0-py3-none-any.whl", hash = "sha256:10d82ac0fca69c82a25b3efdeefccf6f28e02ebb97925a8cce8edbfe379929d9"}, - {file = "google_api_core-2.24.0.tar.gz", hash = "sha256:e255640547a597a4da010876d333208ddac417d60add22b6851a0c66a831fcaf"}, + {file = "google_api_core-2.24.1-py3-none-any.whl", hash = "sha256:bc78d608f5a5bf853b80bd70a795f703294de656c096c0968320830a4bc280f1"}, + {file = "google_api_core-2.24.1.tar.gz", hash = "sha256:f8b36f5456ab0dd99a1b693a40a31d1e7757beea380ad1b38faaf8941eae9d8a"}, ] [package.dependencies] @@ -1117,13 +1117,13 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] [[package]] name = "google-api-python-client" -version
= "2.159.0" +version = "2.160.0" description = "Google API Client Library for Python" optional = false python-versions = ">=3.7" files = [ - {file = "google_api_python_client-2.159.0-py2.py3-none-any.whl", hash = "sha256:baef0bb631a60a0bd7c0bf12a5499e3a40cd4388484de7ee55c1950bf820a0cf"}, - {file = "google_api_python_client-2.159.0.tar.gz", hash = "sha256:55197f430f25c907394b44fa078545ffef89d33fd4dca501b7db9f0d8e224bd6"}, + {file = "google_api_python_client-2.160.0-py2.py3-none-any.whl", hash = "sha256:63d61fb3e4cf3fb31a70a87f45567c22f6dfe87bbfa27252317e3e2c42900db4"}, + {file = "google_api_python_client-2.160.0.tar.gz", hash = "sha256:a8ccafaecfa42d15d5b5c3134ced8de08380019717fc9fb1ed510ca58eca3b7e"}, ] [package.dependencies] @@ -1367,13 +1367,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "0.27.1" +version = "0.28.0" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" files = [ - {file = "huggingface_hub-0.27.1-py3-none-any.whl", hash = "sha256:1c5155ca7d60b60c2e2fc38cbb3ffb7f7c3adf48f824015b219af9061771daec"}, - {file = "huggingface_hub-0.27.1.tar.gz", hash = "sha256:c004463ca870283909d715d20f066ebd6968c2207dae9393fdffb3c1d4d8f98b"}, + {file = "huggingface_hub-0.28.0-py3-none-any.whl", hash = "sha256:71cff4e500efe68061d94b7f6d3114e183715088be7a90bf4dd84af83b5f5cdb"}, + {file = "huggingface_hub-0.28.0.tar.gz", hash = "sha256:c2b18c02a47d4384763caddb4d0ab2a8fc6c16e0800d6de4d55d0a896244aba3"}, ] [package.dependencies] @@ -1386,13 +1386,13 @@ tqdm = ">=4.42.1" typing-extensions = ">=3.7.4.3" [package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", 
"typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] hf-transfer = ["hf-transfer (>=0.1.4)"] inference = ["aiohttp"] -quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.5.0)"] +quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.9.0)"] tensorflow = ["graphviz", "pydot", "tensorflow"] tensorflow-testing = ["keras (<3.0)", "tensorflow"] testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] @@ -2085,13 +2085,13 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] [[package]] name = "markdown2" -version = "2.5.2" +version = "2.5.3" description = "A fast and complete Python implementation of Markdown" optional = false python-versions = "<4,>=3.9" files = [ - {file = "markdown2-2.5.2-py3-none-any.whl", hash = "sha256:bed80d301a33845be633acde47a67cf265c57ddf9cbe3cb11c49c18016c2f581"}, - {file = "markdown2-2.5.2.tar.gz", hash = "sha256:3ac02226a901c4b2f6fc21dbd17c26d118d2c25bcbb28cee093a1f8b5c46f3f1"}, + {file = "markdown2-2.5.3-py3-none-any.whl", hash = "sha256:a8ebb7e84b8519c37bf7382b3db600f1798a22c245bfd754a1f87ca8d7ea63b3"}, + {file = "markdown2-2.5.3.tar.gz", hash = "sha256:4d502953a4633408b0ab3ec503c5d6984d1b14307e32b325ec7d16ea57524895"}, ] [package.extras] @@ -2212,13 +2212,13 @@ files = [ [[package]] name = "mistune" -version = "3.1.0" +version = "3.1.1" description = "A sane and fast Markdown parser with useful plugins and renderers" optional = false python-versions = ">=3.8" files = [ - {file = "mistune-3.1.0-py3-none-any.whl", hash = "sha256:b05198cf6d671b3deba6c87ec6cf0d4eb7b72c524636eddb6dbf13823b52cee1"}, - {file = "mistune-3.1.0.tar.gz", hash = "sha256:dbcac2f78292b9dc066cd03b7a3a26b62d85f8159f2ea5fd28e55df79908d667"}, + {file = "mistune-3.1.1-py3-none-any.whl", hash = "sha256:02106ac2aa4f66e769debbfa028509a275069dcffce0dfa578edd7b991ee700a"}, + {file = "mistune-3.1.1.tar.gz", hash = "sha256:e0740d635f515119f7d1feb6f9b192ee60f0cc649f80a8f944f905706a21654c"}, ] [package.dependencies] @@ -2371,13 +2371,13 @@ dill = ">=0.3.8" [[package]] name = "narwhals" -version = "1.23.0" +version = "1.24.0" description = "Extremely lightweight compatibility layer between dataframe libraries" optional = false python-versions = ">=3.8" files = [ - {file = "narwhals-1.23.0-py3-none-any.whl", hash = "sha256:8d6e7fa0b13af01784837efc060e2a663e5d888decf31f261ff8fc06a7cefeb4"}, - {file = "narwhals-1.23.0.tar.gz", hash = "sha256:3da4b1e7675b3d8ed69bd40c263b135066248af28354f104ea36c788b23d1e3e"}, + {file = "narwhals-1.24.0-py3-none-any.whl", hash = "sha256:73ff60578641059221de2e4f337bfdf0260378fb1553f787d27411602cfc5e72"}, + {file = "narwhals-1.24.0.tar.gz", hash = "sha256:23f0a05efbe29864d184842dd6bf11c044210bca1d443d6dbffe7e65a70bf063"}, ] [package.extras] @@ -2419,13 +2419,13 @@ test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>= [[package]] name = "nbconvert" -version = "7.16.5" +version = "7.16.6" description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)." 
optional = false python-versions = ">=3.8" files = [ - {file = "nbconvert-7.16.5-py3-none-any.whl", hash = "sha256:e12eac052d6fd03040af4166c563d76e7aeead2e9aadf5356db552a1784bd547"}, - {file = "nbconvert-7.16.5.tar.gz", hash = "sha256:c83467bb5777fdfaac5ebbb8e864f300b277f68692ecc04d6dab72f2d8442344"}, + {file = "nbconvert-7.16.6-py3-none-any.whl", hash = "sha256:1375a7b67e0c2883678c48e506dc320febb57685e5ee67faa51b18a90f3a712b"}, + {file = "nbconvert-7.16.6.tar.gz", hash = "sha256:576a7e37c6480da7b8465eefa66c17844243816ce1ccc372633c6b71c3c0f582"}, ] [package.dependencies] @@ -2923,13 +2923,13 @@ testing = ["docopt", "pytest"] [[package]] name = "pdftext" -version = "0.5.0" +version = "0.5.1" description = "Extract structured text from pdfs quickly" optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "pdftext-0.5.0-py3-none-any.whl", hash = "sha256:e14179c5039c711dc5c490ecb1bc15c92ab920e5f7715034b7ae5a387b3b2787"}, - {file = "pdftext-0.5.0.tar.gz", hash = "sha256:f6487d170abc97867d7539774fecdb0a17599965ba88287b3b89731f5cd7d612"}, + {file = "pdftext-0.5.1-py3-none-any.whl", hash = "sha256:6de0406473846f6486b969fb4b1832b94ebe4c92a4bae5f3d1ead645d43d9994"}, + {file = "pdftext-0.5.1.tar.gz", hash = "sha256:81646068c98df4874064f739f507908543188e93e1a5d84b30a0989329f32af6"}, ] [package.dependencies] @@ -3201,13 +3201,13 @@ files = [ [[package]] name = "proto-plus" -version = "1.25.0" -description = "Beautiful, Pythonic protocol buffers." +version = "1.26.0" +description = "Beautiful, Pythonic protocol buffers" optional = false python-versions = ">=3.7" files = [ - {file = "proto_plus-1.25.0-py3-none-any.whl", hash = "sha256:c91fc4a65074ade8e458e95ef8bac34d4008daa7cce4a12d6707066fca648961"}, - {file = "proto_plus-1.25.0.tar.gz", hash = "sha256:fbb17f57f7bd05a68b7707e745e26528b0b3c34e378db91eef93912c54982d91"}, + {file = "proto_plus-1.26.0-py3-none-any.whl", hash = "sha256:bf2dfaa3da281fc3187d12d224c707cb57214fb2c22ba854eb0c105a3fb2d4d7"}, + {file = "proto_plus-1.26.0.tar.gz", hash = "sha256:6e93d5f5ca267b54300880fff156b6a3386b3fa3f43b1da62e680fc0c586ef22"}, ] [package.dependencies] @@ -4032,13 +4032,13 @@ all = ["numpy"] [[package]] name = "referencing" -version = "0.36.1" +version = "0.36.2" description = "JSON Referencing + Python" optional = false python-versions = ">=3.9" files = [ - {file = "referencing-0.36.1-py3-none-any.whl", hash = "sha256:363d9c65f080d0d70bc41c721dce3c7f3e77fc09f269cd5c8813da18069a6794"}, - {file = "referencing-0.36.1.tar.gz", hash = "sha256:ca2e6492769e3602957e9b831b94211599d2aade9477f5d44110d2530cf9aade"}, + {file = "referencing-0.36.2-py3-none-any.whl", hash = "sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0"}, + {file = "referencing-0.36.2.tar.gz", hash = "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa"}, ] [package.dependencies] @@ -5489,4 +5489,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "6eb647ac20025351bfd8048a8407855c8f0a51760a2944f1da6c3685b9a8ada7" +content-hash = "33297ed1b238e67f880534882876a29012559d3263ceba2ba0cdd738598af00c" diff --git a/pyproject.toml b/pyproject.toml index c5a0b4e1..d838b9e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ packages = [ ] include = [ "marker/scripts/*.sh", - "marker/scripts/marker/scripts/streamlit_app_blocks_viz.html", + "marker/scripts/*.html", ] [tool.poetry.dependencies] From e534a14a2e2a9e73301983779060087aa36a21f0 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 28 Jan 
2025 16:37:45 -0500
Subject: [PATCH 04/11] Bump surya

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d838b9e0..6878e04e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,7 @@ tqdm = "^4.66.1"
 ftfy = "^6.1.1"
 texify = "^0.2.1"
 rapidfuzz = "^3.8.1"
-surya-ocr = "~0.9.3"
+surya-ocr = "~0.10.0"
 regex = "^2024.4.28"
 pdftext = "~0.5.1"
 markdownify = "^0.13.1"

From 5fdb25b8b50794e05d6329c8589322b3fc073fa0 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Tue, 28 Jan 2025 17:56:07 -0500
Subject: [PATCH 05/11] Swap to new texify model

---
 marker/models.py | 33 ++-------------------------------
 1 file changed, 2 insertions(+), 31 deletions(-)

diff --git a/marker/models.py b/marker/models.py
index 908fb863..80dc254c 100644
--- a/marker/models.py
+++ b/marker/models.py
@@ -1,41 +1,12 @@
 import os
-
-from marker.settings import settings
-
-os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
-
-from typing import List
-from PIL import Image
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS

 from surya.detection import DetectionPredictor
 from surya.layout import LayoutPredictor
 from surya.ocr_error import OCRErrorPredictor
 from surya.recognition import RecognitionPredictor
 from surya.table_rec import TableRecPredictor
-
-from texify.model.model import load_model as load_texify_model
-from texify.model.processor import load_processor as load_texify_processor
-from texify.inference import batch_inference
-
-
-class TexifyPredictor:
-    def __init__(self, device=None, dtype=None):
-        if not device:
-            device = settings.TORCH_DEVICE_MODEL
-        if not dtype:
-            dtype = settings.TEXIFY_DTYPE
-
-        self.model = load_texify_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=device, dtype=dtype)
-        self.processor = load_texify_processor()
-        self.device = device
-        self.dtype = dtype
-
-    def __call__(self, batch_images: List[Image.Image], max_tokens: int):
-        return batch_inference(
-            batch_images,
-            self.model,
-            self.processor,
-            max_tokens=max_tokens
-        )
+from surya.texify import TexifyPredictor


 def create_model_dict(device=None, dtype=None) -> dict:

From 597db72f9a30acd0304774585cdfbe64e8e0d34f Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Wed, 29 Jan 2025 07:34:43 -0500
Subject: [PATCH 06/11] Integrate new texify model

---
 marker/processors/equation.py | 85 ++++-------------------------------
 marker/renderers/markdown.py  | 11 +++--
 2 files changed, 16 insertions(+), 80 deletions(-)

diff --git a/marker/processors/equation.py b/marker/processors/equation.py
index 868f98a2..5f5be17c 100644
--- a/marker/processors/equation.py
+++ b/marker/processors/equation.py
@@ -22,7 +22,7 @@ class EquationProcessor(BaseProcessor):
     model_max_length: Annotated[
         int,
         "The maximum number of tokens to allow for the Texify model.",
-    ] = 384
+    ] = 768
     texify_batch_size: Annotated[
         Optional[int],
         "The batch size to use for the Texify model.",
@@ -65,27 +65,7 @@ def __call__(self, document: Document):
                 continue

             block = document.get_block(equation_d["block_id"])
-            block.html = self.parse_latex_to_html(prediction)
-
-    def parse_latex_to_html(self, latex: str):
-        html_out = ""
-        try:
-            latex = self.parse_latex(latex)
-        except ValueError as e:
-            # If we have mismatched delimiters, we'll treat it as a single block
-            # Strip the $'s from the latex
-            latex = [
-                {"class": "block", "content": latex.replace("$", "")}
-            ]
-
-        for el in latex:
-            if el["class"] == "block":
-                html_out += f'<math display="block">{el["content"]}</math>'
-            elif el["class"] == "inline":
-                html_out += f'<math>{el["content"]}</math>'
-            else:
-                html_out += f" {el['content']} "
-        return html_out.strip()
+            block.html = prediction

     def get_batch_size(self):
         if self.texify_batch_size is not None:
@@ -106,71 +86,22 @@ def get_latex_batched(self, equation_data: List[dict]):
             max_idx = min(min_idx + batch_size, len(equation_data))
             batch_equations = equation_data[min_idx:max_idx]

-            max_length = max([eq["token_count"] for eq in batch_equations])
-            max_length = min(max_length, self.model_max_length)
-            max_length += self.token_buffer
-
             batch_images = [eq["image"] for eq in batch_equations]

             model_output = self.texify_model(
-                batch_images,
-                max_tokens=max_length
+                batch_images
             )

             for j, output in enumerate(model_output):
-                token_count = self.get_total_texify_tokens(output)
-                if token_count >= max_length - 1:
-                    output = ""
+                token_count = self.get_total_texify_tokens(output.text)
+                if token_count >= self.model_max_length - 1:
+                    output.text = ""

                 image_idx = i + j
-                predictions[image_idx] = output
+                predictions[image_idx] = output.text
         return predictions

     def get_total_texify_tokens(self, text):
         tokenizer = self.texify_model.processor.tokenizer
         tokens = tokenizer(text)
-        return len(tokens["input_ids"])
-
-
-    @staticmethod
-    def parse_latex(text: str):
-        if text.count("$") % 2 != 0:
-            raise ValueError("Mismatched delimiters in LaTeX")
-
-        DELIMITERS = [
-            ("$$", "block"),
-            ("$", "inline")
-        ]
-
-        text = text.replace("\n", "<br>")  # we can't handle \n's inside <br> properly if we don't do this
-
-        i = 0
-        stack = []
-        result = []
-        buffer = ""
-
-        while i < len(text):
-            for delim, class_name in DELIMITERS:
-                if text[i:].startswith(delim):
-                    if stack and stack[-1] == delim:  # Closing
-                        stack.pop()
-                        result.append({"class": class_name, "content": buffer})
-                        buffer = ""
-                        i += len(delim)
-                        break
-                    elif not stack:  # Opening
-                        if buffer:
-                            result.append({"class": "text", "content": buffer})
-                        stack.append(delim)
-                        buffer = ""
-                        i += len(delim)
-                        break
-                    else:
-                        raise ValueError(f"Nested {class_name} delimiters not supported")
-            else:  # No delimiter match
-                buffer += text[i]
-                i += 1
-
-        if buffer:
-            result.append({"class": "text", "content": buffer})
-        return result
+        return len(tokens["input_ids"])
\ No newline at end of file
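[Reviewer note, not part of the patch: the new call pattern, inferred from this diff. surya's TexifyPredictor takes a list of PIL images and returns one prediction per image with a .text attribute, which now lands directly in block.html. The no-argument constructor and the image path are assumptions for illustration.]

    from PIL import Image
    from surya.texify import TexifyPredictor

    texify_model = TexifyPredictor()           # assumed to load the default checkpoint
    crops = [Image.open("equation_crop.png")]  # placeholder equation crop
    predictions = texify_model(crops)          # no max_tokens argument anymore
    for pred in predictions:
        print(pred.text)                       # LaTeX/HTML for the equation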
diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
index 0762ab3c..f6b78933 100644
--- a/marker/renderers/markdown.py
+++ b/marker/renderers/markdown.py
@@ -12,12 +12,16 @@
 from marker.schema.document import Document


+def escape_dollars(text):
+    return text.replace("$", r"\$")
+
+
 def cleanup_text(full_text):
     full_text = re.sub(r'\n{3,}', '\n\n', full_text)
     full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
     return full_text.strip()


 def get_formatted_table_text(element):
+
     text = []
     for content in element.contents:
         if content is None:
             continue

         if isinstance(content, NavigableString):
             stripped = content.strip()
             if stripped:
-                text.append(stripped)
+                text.append(escape_dollars(stripped))
         elif content.name == 'br':
             text.append('<br>')
         elif content.name == "math":
             text.append("$" + content.text + "$")
         else:
-            text.append(str(content))
+            content_str = escape_dollars(str(content))
+            text.append(content_str)

     full_text = ""
     for i, t in enumerate(text):
@@ -120,7 +125,7 @@ def convert_table(self, el, text, convert_as_inline):
                         if r == 0 and c == 0:
                             grid[row_idx][col_idx] = value
                         else:
-                            grid[row_idx + r][col_idx + c] = ''
+                            grid[row_idx + r][col_idx + c] = ''  # Empty cell due to rowspan/colspan
                 except IndexError:
                     # Sometimes the colspan/rowspan predictions can overflow
                     print(f"Overflow in columns: {col_idx + c} >= {total_cols}")

From 1a9734f4f78d80080a9af31ad1b131a9aa0fdd0f Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Wed, 29 Jan 2025 09:55:55 -0500
Subject: [PATCH 07/11] Add test for partial row splitting

---
 marker/processors/table.py               |  3 ++-
 tests/processors/test_table_processor.py | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/marker/processors/table.py b/marker/processors/table.py
index 1f455848..783a73ce 100644
--- a/marker/processors/table.py
+++ b/marker/processors/table.py
@@ -226,7 +226,8 @@ def split_combined_rows(self, tables: List[TableResult]):
                         new_cell_count += 1

                 # For each new row we add, shift up subsequent rows
-                shift_up += line_lens[0] - 1
+                # The max is to account for partial rows
+                shift_up += max(line_lens) - 1
             else:
                 for cell in row_cells:
                     cell.row_id += shift_up

diff --git a/tests/processors/test_table_processor.py b/tests/processors/test_table_processor.py
index 79224a58..72e2a04b 100644
--- a/tests/processors/test_table_processor.py
+++ b/tests/processors/test_table_processor.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest

 from marker.renderers.json import JSONRenderer
@@ -63,3 +65,15 @@ def test_ocr_table(pdf_document, detection_model, recognition_model, table_rec_m
     table_output = renderer(pdf_document)
     assert "1.2E-38" in table_output.markdown
+
+
+@pytest.mark.config({"page_range": [11]})
+def test_split_rows(pdf_document, detection_model, recognition_model, table_rec_model):
+    processor = TableProcessor(detection_model, recognition_model, table_rec_model)
+    processor(pdf_document)
+
+    table = pdf_document.contained_blocks((BlockTypes.Table,))[-1]
+    cells: List[TableCell] = table.contained_blocks(pdf_document, (BlockTypes.TableCell,))
+    unique_rows = len(set([cell.row_id for cell in cells]))
+    assert unique_rows == 6
+

From 40fcb727f906867a9c0938c557f9f019ffa41e3c Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Wed, 29 Jan 2025 10:07:41 -0500
Subject: [PATCH 08/11] Update surya and pdftext

---
 .github/workflows/benchmark.yml |  3 +--
 poetry.lock                     | 14 +++++++-------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 0fcb2380..5d49aa1c 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -4,11 +4,10 @@ on: [push]

 env:
   TORCH_DEVICE: "cpu"
-  OCR_ENGINE: "surya"

 jobs:
   benchmark:
-    runs-on: ubuntu-latest
+    runs-on: [ubuntu-latest, windows-latest]
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python 3.11
diff --git a/poetry.lock b/poetry.lock
index 1ddc9a0f..652f9c68 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2371,13 +2371,13 @@ dill = ">=0.3.8"

 [[package]]
 name = "narwhals"
-version = "1.24.0"
+version = "1.24.1"
 description = "Extremely lightweight compatibility layer between dataframe libraries"
 optional = false
 python-versions = ">=3.8"
 files = [
"sha256:73ff60578641059221de2e4f337bfdf0260378fb1553f787d27411602cfc5e72"}, - {file = "narwhals-1.24.0.tar.gz", hash = "sha256:23f0a05efbe29864d184842dd6bf11c044210bca1d443d6dbffe7e65a70bf063"}, + {file = "narwhals-1.24.1-py3-none-any.whl", hash = "sha256:d8983fe14851c95d60576ddca37c094bd4ed24ab9ea98396844fb20ad9aaf184"}, + {file = "narwhals-1.24.1.tar.gz", hash = "sha256:b09b8253d945f23cdb683a84685abf3afb9f96114d89e9f35dc876e143f65007"}, ] [package.extras] @@ -4641,13 +4641,13 @@ snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python[ [[package]] name = "surya-ocr" -version = "0.9.3" +version = "0.10.0" description = "OCR, layout, reading order, and table recognition in 90+ languages" optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "surya_ocr-0.9.3-py3-none-any.whl", hash = "sha256:6013131f3af004f93ab5422dfa8c49a83aa72beb2f8120fd59dca04803d98009"}, - {file = "surya_ocr-0.9.3.tar.gz", hash = "sha256:a69347a3c85c04d48e3df62d11f045dc13e22ab8b3efebfdae1dd94f05a25b99"}, + {file = "surya_ocr-0.10.0-py3-none-any.whl", hash = "sha256:ccad25a308eefd61a21b2c97fc3f5b8364887e09f197a3aaa5fee30c03f81ae1"}, + {file = "surya_ocr-0.10.0.tar.gz", hash = "sha256:966bc0c1aef346df42e458d2c1cbc95665004ea61020577e1656789107d09119"}, ] [package.dependencies] @@ -5489,4 +5489,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "33297ed1b238e67f880534882876a29012559d3263ceba2ba0cdd738598af00c" +content-hash = "d43373ff00de4feb00b0aed4fe98d2a84ecb5742d1a916cabbace5104f888d54" From 5b1e205d030d3bfde49fd20728d58cb6eac9c530 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 29 Jan 2025 10:23:16 -0500 Subject: [PATCH 09/11] Clean up table split logic --- marker/processors/table.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/marker/processors/table.py b/marker/processors/table.py index 783a73ce..391ab4b3 100644 --- a/marker/processors/table.py +++ b/marker/processors/table.py @@ -53,6 +53,10 @@ class TableProcessor(BaseProcessor): int, "The number of workers to use for pdftext.", ] = 4 + row_split_threshold: Annotated[ + float, + "The percentage of rows that need to be split across the table before row splitting is active.", + ] = 0.5 def __init__( self, @@ -171,10 +175,7 @@ def split_combined_rows(self, tables: List[TableResult]): # Skip empty tables continue unique_rows = sorted(list(set([c.row_id for c in table.cells]))) - new_cells = [] - shift_up = 0 - max_cell_id = max([c.cell_id for c in table.cells]) - new_cell_count = 0 + row_info = [] for row in unique_rows: # Cells in this row # Deepcopy is because we do an in-place mutation later, and that can cause rows to shift to match rows in unique_rows @@ -201,9 +202,25 @@ def split_combined_rows(self, tables: List[TableResult]): len(line_lens_counter) == 2 and counter_keys[0] <= 1 and counter_keys[1] > 1 and line_lens_counter[counter_keys[0]] == 1, # Allow a single column with a single line - keys are the line lens, values are the counts ]) should_split = should_split_entire_row or should_split_partial_row - if should_split: - for i in range(0, max(line_lens)): - for cell in row_cells: + row_info.append({ + "should_split": should_split, + "row_cells": row_cells, + "line_lens": line_lens + }) + + # Don't split if we're not splitting most of the rows in the table. This avoids splitting stray multiline rows. 
+            if sum([r["should_split"] for r in row_info]) / len(row_info) < self.row_split_threshold:
+                continue
+
+            new_cells = []
+            shift_up = 0
+            max_cell_id = max([c.cell_id for c in table.cells])
+            new_cell_count = 0
+            for row, item_info in zip(unique_rows, row_info):
+                max_lines = max(item_info["line_lens"])
+                if item_info["should_split"]:
+                    for i in range(0, max_lines):
+                        for cell in item_info["row_cells"]:
                             # Calculate height based on number of splits
                             split_height = cell.bbox[3] - cell.bbox[1]
                             current_bbox = [cell.bbox[0], cell.bbox[1] + i * split_height, cell.bbox[2], cell.bbox[1] + (i + 1) * split_height]
@@ -227,9 +244,9 @@ def split_combined_rows(self, tables: List[TableResult]):

                 # For each new row we add, shift up subsequent rows
                 # The max is to account for partial rows
-                shift_up += max(line_lens) - 1
+                shift_up += max_lines - 1
             else:
-                for cell in row_cells:
+                for cell in item_info["row_cells"]:
                     cell.row_id += shift_up
                     new_cells.append(cell)
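[Reviewer note, not part of the patch: a worked example of the threshold gate above, with illustrative numbers. In a 10-row table where only 2 rows look multiline, 2/10 = 0.2 is below row_split_threshold (0.5), so the table is left alone; with 7 qualifying rows, 0.7 clears the gate and splitting proceeds.]

    row_info = [{"should_split": True}] * 2 + [{"should_split": False}] * 8
    split_fraction = sum(r["should_split"] for r in row_info) / len(row_info)
    assert split_fraction == 0.2  # below 0.5, so this table keeps its stray multiline rows intact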
From b90ad5da71f5b2293ddde025840ddfabc94e21c6 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Wed, 29 Jan 2025 10:37:32 -0500
Subject: [PATCH 10/11] Escape dollar signs for better equation rendering

---
 marker/processors/blockquote.py | 2 +-
 marker/processors/reference.py  | 1 -
 marker/processors/table.py      | 1 -
 marker/renderers/markdown.py    | 7 +++++++
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/marker/processors/blockquote.py b/marker/processors/blockquote.py
index bc0b5bec..018a7ecf 100644
--- a/marker/processors/blockquote.py
+++ b/marker/processors/blockquote.py
@@ -63,4 +63,4 @@ def __call__(self, document: Document):
                 next_block.blockquote_level += 1
             elif len(next_block.structure) >= 2 and (x_indent and y_indent):
                 next_block.blockquote = True
-                next_block.blockquote_level = 1
+                next_block.blockquote_level = 1
\ No newline at end of file
diff --git a/marker/processors/reference.py b/marker/processors/reference.py
index f3eb2d06..e38b55a9 100644
--- a/marker/processors/reference.py
+++ b/marker/processors/reference.py
@@ -7,7 +7,6 @@
 from marker.schema.groups.list import ListGroup
 from marker.schema.groups.table import TableGroup
 from marker.schema.registry import get_block_class
-from marker.schema.groups.picture import PictureGroup
 from marker.schema.groups.figure import FigureGroup

diff --git a/marker/processors/table.py b/marker/processors/table.py
index 391ab4b3..75b723c0 100644
--- a/marker/processors/table.py
+++ b/marker/processors/table.py
@@ -3,7 +3,6 @@
 from copy import deepcopy
 from typing import Annotated, List
 from collections import Counter
-from PIL import ImageDraw

 from ftfy import fix_text
 from surya.detection import DetectionPredictor
diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
index f6b78933..9a48fa40 100644
--- a/marker/renderers/markdown.py
+++ b/marker/renderers/markdown.py
@@ -181,6 +181,12 @@ def convert_span(self, el, text, convert_as_inline):
         else:
             return text

+    def escape(self, text):
+        text = super().escape(text)
+        if self.options['escape_dollars']:
+            text = text.replace('$', r'\$')
+        return text
+

 class MarkdownOutput(BaseModel):
     markdown: str
     images: dict
@@ -203,6 +209,7 @@ def __call__(self, document: Document) -> MarkdownOutput:
             escape_misc=False,
             escape_underscores=False,
             escape_asterisks=False,
+            escape_dollars=True,
             sub_symbol="<sub>",
             sup_symbol="<sup>",
             inline_math_delimiters=self.inline_math_delimiters,

From c85d72ba4241aa79a3aacbc9693ddb669d307d57 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Wed, 29 Jan 2025 10:40:10 -0500
Subject: [PATCH 11/11] Version bump

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6878e04e..08d8c72e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "1.3.2"
+version = "1.3.3"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri"]
 readme = "README.md"
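[Reviewer note, not part of the patch series: a quick illustration of the dollar escaping that patches 06 and 10 wire into the Markdown renderer. Literal dollars in text are escaped so that the $...$ spans emitted for equations stay the only math-delimited regions; math content itself bypasses this path in the renderer.]

    def escape_dollars(text):
        return text.replace("$", r"\$")

    assert escape_dollars("costs $5") == r"costs \$5"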