From 01626db2c412a5d8d848aca75ec224986eb2c697 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Manuel=20Aristar=C3=A1n?=
Date: Thu, 23 Jan 2025 12:57:14 -0300
Subject: [PATCH 01/11] [streamlit_app] Visualize extracted blocks

---
 marker/scripts/streamlit_app.py              |  59 ++++-
 marker/scripts/streamlit_app_blocks_viz.html | 234 +++++++++++++++++++
 pyproject.toml                               |   3 +-
 3 files changed, 293 insertions(+), 3 deletions(-)
 create mode 100644 marker/scripts/streamlit_app_blocks_viz.html

diff --git a/marker/scripts/streamlit_app.py b/marker/scripts/streamlit_app.py
index 7d7fe555..15d9e229 100644
--- a/marker/scripts/streamlit_app.py
+++ b/marker/scripts/streamlit_app.py
@@ -7,18 +7,41 @@
 import base64
 import io
+import json
 import re
+import string
 import tempfile
 from typing import Any, Dict

 import pypdfium2
 import streamlit as st
+import streamlit.components.v1 as components
 from PIL import Image

 from marker.converters.pdf import PdfConverter
 from marker.models import create_model_dict
 from marker.config.parser import ConfigParser
 from marker.output import text_from_rendered
+from marker.schema import BlockTypes
+
+COLORS = [
+    "#4e79a7",
+    "#f28e2c",
+    "#e15759",
+    "#76b7b2",
+    "#59a14f",
+    "#edc949",
+    "#af7aa1",
+    "#ff9da7",
+    "#9c755f",
+    "#bab0ab"
+]
+
+with open(
+    os.path.join(os.path.dirname(__file__), "streamlit_app_blocks_viz.html")
+) as f:
+    BLOCKS_VIZ_TMPL = string.Template(f.read())
+

 @st.cache_resource()
 def load_models():
@@ -83,6 +106,31 @@ def page_count(pdf_file):
     return len(doc) - 1


+def pillow_image_to_base64_string(img: Image) -> str:
+    buffered = io.BytesIO()
+    img.save(buffered, format="JPEG")
+    return base64.b64encode(buffered.getvalue()).decode("utf-8")
+
+
+def block_display(image: Image, blocks: dict = {}, dpi=96):
+    image_data_url = (
+        'data:image/jpeg;base64,' + pillow_image_to_base64_string(image)
+    )
+
+    template_values = {
+        "image_data_url": image_data_url,
+        "image_width": image.width, "image_height": image.height,
+        "blocks_json": blocks, "colors_json": json.dumps(COLORS),
+        "block_types_json": json.dumps({
+            bt.name: i for i, bt in enumerate(BlockTypes)
+        })
+    }
+    return components.html(
+        BLOCKS_VIZ_TMPL.substitute(**template_values),
+        height=image.height, width=image.width
+    )
+
+
 st.set_page_config(layout="wide")
 col1, col2 = st.columns([.5, .5])

@@ -108,14 +156,18 @@ def page_count(pdf_file):
     page_count = page_count(in_file)
     page_number = st.number_input(f"Page number out of {page_count}:", min_value=0, value=0, max_value=page_count)
     pil_image = get_page_image(in_file, page_number)
+    image_placeholder = st.empty()
+
+    with image_placeholder:
+        block_display(pil_image)

-    st.image(pil_image, caption="PDF file (preview)", use_container_width=True)

 page_range = st.sidebar.text_input("Page range to parse, comma separated like 0,5-10,20", value=f"{page_number}-{page_number}")
 output_format = st.sidebar.selectbox("Output format", ["markdown", "json", "html"], index=0)
 run_marker = st.sidebar.button("Run Marker")

 use_llm = st.sidebar.checkbox("Use LLM", help="Use LLM for higher quality processing", value=False)
+show_blocks = st.sidebar.checkbox("Show Blocks", help="Display detected blocks, only when output is JSON", value=False, disabled=output_format != "json")
 force_ocr = st.sidebar.checkbox("Force OCR", help="Force OCR on all pages", value=False)
 strip_existing_ocr = st.sidebar.checkbox("Strip existing OCR", help="Strip existing OCR text from the PDF and re-OCR.", value=False)
 debug = st.sidebar.checkbox("Debug", help="Show debug information", value=False)
@@ -155,6 +207,10 @@ def page_count(pdf_file):
     elif output_format == "html":
         st.html(text)

+if output_format == "json" and show_blocks:
+    with image_placeholder:
+        block_display(pil_image, text)
+
 if debug:
     with col1:
         debug_data_path = rendered.metadata.get("debug_data_path")
@@ -165,4 +221,3 @@ def page_count(pdf_file):
             layout_image_path = os.path.join(debug_data_path, f"layout_page_{first_page}.png")
             img = Image.open(layout_image_path)
             st.image(img, caption="Layout debug image", use_container_width=True)
-
diff --git a/marker/scripts/streamlit_app_blocks_viz.html b/marker/scripts/streamlit_app_blocks_viz.html
new file mode 100644
index 00000000..b31ee0a8
--- /dev/null
+++ b/marker/scripts/streamlit_app_blocks_viz.html
@@ -0,0 +1,234 @@
+<!-- 234-line HTML/JS block-viewer template; the markup was stripped when this patch was extracted, so only fragments survive (container divs, an image placeholder, the viewer script) and the file body is elided here -->
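[Reviewer note, not part of the patch: since the template body is elided above, here is a minimal, stdlib-only sketch of the flow block_display() relies on. It JPEG-encodes the page image into a base64 data URL, then substitutes values into the HTML via string.Template. The one-line template below is a stand-in, not the real file.]

    import base64
    import io
    import string

    from PIL import Image

    # Stand-in for BLOCKS_VIZ_TMPL; the real template also takes block/color JSON.
    TMPL = string.Template('<img src="$image_data_url" width="$image_width">')

    img = Image.new("RGB", (200, 100), "white")  # stand-in for a rendered PDF page
    buf = io.BytesIO()
    img.save(buf, format="JPEG")
    data_url = "data:image/jpeg;base64," + base64.b64encode(buf.getvalue()).decode("utf-8")

    html = TMPL.substitute(image_data_url=data_url, image_width=img.width)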
diff --git a/pyproject.toml b/pyproject.toml
index 4250b072..48437f9f 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -11,7 +11,8 @@ packages = [
     {include = "marker"}
 ]
 include = [
-    "marker/scripts/*.sh"
+    "marker/scripts/*.sh",
+    "marker/scripts/marker/scripts/streamlit_app_blocks_viz.html",
 ]

 [tool.poetry.dependencies]

From 71189217c2276ada85a92ef95ecb05d3f9501959 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Tue, 28 Jan 2025 11:25:18 -0500
Subject: [PATCH 02/11] Small bugfix

---
 README.md                 |  2 +-
 marker/output.py          | 20 +++++++++++++++++++-
 marker/processors/list.py |  4 ++++
 pyproject.toml            |  2 +-
 4 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/README.md b/README.md
index 880dba9e..e6834b8a 100644
--- a/README.md
+++ b/README.md
@@ -256,7 +256,7 @@ Pages have the keys:

 - `id` - unique id for the block.
 - `block_type` - the type of block. The possible block types can be seen in `marker/schema/__init__.py`. As of this writing, they are ["Line", "Span", "FigureGroup", "TableGroup", "ListGroup", "PictureGroup", "Page", "Caption", "Code", "Figure", "Footnote", "Form", "Equation", "Handwriting", "TextInlineMath", "ListItem", "PageFooter", "PageHeader", "Picture", "SectionHeader", "Table", "Text", "TableOfContents", "Document"]
-- `html` - the HTML for the page.  Note that this will have recursive references to children.  The `content-ref` tags must be replaced with the child content if you want the full html.  You can see an example of this at `marker/renderers/__init__.py:BaseRender.extract_block_html`.
+- `html` - the HTML for the page.  Note that this will have recursive references to children.  The `content-ref` tags must be replaced with the child content if you want the full html.  You can see an example of this at `marker/output.py:json_to_html`.  That function will take in a single block from the json output, and turn it into HTML.
 - `polygon` - the 4-corner polygon of the page, in (x1,y1), (x2,y2), (x3, y3), (x4, y4) format.  (x1,y1) is the top left, and coordinates go clockwise.
 - `children` - the child blocks.
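[Reviewer note, not part of the patch: a usage sketch for the new json_to_html helper, assuming the converter setup documented in marker's README; "example.pdf" is a placeholder path.]

    from marker.config.parser import ConfigParser
    from marker.converters.pdf import PdfConverter
    from marker.models import create_model_dict
    from marker.output import json_to_html

    config_parser = ConfigParser({"output_format": "json"})
    converter = PdfConverter(
        config=config_parser.generate_config_dict(),
        artifact_dict=create_model_dict(),
        renderer=config_parser.get_renderer(),
    )
    rendered = converter("example.pdf")  # JSONOutput: a tree of JSONBlockOutput nodes
    page = rendered.children[0]          # first page block
    page_html = json_to_html(page)       # recursively inlines <content-ref> children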
diff --git a/marker/output.py b/marker/output.py
index 2a6ee126..1c4d92d4 100644
--- a/marker/output.py
+++ b/marker/output.py
@@ -1,13 +1,31 @@
 import json
 import os

+from bs4 import BeautifulSoup
 from pydantic import BaseModel

 from marker.renderers.html import HTMLOutput
-from marker.renderers.json import JSONOutput
+from marker.renderers.json import JSONOutput, JSONBlockOutput
 from marker.renderers.markdown import MarkdownOutput
 from marker.settings import settings


+def json_to_html(block: JSONBlockOutput):
+    # Utility function to take in json block output and give html for the block.
+    if not getattr(block, "children", None):
+        return block.html
+    else:
+        child_html = [json_to_html(child) for child in block.children]
+        child_ids = [child.id for child in block.children]
+
+        soup = BeautifulSoup(block.html, "html.parser")
+        content_refs = soup.find_all("content-ref")
+        for ref in content_refs:
+            src_id = ref.attrs["src"]
+            if src_id in child_ids:
+                child_soup = BeautifulSoup(child_html[child_ids.index(src_id)], "html.parser")
+                ref.replace_with(child_soup)
+        return str(soup)
+
+
 def output_exists(output_dir: str, fname_base: str):
     exts = ["md", "html", "json"]
diff --git a/marker/processors/list.py b/marker/processors/list.py
index 9d7105ee..4e137987 100644
--- a/marker/processors/list.py
+++ b/marker/processors/list.py
@@ -66,6 +66,10 @@ def list_group_indentation(self, document: Document):
             for list_item_id in block.structure:
                 list_item_block: ListItem = page.get_block(list_item_id)

+                # This can be a line sometimes
+                if list_item_block.block_type != BlockTypes.ListItem:
+                    continue
+
                 while stack and list_item_block.polygon.x_start <= stack[-1].polygon.x_start + (self.min_x_indent * page.polygon.width):
                     stack.pop()

diff --git a/pyproject.toml b/pyproject.toml
index 0377a77a..67f77313 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -28,7 +28,7 @@ texify = "^0.2.1"
 rapidfuzz = "^3.8.1"
 surya-ocr = "~0.9.3"
 regex = "^2024.4.28"
-pdftext = "~0.5.0"
+pdftext = "~0.5.1"
 markdownify = "^0.13.1"
 click = "^8.1.7"
 google-generativeai = "^0.8.3"

From 98bdbbb6ee3e24479a0586db4ee826df4d118108 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Tue, 28 Jan 2025 16:37:19 -0500
Subject: [PATCH 03/11] Fix html path

---
 poetry.lock    | 76 +++++++++++++++++++++++++-------------------------
 pyproject.toml |  2 +-
 2 files changed, 39 insertions(+), 39 deletions(-)

diff --git a/poetry.lock b/poetry.lock
index c371d07c..1ddc9a0f 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -320,13 +320,13 @@ files = [ [[package]] name = "attrs" -version = "24.3.0" +version = "25.1.0" description = "Classes Without Boilerplate" optional = false python-versions = ">=3.8" files = [ - {file = "attrs-24.3.0-py3-none-any.whl", hash = "sha256:ac96cd038792094f438ad1f6ff80837353805ac950cd2aa0e0625ef19850c308"}, - {file = "attrs-24.3.0.tar.gz", hash = "sha256:8f5c07333d543103541ba7be0e2ce16eeee8130cb0b3f9238ab904ce1e85baff"}, + {file = "attrs-25.1.0-py3-none-any.whl", hash = "sha256:c75a69e28a550a7e93789579c22aa26b0f5b83b75dc4e08fe092980051e1090a"}, + {file = "attrs-25.1.0.tar.gz", hash = "sha256:1c97078a80c814273a76b2a298a932eb681c87415c11dee0a6921de7f1b02c3e"}, ] [package.extras] [[package]] name = "google-api-core" -version = "2.24.0" +version = "2.24.1" description = "Google API client core library" optional = false python-versions = ">=3.7" files = [ - {file = "google_api_core-2.24.0-py3-none-any.whl", hash = "sha256:10d82ac0fca69c82a25b3efdeefccf6f28e02ebb97925a8cce8edbfe379929d9"}, - {file = "google_api_core-2.24.0.tar.gz", hash = "sha256:e255640547a597a4da010876d333208ddac417d60add22b6851a0c66a831fcaf"}, + {file = "google_api_core-2.24.1-py3-none-any.whl", hash = "sha256:bc78d608f5a5bf853b80bd70a795f703294de656c096c0968320830a4bc280f1"}, + {file = "google_api_core-2.24.1.tar.gz", hash = "sha256:f8b36f5456ab0dd99a1b693a40a31d1e7757beea380ad1b38faaf8941eae9d8a"}, ] [package.dependencies] @@ -1117,13 +1117,13 @@ grpcio-gcp = ["grpcio-gcp (>=0.2.2,<1.0.dev0)"] [[package]] name = "google-api-python-client" -version
= "2.159.0" +version = "2.160.0" description = "Google API Client Library for Python" optional = false python-versions = ">=3.7" files = [ - {file = "google_api_python_client-2.159.0-py2.py3-none-any.whl", hash = "sha256:baef0bb631a60a0bd7c0bf12a5499e3a40cd4388484de7ee55c1950bf820a0cf"}, - {file = "google_api_python_client-2.159.0.tar.gz", hash = "sha256:55197f430f25c907394b44fa078545ffef89d33fd4dca501b7db9f0d8e224bd6"}, + {file = "google_api_python_client-2.160.0-py2.py3-none-any.whl", hash = "sha256:63d61fb3e4cf3fb31a70a87f45567c22f6dfe87bbfa27252317e3e2c42900db4"}, + {file = "google_api_python_client-2.160.0.tar.gz", hash = "sha256:a8ccafaecfa42d15d5b5c3134ced8de08380019717fc9fb1ed510ca58eca3b7e"}, ] [package.dependencies] @@ -1367,13 +1367,13 @@ zstd = ["zstandard (>=0.18.0)"] [[package]] name = "huggingface-hub" -version = "0.27.1" +version = "0.28.0" description = "Client library to download and publish models, datasets and other repos on the huggingface.co hub" optional = false python-versions = ">=3.8.0" files = [ - {file = "huggingface_hub-0.27.1-py3-none-any.whl", hash = "sha256:1c5155ca7d60b60c2e2fc38cbb3ffb7f7c3adf48f824015b219af9061771daec"}, - {file = "huggingface_hub-0.27.1.tar.gz", hash = "sha256:c004463ca870283909d715d20f066ebd6968c2207dae9393fdffb3c1d4d8f98b"}, + {file = "huggingface_hub-0.28.0-py3-none-any.whl", hash = "sha256:71cff4e500efe68061d94b7f6d3114e183715088be7a90bf4dd84af83b5f5cdb"}, + {file = "huggingface_hub-0.28.0.tar.gz", hash = "sha256:c2b18c02a47d4384763caddb4d0ab2a8fc6c16e0800d6de4d55d0a896244aba3"}, ] [package.dependencies] @@ -1386,13 +1386,13 @@ tqdm = ">=4.42.1" typing-extensions = ">=3.7.4.3" [package.extras] -all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +all = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] cli = ["InquirerPy (==0.3.4)"] -dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.5.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", "typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] +dev = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "libcst (==1.4.0)", "mypy (==1.5.1)", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "ruff (>=0.9.0)", "soundfile", "types-PyYAML", "types-requests", "types-simplejson", "types-toml", "types-tqdm", "types-urllib3", 
"typing-extensions (>=4.8.0)", "urllib3 (<2.0)"] fastai = ["fastai (>=2.4)", "fastcore (>=1.3.27)", "toml"] hf-transfer = ["hf-transfer (>=0.1.4)"] inference = ["aiohttp"] -quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.5.0)"] +quality = ["libcst (==1.4.0)", "mypy (==1.5.1)", "ruff (>=0.9.0)"] tensorflow = ["graphviz", "pydot", "tensorflow"] tensorflow-testing = ["keras (<3.0)", "tensorflow"] testing = ["InquirerPy (==0.3.4)", "Jinja2", "Pillow", "aiohttp", "fastapi", "gradio (>=4.0.0)", "jedi", "numpy", "pytest (>=8.1.1,<8.2.2)", "pytest-asyncio", "pytest-cov", "pytest-env", "pytest-mock", "pytest-rerunfailures", "pytest-vcr", "pytest-xdist", "soundfile", "urllib3 (<2.0)"] @@ -2085,13 +2085,13 @@ testing = ["coverage", "pytest", "pytest-cov", "pytest-regressions"] [[package]] name = "markdown2" -version = "2.5.2" +version = "2.5.3" description = "A fast and complete Python implementation of Markdown" optional = false python-versions = "<4,>=3.9" files = [ - {file = "markdown2-2.5.2-py3-none-any.whl", hash = "sha256:bed80d301a33845be633acde47a67cf265c57ddf9cbe3cb11c49c18016c2f581"}, - {file = "markdown2-2.5.2.tar.gz", hash = "sha256:3ac02226a901c4b2f6fc21dbd17c26d118d2c25bcbb28cee093a1f8b5c46f3f1"}, + {file = "markdown2-2.5.3-py3-none-any.whl", hash = "sha256:a8ebb7e84b8519c37bf7382b3db600f1798a22c245bfd754a1f87ca8d7ea63b3"}, + {file = "markdown2-2.5.3.tar.gz", hash = "sha256:4d502953a4633408b0ab3ec503c5d6984d1b14307e32b325ec7d16ea57524895"}, ] [package.extras] @@ -2212,13 +2212,13 @@ files = [ [[package]] name = "mistune" -version = "3.1.0" +version = "3.1.1" description = "A sane and fast Markdown parser with useful plugins and renderers" optional = false python-versions = ">=3.8" files = [ - {file = "mistune-3.1.0-py3-none-any.whl", hash = "sha256:b05198cf6d671b3deba6c87ec6cf0d4eb7b72c524636eddb6dbf13823b52cee1"}, - {file = "mistune-3.1.0.tar.gz", hash = "sha256:dbcac2f78292b9dc066cd03b7a3a26b62d85f8159f2ea5fd28e55df79908d667"}, + {file = "mistune-3.1.1-py3-none-any.whl", hash = "sha256:02106ac2aa4f66e769debbfa028509a275069dcffce0dfa578edd7b991ee700a"}, + {file = "mistune-3.1.1.tar.gz", hash = "sha256:e0740d635f515119f7d1feb6f9b192ee60f0cc649f80a8f944f905706a21654c"}, ] [package.dependencies] @@ -2371,13 +2371,13 @@ dill = ">=0.3.8" [[package]] name = "narwhals" -version = "1.23.0" +version = "1.24.0" description = "Extremely lightweight compatibility layer between dataframe libraries" optional = false python-versions = ">=3.8" files = [ - {file = "narwhals-1.23.0-py3-none-any.whl", hash = "sha256:8d6e7fa0b13af01784837efc060e2a663e5d888decf31f261ff8fc06a7cefeb4"}, - {file = "narwhals-1.23.0.tar.gz", hash = "sha256:3da4b1e7675b3d8ed69bd40c263b135066248af28354f104ea36c788b23d1e3e"}, + {file = "narwhals-1.24.0-py3-none-any.whl", hash = "sha256:73ff60578641059221de2e4f337bfdf0260378fb1553f787d27411602cfc5e72"}, + {file = "narwhals-1.24.0.tar.gz", hash = "sha256:23f0a05efbe29864d184842dd6bf11c044210bca1d443d6dbffe7e65a70bf063"}, ] [package.extras] @@ -2419,13 +2419,13 @@ test = ["flaky", "ipykernel (>=6.19.3)", "ipython", "ipywidgets", "nbconvert (>= [[package]] name = "nbconvert" -version = "7.16.5" +version = "7.16.6" description = "Converting Jupyter Notebooks (.ipynb files) to other formats. Output formats include asciidoc, html, latex, markdown, pdf, py, rst, script. nbconvert can be used both as a Python library (`import nbconvert`) or as a command line tool (invoked as `jupyter nbconvert ...`)." 
optional = false python-versions = ">=3.8" files = [ - {file = "nbconvert-7.16.5-py3-none-any.whl", hash = "sha256:e12eac052d6fd03040af4166c563d76e7aeead2e9aadf5356db552a1784bd547"}, - {file = "nbconvert-7.16.5.tar.gz", hash = "sha256:c83467bb5777fdfaac5ebbb8e864f300b277f68692ecc04d6dab72f2d8442344"}, + {file = "nbconvert-7.16.6-py3-none-any.whl", hash = "sha256:1375a7b67e0c2883678c48e506dc320febb57685e5ee67faa51b18a90f3a712b"}, + {file = "nbconvert-7.16.6.tar.gz", hash = "sha256:576a7e37c6480da7b8465eefa66c17844243816ce1ccc372633c6b71c3c0f582"}, ] [package.dependencies] @@ -2923,13 +2923,13 @@ testing = ["docopt", "pytest"] [[package]] name = "pdftext" -version = "0.5.0" +version = "0.5.1" description = "Extract structured text from pdfs quickly" optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "pdftext-0.5.0-py3-none-any.whl", hash = "sha256:e14179c5039c711dc5c490ecb1bc15c92ab920e5f7715034b7ae5a387b3b2787"}, - {file = "pdftext-0.5.0.tar.gz", hash = "sha256:f6487d170abc97867d7539774fecdb0a17599965ba88287b3b89731f5cd7d612"}, + {file = "pdftext-0.5.1-py3-none-any.whl", hash = "sha256:6de0406473846f6486b969fb4b1832b94ebe4c92a4bae5f3d1ead645d43d9994"}, + {file = "pdftext-0.5.1.tar.gz", hash = "sha256:81646068c98df4874064f739f507908543188e93e1a5d84b30a0989329f32af6"}, ] [package.dependencies] @@ -3201,13 +3201,13 @@ files = [ [[package]] name = "proto-plus" -version = "1.25.0" -description = "Beautiful, Pythonic protocol buffers." +version = "1.26.0" +description = "Beautiful, Pythonic protocol buffers" optional = false python-versions = ">=3.7" files = [ - {file = "proto_plus-1.25.0-py3-none-any.whl", hash = "sha256:c91fc4a65074ade8e458e95ef8bac34d4008daa7cce4a12d6707066fca648961"}, - {file = "proto_plus-1.25.0.tar.gz", hash = "sha256:fbb17f57f7bd05a68b7707e745e26528b0b3c34e378db91eef93912c54982d91"}, + {file = "proto_plus-1.26.0-py3-none-any.whl", hash = "sha256:bf2dfaa3da281fc3187d12d224c707cb57214fb2c22ba854eb0c105a3fb2d4d7"}, + {file = "proto_plus-1.26.0.tar.gz", hash = "sha256:6e93d5f5ca267b54300880fff156b6a3386b3fa3f43b1da62e680fc0c586ef22"}, ] [package.dependencies] @@ -4032,13 +4032,13 @@ all = ["numpy"] [[package]] name = "referencing" -version = "0.36.1" +version = "0.36.2" description = "JSON Referencing + Python" optional = false python-versions = ">=3.9" files = [ - {file = "referencing-0.36.1-py3-none-any.whl", hash = "sha256:363d9c65f080d0d70bc41c721dce3c7f3e77fc09f269cd5c8813da18069a6794"}, - {file = "referencing-0.36.1.tar.gz", hash = "sha256:ca2e6492769e3602957e9b831b94211599d2aade9477f5d44110d2530cf9aade"}, + {file = "referencing-0.36.2-py3-none-any.whl", hash = "sha256:e8699adbbf8b5c7de96d8ffa0eb5c158b3beafce084968e2ea8bb08c6794dcd0"}, + {file = "referencing-0.36.2.tar.gz", hash = "sha256:df2e89862cd09deabbdba16944cc3f10feb6b3e6f18e902f7cc25609a34775aa"}, ] [package.dependencies] @@ -5489,4 +5489,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "6eb647ac20025351bfd8048a8407855c8f0a51760a2944f1da6c3685b9a8ada7" +content-hash = "33297ed1b238e67f880534882876a29012559d3263ceba2ba0cdd738598af00c" diff --git a/pyproject.toml b/pyproject.toml index c5a0b4e1..d838b9e0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -12,7 +12,7 @@ packages = [ ] include = [ "marker/scripts/*.sh", - "marker/scripts/marker/scripts/streamlit_app_blocks_viz.html", + "marker/scripts/*.html", ] [tool.poetry.dependencies] From e534a14a2e2a9e73301983779060087aa36a21f0 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Tue, 28 Jan 
2025 16:37:45 -0500
Subject: [PATCH 04/11] Bump surya

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index d838b9e0..6878e04e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -27,7 +27,7 @@ tqdm = "^4.66.1"
 ftfy = "^6.1.1"
 texify = "^0.2.1"
 rapidfuzz = "^3.8.1"
-surya-ocr = "~0.9.3"
+surya-ocr = "~0.10.0"
 regex = "^2024.4.28"
 pdftext = "~0.5.1"
 markdownify = "^0.13.1"

From 5fdb25b8b50794e05d6329c8589322b3fc073fa0 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Tue, 28 Jan 2025 17:56:07 -0500
Subject: [PATCH 05/11] Swap to new texify model

---
 marker/models.py | 33 ++-------------------------------
 1 file changed, 2 insertions(+), 31 deletions(-)

diff --git a/marker/models.py b/marker/models.py
index 908fb863..80dc254c 100644
--- a/marker/models.py
+++ b/marker/models.py
@@ -1,41 +1,12 @@
 import os
-
-from marker.settings import settings
-
-os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for a simple op, which is not supported on MPS
-
-from typing import List
-from PIL import Image
+os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" # Transformers uses .isin for an op, which is not supported on MPS

 from surya.detection import DetectionPredictor
 from surya.layout import LayoutPredictor
 from surya.ocr_error import OCRErrorPredictor
 from surya.recognition import RecognitionPredictor
 from surya.table_rec import TableRecPredictor
-
-from texify.model.model import load_model as load_texify_model
-from texify.model.processor import load_processor as load_texify_processor
-from texify.inference import batch_inference
-
-
-class TexifyPredictor:
-    def __init__(self, device=None, dtype=None):
-        if not device:
-            device = settings.TORCH_DEVICE_MODEL
-        if not dtype:
-            dtype = settings.TEXIFY_DTYPE
-
-        self.model = load_texify_model(checkpoint=settings.TEXIFY_MODEL_NAME, device=device, dtype=dtype)
-        self.processor = load_texify_processor()
-        self.device = device
-        self.dtype = dtype
-
-    def __call__(self, batch_images: List[Image.Image], max_tokens: int):
-        return batch_inference(
-            batch_images,
-            self.model,
-            self.processor,
-            max_tokens=max_tokens
-        )
+from surya.texify import TexifyPredictor


 def create_model_dict(device=None, dtype=None) -> dict:

From 597db72f9a30acd0304774585cdfbe64e8e0d34f Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Wed, 29 Jan 2025 07:34:43 -0500
Subject: [PATCH 06/11] Integrate new texify model

---
 marker/processors/equation.py | 85 ++++-------------------------------
 marker/renderers/markdown.py  | 11 +++--
 2 files changed, 16 insertions(+), 80 deletions(-)

diff --git a/marker/processors/equation.py b/marker/processors/equation.py
index 868f98a2..5f5be17c 100644
--- a/marker/processors/equation.py
+++ b/marker/processors/equation.py
@@ -22,7 +22,7 @@ class EquationProcessor(BaseProcessor):
     model_max_length: Annotated[
         int,
         "The maximum number of tokens to allow for the Texify model.",
-    ] = 384
+    ] = 768
     texify_batch_size: Annotated[
         Optional[int],
         "The batch size to use for the Texify model.",
@@ -65,27 +65,7 @@ def __call__(self, document: Document):
                 continue

             block = document.get_block(equation_d["block_id"])
-            block.html = self.parse_latex_to_html(prediction)
-
-    def parse_latex_to_html(self, latex: str):
-        html_out = ""
-        try:
-            latex = self.parse_latex(latex)
-        except ValueError as e:
-            # If we have mismatched delimiters, we'll treat it as a single block
-            # Strip the $'s from the latex
-            latex = [
-                {"class": "block", "content": latex.replace("$", "")}
-            ]
-
-        for el in latex:
-            if el["class"] == "block":
-                html_out += f'<math display="block">{el["content"]}</math>'
-            elif el["class"] == "inline":
-                html_out += f'<math>{el["content"]}</math>'
-            else:
-                html_out += f" {el['content']} "
-        return html_out.strip()
+            block.html = prediction

     def get_batch_size(self):
         if self.texify_batch_size is not None:
@@ -106,71 +86,22 @@ def get_latex_batched(self, equation_data: List[dict]):
             max_idx = min(min_idx + batch_size, len(equation_data))
             batch_equations = equation_data[min_idx:max_idx]

-            max_length = max([eq["token_count"] for eq in batch_equations])
-            max_length = min(max_length, self.model_max_length)
-            max_length += self.token_buffer
-
             batch_images = [eq["image"] for eq in batch_equations]

             model_output = self.texify_model(
-                batch_images,
-                max_tokens=max_length
+                batch_images
             )

             for j, output in enumerate(model_output):
-                token_count = self.get_total_texify_tokens(output)
-                if token_count >= max_length - 1:
-                    output = ""
+                token_count = self.get_total_texify_tokens(output.text)
+                if token_count >= self.model_max_length - 1:
+                    output.text = ""

                 image_idx = i + j
-                predictions[image_idx] = output
+                predictions[image_idx] = output.text
         return predictions

     def get_total_texify_tokens(self, text):
         tokenizer = self.texify_model.processor.tokenizer
         tokens = tokenizer(text)
-        return len(tokens["input_ids"])
-
-
-    @staticmethod
-    def parse_latex(text: str):
-        if text.count("$") % 2 != 0:
-            raise ValueError("Mismatched delimiters in LaTeX")
-
-        DELIMITERS = [
-            ("$$", "block"),
-            ("$", "inline")
-        ]
-
-        text = text.replace("\n", "<br>")  # we can't handle \n's inside <br> properly if we don't do this
-
-        i = 0
-        stack = []
-        result = []
-        buffer = ""
-
-        while i < len(text):
-            for delim, class_name in DELIMITERS:
-                if text[i:].startswith(delim):
-                    if stack and stack[-1] == delim:  # Closing
-                        stack.pop()
-                        result.append({"class": class_name, "content": buffer})
-                        buffer = ""
-                        i += len(delim)
-                        break
-                    elif not stack:  # Opening
-                        if buffer:
-                            result.append({"class": "text", "content": buffer})
-                        stack.append(delim)
-                        buffer = ""
-                        i += len(delim)
-                        break
-                    else:
-                        raise ValueError(f"Nested {class_name} delimiters not supported")
-            else:  # No delimiter match
-                buffer += text[i]
-                i += 1
-
-        if buffer:
-            result.append({"class": "text", "content": buffer})
-        return result
+        return len(tokens["input_ids"])
\ No newline at end of file
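[Reviewer note, not part of the patch: the new call pattern, inferred from this diff. surya's TexifyPredictor takes a list of PIL images and returns one prediction per image with a .text attribute, which now lands directly in block.html. The no-argument constructor and the image path are assumptions for illustration.]

    from PIL import Image
    from surya.texify import TexifyPredictor

    texify_model = TexifyPredictor()           # assumed to load the default checkpoint
    crops = [Image.open("equation_crop.png")]  # placeholder equation crop
    predictions = texify_model(crops)          # no max_tokens argument anymore
    for pred in predictions:
        print(pred.text)                       # LaTeX/HTML for the equation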
diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
index 0762ab3c..f6b78933 100644
--- a/marker/renderers/markdown.py
+++ b/marker/renderers/markdown.py
@@ -12,12 +12,16 @@
 from marker.schema.document import Document


+def escape_dollars(text):
+    return text.replace("$", r"\$")
+
+
 def cleanup_text(full_text):
     full_text = re.sub(r'\n{3,}', '\n\n', full_text)
     full_text = re.sub(r'(\n\s){3,}', '\n\n', full_text)
     return full_text.strip()


 def get_formatted_table_text(element):
+
     text = []
     for content in element.contents:
         if content is None:
             continue

         if isinstance(content, NavigableString):
             stripped = content.strip()
             if stripped:
-                text.append(stripped)
+                text.append(escape_dollars(stripped))
         elif content.name == 'br':
             text.append('<br>')
         elif content.name == "math":
             text.append("$" + content.text + "$")
         else:
-            text.append(str(content))
+            content_str = escape_dollars(str(content))
+            text.append(content_str)

     full_text = ""
     for i, t in enumerate(text):
@@ -120,7 +125,7 @@ def convert_table(self, el, text, convert_as_inline):
                         if r == 0 and c == 0:
                             grid[row_idx][col_idx] = value
                         else:
-                            grid[row_idx + r][col_idx + c] = ''
+                            grid[row_idx + r][col_idx + c] = ''  # Empty cell due to rowspan/colspan
                 except IndexError:
                     # Sometimes the colspan/rowspan predictions can overflow
                     print(f"Overflow in columns: {col_idx + c} >= {total_cols}")

From 1a9734f4f78d80080a9af31ad1b131a9aa0fdd0f Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Wed, 29 Jan 2025 09:55:55 -0500
Subject: [PATCH 07/11] Add test for partial row splitting

---
 marker/processors/table.py               |  3 ++-
 tests/processors/test_table_processor.py | 14 ++++++++++++++
 2 files changed, 16 insertions(+), 1 deletion(-)

diff --git a/marker/processors/table.py b/marker/processors/table.py
index 1f455848..783a73ce 100644
--- a/marker/processors/table.py
+++ b/marker/processors/table.py
@@ -226,7 +226,8 @@ def split_combined_rows(self, tables: List[TableResult]):
                         new_cell_count += 1

                 # For each new row we add, shift up subsequent rows
-                shift_up += line_lens[0] - 1
+                # The max is to account for partial rows
+                shift_up += max(line_lens) - 1
             else:
                 for cell in row_cells:
                     cell.row_id += shift_up

diff --git a/tests/processors/test_table_processor.py b/tests/processors/test_table_processor.py
index 79224a58..72e2a04b 100644
--- a/tests/processors/test_table_processor.py
+++ b/tests/processors/test_table_processor.py
@@ -1,3 +1,5 @@
+from typing import List
+
 import pytest

 from marker.renderers.json import JSONRenderer
@@ -63,3 +65,15 @@ def test_ocr_table(pdf_document, detection_model, recognition_model, table_rec_m
     table_output = renderer(pdf_document)
     assert "1.2E-38" in table_output.markdown
+
+
+@pytest.mark.config({"page_range": [11]})
+def test_split_rows(pdf_document, detection_model, recognition_model, table_rec_model):
+    processor = TableProcessor(detection_model, recognition_model, table_rec_model)
+    processor(pdf_document)
+
+    table = pdf_document.contained_blocks((BlockTypes.Table,))[-1]
+    cells: List[TableCell] = table.contained_blocks(pdf_document, (BlockTypes.TableCell,))
+    unique_rows = len(set([cell.row_id for cell in cells]))
+    assert unique_rows == 6
+

From 40fcb727f906867a9c0938c557f9f019ffa41e3c Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Wed, 29 Jan 2025 10:07:41 -0500
Subject: [PATCH 08/11] Update surya and pdftext

---
 .github/workflows/benchmark.yml |  3 +--
 poetry.lock                     | 14 +++++++-------
 2 files changed, 8 insertions(+), 9 deletions(-)

diff --git a/.github/workflows/benchmark.yml b/.github/workflows/benchmark.yml
index 0fcb2380..5d49aa1c 100644
--- a/.github/workflows/benchmark.yml
+++ b/.github/workflows/benchmark.yml
@@ -4,11 +4,10 @@ on: [push]

 env:
   TORCH_DEVICE: "cpu"
-  OCR_ENGINE: "surya"

 jobs:
   benchmark:
-    runs-on: ubuntu-latest
+    runs-on: [ubuntu-latest, windows-latest]
     steps:
       - uses: actions/checkout@v3
       - name: Set up Python 3.11
diff --git a/poetry.lock b/poetry.lock
index 1ddc9a0f..652f9c68 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -2371,13 +2371,13 @@ dill = ">=0.3.8"

 [[package]]
 name = "narwhals"
-version = "1.24.0"
+version = "1.24.1"
 description = "Extremely lightweight compatibility layer between dataframe libraries"
 optional = false
 python-versions = ">=3.8"
 files = [
"sha256:73ff60578641059221de2e4f337bfdf0260378fb1553f787d27411602cfc5e72"}, - {file = "narwhals-1.24.0.tar.gz", hash = "sha256:23f0a05efbe29864d184842dd6bf11c044210bca1d443d6dbffe7e65a70bf063"}, + {file = "narwhals-1.24.1-py3-none-any.whl", hash = "sha256:d8983fe14851c95d60576ddca37c094bd4ed24ab9ea98396844fb20ad9aaf184"}, + {file = "narwhals-1.24.1.tar.gz", hash = "sha256:b09b8253d945f23cdb683a84685abf3afb9f96114d89e9f35dc876e143f65007"}, ] [package.extras] @@ -4641,13 +4641,13 @@ snowflake = ["snowflake-connector-python (>=2.8.0)", "snowflake-snowpark-python[ [[package]] name = "surya-ocr" -version = "0.9.3" +version = "0.10.0" description = "OCR, layout, reading order, and table recognition in 90+ languages" optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "surya_ocr-0.9.3-py3-none-any.whl", hash = "sha256:6013131f3af004f93ab5422dfa8c49a83aa72beb2f8120fd59dca04803d98009"}, - {file = "surya_ocr-0.9.3.tar.gz", hash = "sha256:a69347a3c85c04d48e3df62d11f045dc13e22ab8b3efebfdae1dd94f05a25b99"}, + {file = "surya_ocr-0.10.0-py3-none-any.whl", hash = "sha256:ccad25a308eefd61a21b2c97fc3f5b8364887e09f197a3aaa5fee30c03f81ae1"}, + {file = "surya_ocr-0.10.0.tar.gz", hash = "sha256:966bc0c1aef346df42e458d2c1cbc95665004ea61020577e1656789107d09119"}, ] [package.dependencies] @@ -5489,4 +5489,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "33297ed1b238e67f880534882876a29012559d3263ceba2ba0cdd738598af00c" +content-hash = "d43373ff00de4feb00b0aed4fe98d2a84ecb5742d1a916cabbace5104f888d54" From 5b1e205d030d3bfde49fd20728d58cb6eac9c530 Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Wed, 29 Jan 2025 10:23:16 -0500 Subject: [PATCH 09/11] Clean up table split logic --- marker/processors/table.py | 35 ++++++++++++++++++++++++++--------- 1 file changed, 26 insertions(+), 9 deletions(-) diff --git a/marker/processors/table.py b/marker/processors/table.py index 783a73ce..391ab4b3 100644 --- a/marker/processors/table.py +++ b/marker/processors/table.py @@ -53,6 +53,10 @@ class TableProcessor(BaseProcessor): int, "The number of workers to use for pdftext.", ] = 4 + row_split_threshold: Annotated[ + float, + "The percentage of rows that need to be split across the table before row splitting is active.", + ] = 0.5 def __init__( self, @@ -171,10 +175,7 @@ def split_combined_rows(self, tables: List[TableResult]): # Skip empty tables continue unique_rows = sorted(list(set([c.row_id for c in table.cells]))) - new_cells = [] - shift_up = 0 - max_cell_id = max([c.cell_id for c in table.cells]) - new_cell_count = 0 + row_info = [] for row in unique_rows: # Cells in this row # Deepcopy is because we do an in-place mutation later, and that can cause rows to shift to match rows in unique_rows @@ -201,9 +202,25 @@ def split_combined_rows(self, tables: List[TableResult]): len(line_lens_counter) == 2 and counter_keys[0] <= 1 and counter_keys[1] > 1 and line_lens_counter[counter_keys[0]] == 1, # Allow a single column with a single line - keys are the line lens, values are the counts ]) should_split = should_split_entire_row or should_split_partial_row - if should_split: - for i in range(0, max(line_lens)): - for cell in row_cells: + row_info.append({ + "should_split": should_split, + "row_cells": row_cells, + "line_lens": line_lens + }) + + # Don't split if we're not splitting most of the rows in the table. This avoids splitting stray multiline rows. 
+            if sum([r["should_split"] for r in row_info]) / len(row_info) < self.row_split_threshold:
+                continue
+
+            new_cells = []
+            shift_up = 0
+            max_cell_id = max([c.cell_id for c in table.cells])
+            new_cell_count = 0
+            for row, item_info in zip(unique_rows, row_info):
+                max_lines = max(item_info["line_lens"])
+                if item_info["should_split"]:
+                    for i in range(0, max_lines):
+                        for cell in item_info["row_cells"]:
                             # Calculate height based on number of splits
                             split_height = cell.bbox[3] - cell.bbox[1]
                             current_bbox = [cell.bbox[0], cell.bbox[1] + i * split_height, cell.bbox[2], cell.bbox[1] + (i + 1) * split_height]
@@ -227,9 +244,9 @@ def split_combined_rows(self, tables: List[TableResult]):

                 # For each new row we add, shift up subsequent rows
                 # The max is to account for partial rows
-                shift_up += max(line_lens) - 1
+                shift_up += max_lines - 1
             else:
-                for cell in row_cells:
+                for cell in item_info["row_cells"]:
                     cell.row_id += shift_up
                     new_cells.append(cell)
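[Reviewer note, not part of the patch: a worked example of the threshold gate above, with illustrative numbers. In a 10-row table where only 2 rows look multiline, 2/10 = 0.2 is below row_split_threshold (0.5), so the table is left alone; with 7 qualifying rows, 0.7 clears the gate and splitting proceeds.]

    row_info = [{"should_split": True}] * 2 + [{"should_split": False}] * 8
    split_fraction = sum(r["should_split"] for r in row_info) / len(row_info)
    assert split_fraction == 0.2  # below 0.5, so this table keeps its stray multiline rows intact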
From b90ad5da71f5b2293ddde025840ddfabc94e21c6 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Wed, 29 Jan 2025 10:37:32 -0500
Subject: [PATCH 10/11] Escape dollar signs for better equation rendering

---
 marker/processors/blockquote.py | 2 +-
 marker/processors/reference.py  | 1 -
 marker/processors/table.py      | 1 -
 marker/renderers/markdown.py    | 7 +++++++
 4 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/marker/processors/blockquote.py b/marker/processors/blockquote.py
index bc0b5bec..018a7ecf 100644
--- a/marker/processors/blockquote.py
+++ b/marker/processors/blockquote.py
@@ -63,4 +63,4 @@ def __call__(self, document: Document):
                 next_block.blockquote_level += 1
             elif len(next_block.structure) >= 2 and (x_indent and y_indent):
                 next_block.blockquote = True
-                next_block.blockquote_level = 1
+                next_block.blockquote_level = 1
\ No newline at end of file
diff --git a/marker/processors/reference.py b/marker/processors/reference.py
index f3eb2d06..e38b55a9 100644
--- a/marker/processors/reference.py
+++ b/marker/processors/reference.py
@@ -7,7 +7,6 @@
 from marker.schema.groups.list import ListGroup
 from marker.schema.groups.table import TableGroup
 from marker.schema.registry import get_block_class
-from marker.schema.groups.picture import PictureGroup
 from marker.schema.groups.figure import FigureGroup

diff --git a/marker/processors/table.py b/marker/processors/table.py
index 391ab4b3..75b723c0 100644
--- a/marker/processors/table.py
+++ b/marker/processors/table.py
@@ -3,7 +3,6 @@
 from copy import deepcopy
 from typing import Annotated, List
 from collections import Counter
-from PIL import ImageDraw

 from ftfy import fix_text
 from surya.detection import DetectionPredictor
diff --git a/marker/renderers/markdown.py b/marker/renderers/markdown.py
index f6b78933..9a48fa40 100644
--- a/marker/renderers/markdown.py
+++ b/marker/renderers/markdown.py
@@ -181,6 +181,12 @@ def convert_span(self, el, text, convert_as_inline):
         else:
             return text

+    def escape(self, text):
+        text = super().escape(text)
+        if self.options['escape_dollars']:
+            text = text.replace('$', r'\$')
+        return text
+

 class MarkdownOutput(BaseModel):
     markdown: str
     images: dict
@@ -203,6 +209,7 @@ def __call__(self, document: Document) -> MarkdownOutput:
             escape_misc=False,
             escape_underscores=False,
             escape_asterisks=False,
+            escape_dollars=True,
             sub_symbol="<sub>",
             sup_symbol="<sup>",
             inline_math_delimiters=self.inline_math_delimiters,

From c85d72ba4241aa79a3aacbc9693ddb669d307d57 Mon Sep 17 00:00:00 2001
From: Vik Paruchuri
Date: Wed, 29 Jan 2025 10:40:10 -0500
Subject: [PATCH 11/11] Version bump

---
 pyproject.toml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/pyproject.toml b/pyproject.toml
index 6878e04e..08d8c72e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "1.3.2"
+version = "1.3.3"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri"]
 readme = "README.md"
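[Reviewer note, not part of the patch series: a quick illustration of the dollar escaping that patches 06 and 10 wire into the Markdown renderer. Literal dollars in text are escaped so that the $...$ spans emitted for equations stay the only math-delimited regions; math content itself bypasses this path in the renderer.]

    def escape_dollars(text):
        return text.replace("$", r"\$")

    assert escape_dollars("costs $5") == r"costs \$5"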