From 8cbf79cef70df85f398363420da2fc3e71ad90ca Mon Sep 17 00:00:00 2001
From: Vik Paruchuri <vik.paruchuri@gmail.com>
Date: Mon, 21 Oct 2024 09:07:52 -0400
Subject: [PATCH] Fix OOM errors

---
 marker/debug/render.py |  2 +-
 marker/settings.py     |  1 +
 marker/tables/table.py | 14 ++++++++++++--
 poetry.lock            | 20 ++++++++++----------
 pyproject.toml         |  4 ++--
 5 files changed, 26 insertions(+), 15 deletions(-)

diff --git a/marker/debug/render.py b/marker/debug/render.py
index 2580241d..299baee2 100644
--- a/marker/debug/render.py
+++ b/marker/debug/render.py
@@ -10,7 +10,7 @@ def get_font_path() -> str:
 
     if not os.path.exists(font_path):
         os.makedirs(os.path.dirname(font_path), exist_ok=True)
-        font_dl_path = f"{settings.RECOGNITION_FONT_DL_BASE}/{os.path.basename(font_path)}"
+        font_dl_path = f"{settings.FONT_DL_BASE}/{os.path.basename(font_path)}"
         with requests.get(font_dl_path, stream=True) as r, open(font_path, 'wb') as f:
             r.raise_for_status()
             for chunk in r.iter_content(chunk_size=8192):
diff --git a/marker/settings.py b/marker/settings.py
index beaccf62..ee0756ea 100644
--- a/marker/settings.py
+++ b/marker/settings.py
@@ -79,6 +79,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
 
     # Table models
     SURYA_TABLE_DPI: int = 192
+    TABLE_REC_BATCH_SIZE: Optional[int] = None
 
     # Headings
     HEADING_LEVEL_COUNT: int = 4
diff --git a/marker/tables/table.py b/marker/tables/table.py
index 1cb16a12..66981a5b 100644
--- a/marker/tables/table.py
+++ b/marker/tables/table.py
@@ -12,10 +12,20 @@
 from marker.schema.block import Line, Span, Block
 from marker.schema.page import Page
 from typing import List
+from marker.ocr.recognition import get_batch_size as get_ocr_batch_size
+from marker.ocr.detection import get_batch_size as get_detector_batch_size
 
 from marker.settings import settings
 
 
+def get_batch_size():  # Batch size that format_tables passes to recognize_tables as table_rec_batch_size.
+    if settings.TABLE_REC_BATCH_SIZE is not None:  # an explicit value in settings always wins
+        return settings.TABLE_REC_BATCH_SIZE
+    elif settings.TORCH_DEVICE_MODEL == "cuda":
+        return 6  # NOTE(review): identical to the fallback below, so this branch is currently redundant; presumably a hook for per-device tuning - confirm
+    return 6  # default used for every other device
+
+
 def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
     table_imgs = []
     table_counts = []
@@ -83,11 +93,11 @@ def format_tables(pages: List[Page], doc: PdfDocument, fname: str, detection_mod
     # Don't look at table cell detection tqdm output
     tqdm.disable = True
     table_imgs, table_boxes, table_counts, table_text_lines, img_sizes = get_table_boxes(pages, doc, fname)
-    cells, needs_ocr = get_cells(table_imgs, table_boxes, img_sizes, table_text_lines, det_models, detect_boxes=settings.OCR_ALL_PAGES)
+    cells, needs_ocr = get_cells(table_imgs, table_boxes, img_sizes, table_text_lines, det_models, detect_boxes=settings.OCR_ALL_PAGES, detector_batch_size=get_detector_batch_size())
     tqdm.disable = False
 
     # This will redo OCR if OCR is forced, since we need to redetect bounding boxes, etc.
-    table_rec = recognize_tables(table_imgs, cells, needs_ocr, rec_models)
+    table_rec = recognize_tables(table_imgs, cells, needs_ocr, rec_models, table_rec_batch_size=get_batch_size(), ocr_batch_size=get_ocr_batch_size())
     cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, img_sizes)]
     table_md = [formatter("markdown", cell)[0] for cell in cells]
 
diff --git a/poetry.lock b/poetry.lock
index bb66b083..b8312c4a 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1935,13 +1935,13 @@ dill = ">=0.3.8"
 
 [[package]]
 name = "narwhals"
-version = "1.9.4"
+version = "1.10.0"
 description = "Extremely lightweight compatibility layer between dataframe libraries"
 optional = false
 python-versions = ">=3.8"
 files = [
-    {file = "narwhals-1.9.4-py3-none-any.whl", hash = "sha256:cdd16f73268a3f0d3327aa9e4c6ab25a0b277629d6710bef58e86f40e57e5cc9"},
-    {file = "narwhals-1.9.4.tar.gz", hash = "sha256:5de1f2d7bfbe555573d945fe1d760469a05784f3e69b7bc1b5b1303aae7946a1"},
+    {file = "narwhals-1.10.0-py3-none-any.whl", hash = "sha256:c83a378960651c391e5f3d68af3a821eda74c9713073518fe0c39aefc5ad8f8e"},
+    {file = "narwhals-1.10.0.tar.gz", hash = "sha256:a380e64110c3169c4b0b592c5b64ae6dc4cce76e9d3c56edc608a8ae5994cfc1"},
 ]
 
 [package.extras]
@@ -2045,13 +2045,13 @@ files = [
 
 [[package]]
 name = "networkx"
-version = "3.4.1"
+version = "3.4.2"
 description = "Python package for creating and manipulating graphs and networks"
 optional = false
 python-versions = ">=3.10"
 files = [
-    {file = "networkx-3.4.1-py3-none-any.whl", hash = "sha256:e30a87b48c9a6a7cc220e732bffefaee585bdb166d13377734446ce1a0620eed"},
-    {file = "networkx-3.4.1.tar.gz", hash = "sha256:f9df45e85b78f5bd010993e897b4f1fdb242c11e015b101bd951e5c0e29982d8"},
+    {file = "networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f"},
+    {file = "networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1"},
 ]
 
 [package.extras]
@@ -4217,13 +4217,13 @@ dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"]
 
 [[package]]
 name = "tabled-pdf"
-version = "0.1.1"
+version = "0.1.2"
 description = "Detect and recognize tables in PDFs and images."
 optional = false
 python-versions = "<4.0,>=3.10"
 files = [
-    {file = "tabled_pdf-0.1.1-py3-none-any.whl", hash = "sha256:ec40cef5d5348127ccfbbf519ec83a8f54511986944cbf391521af316904789b"},
-    {file = "tabled_pdf-0.1.1.tar.gz", hash = "sha256:1db0518881473fe33f402c59555d3d460ceed86a92b5fe0ecb07841f82760c5c"},
+    {file = "tabled_pdf-0.1.2-py3-none-any.whl", hash = "sha256:21b39b097f6055884b4f1f21f66daf75d7f27588dbc06875ad7b5b7eccb60a05"},
+    {file = "tabled_pdf-0.1.2.tar.gz", hash = "sha256:d443eca41cd126f311527507cfc5a7a45c9a69efce2ec61beb91f9caeb44de39"},
 ]
 
 [package.dependencies]
@@ -5076,4 +5076,4 @@ propcache = ">=0.2.0"
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.10"
-content-hash = "b5016a59dd59840e90f8e59b8d019868648d447905122c690c37dbda8369d726"
+content-hash = "5dba5f1eb047fda1b8efc029069486ccec132e44a88a3aa8533b0dd5e7b06186"
diff --git a/pyproject.toml b/pyproject.toml
index 31295321..b6b6cb22 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.3.1"
+version = "0.3.2"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <github@vikas.sh>"]
 readme = "README.md"
@@ -36,7 +36,7 @@ surya-ocr = "^0.6.6"
 filetype = "^1.2.0"
 regex = "^2024.4.28"
 pdftext = "^0.3.17"
-tabled-pdf = "^0.1.1"
+tabled-pdf = "^0.1.2"
 
 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"