Merge pull request #307 from VikParuchuri/dev

Fix OOM errors
VikParuchuri · Oct 21, 2024 · 93a3ca6
2 parents 361f9b5 + 8cbf79c
commit 93a3ca6
Show file tree

Hide file tree

Showing 5 changed files with 26 additions and 15 deletions.
diff --git a/marker/debug/render.py b/marker/debug/render.py
@@ -10,7 +10,7 @@ def get_font_path() -> str:
 
     if not os.path.exists(font_path):
         os.makedirs(os.path.dirname(font_path), exist_ok=True)
-        font_dl_path = f"{settings.RECOGNITION_FONT_DL_BASE}/{os.path.basename(font_path)}"
+        font_dl_path = f"{settings.FONT_DL_BASE}/{os.path.basename(font_path)}"
         with requests.get(font_dl_path, stream=True) as r, open(font_path, 'wb') as f:
             r.raise_for_status()
             for chunk in r.iter_content(chunk_size=8192):

diff --git a/marker/settings.py b/marker/settings.py
@@ -79,6 +79,7 @@ def TORCH_DEVICE_MODEL(self) -> str:
 
     # Table models
     SURYA_TABLE_DPI: int = 192
+    TABLE_REC_BATCH_SIZE: Optional[int] = None
 
     # Headings
     HEADING_LEVEL_COUNT: int = 4

diff --git a/marker/tables/table.py b/marker/tables/table.py
@@ -12,10 +12,20 @@
 from marker.schema.block import Line, Span, Block
 from marker.schema.page import Page
 from typing import List
+from marker.ocr.recognition import get_batch_size as get_ocr_batch_size
+from marker.ocr.detection import get_batch_size as get_detector_batch_size
 
 from marker.settings import settings
 
 
+def get_batch_size():  # Returns the batch size handed to table recognition (used as table_rec_batch_size).
+    if settings.TABLE_REC_BATCH_SIZE is not None:  # an explicitly configured value takes precedence
+        return settings.TABLE_REC_BATCH_SIZE
+    elif settings.TORCH_DEVICE_MODEL == "cuda":
+        return 6  # NOTE(review): same value as the fallback below, so the device check does not change the result
+    return 6  # default for every other device
+
+
 def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
     table_imgs = []
     table_counts = []
@@ -83,11 +93,11 @@ def format_tables(pages: List[Page], doc: PdfDocument, fname: str, detection_mod
     # Don't look at table cell detection tqdm output
     tqdm.disable = True
     table_imgs, table_boxes, table_counts, table_text_lines, img_sizes = get_table_boxes(pages, doc, fname)
-    cells, needs_ocr = get_cells(table_imgs, table_boxes, img_sizes, table_text_lines, det_models, detect_boxes=settings.OCR_ALL_PAGES)
+    cells, needs_ocr = get_cells(table_imgs, table_boxes, img_sizes, table_text_lines, det_models, detect_boxes=settings.OCR_ALL_PAGES, detector_batch_size=get_detector_batch_size())
     tqdm.disable = False
 
     # This will redo OCR if OCR is forced, since we need to redetect bounding boxes, etc.
-    table_rec = recognize_tables(table_imgs, cells, needs_ocr, rec_models)
+    table_rec = recognize_tables(table_imgs, cells, needs_ocr, rec_models, table_rec_batch_size=get_batch_size(), ocr_batch_size=get_ocr_batch_size())
     cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, img_sizes)]
     table_md = [formatter("markdown", cell)[0] for cell in cells]
 

diff --git a/poetry.lock b/poetry.lock
diff --git a/pyproject.toml b/pyproject.toml
@@ -1,6 +1,6 @@
 [tool.poetry]
 name = "marker-pdf"
-version = "0.3.1"
+version = "0.3.2"
 description = "Convert PDF to markdown with high speed and accuracy."
 authors = ["Vik Paruchuri <[email protected]>"]
 readme = "README.md"
@@ -36,7 +36,7 @@ surya-ocr = "^0.6.6"
 filetype = "^1.2.0"
 regex = "^2024.4.28"
 pdftext = "^0.3.17"
-tabled-pdf = "^0.1.1"
+tabled-pdf = "^0.1.2"
 
 [tool.poetry.group.dev.dependencies]
 jupyter = "^1.0.0"