From 8cbf79cef70df85f398363420da2fc3e71ad90ca Mon Sep 17 00:00:00 2001 From: Vik Paruchuri Date: Mon, 21 Oct 2024 09:07:52 -0400 Subject: [PATCH] Fix OOM errors --- marker/debug/render.py | 2 +- marker/settings.py | 1 + marker/tables/table.py | 14 ++++++++++++-- poetry.lock | 20 ++++++++++---------- pyproject.toml | 4 ++-- 5 files changed, 26 insertions(+), 15 deletions(-) diff --git a/marker/debug/render.py b/marker/debug/render.py index 2580241d..299baee2 100644 --- a/marker/debug/render.py +++ b/marker/debug/render.py @@ -10,7 +10,7 @@ def get_font_path() -> str: if not os.path.exists(font_path): os.makedirs(os.path.dirname(font_path), exist_ok=True) - font_dl_path = f"{settings.RECOGNITION_FONT_DL_BASE}/{os.path.basename(font_path)}" + font_dl_path = f"{settings.FONT_DL_BASE}/{os.path.basename(font_path)}" with requests.get(font_dl_path, stream=True) as r, open(font_path, 'wb') as f: r.raise_for_status() for chunk in r.iter_content(chunk_size=8192): diff --git a/marker/settings.py b/marker/settings.py index beaccf62..ee0756ea 100644 --- a/marker/settings.py +++ b/marker/settings.py @@ -79,6 +79,7 @@ def TORCH_DEVICE_MODEL(self) -> str: # Table models SURYA_TABLE_DPI: int = 192 + TABLE_REC_BATCH_SIZE: Optional[int] = None # Headings HEADING_LEVEL_COUNT: int = 4 diff --git a/marker/tables/table.py b/marker/tables/table.py index 1cb16a12..66981a5b 100644 --- a/marker/tables/table.py +++ b/marker/tables/table.py @@ -12,10 +12,20 @@ from marker.schema.block import Line, Span, Block from marker.schema.page import Page from typing import List +from marker.ocr.recognition import get_batch_size as get_ocr_batch_size +from marker.ocr.detection import get_batch_size as get_detector_batch_size from marker.settings import settings +def get_batch_size(): + if settings.TABLE_REC_BATCH_SIZE is not None: + return settings.TABLE_REC_BATCH_SIZE + elif settings.TORCH_DEVICE_MODEL == "cuda": + return 6 + return 6 + + def get_table_boxes(pages: List[Page], doc: PdfDocument, fname): table_imgs = [] table_counts = [] @@ -83,11 +93,11 @@ def format_tables(pages: List[Page], doc: PdfDocument, fname: str, detection_mod # Don't look at table cell detection tqdm output tqdm.disable = True table_imgs, table_boxes, table_counts, table_text_lines, img_sizes = get_table_boxes(pages, doc, fname) - cells, needs_ocr = get_cells(table_imgs, table_boxes, img_sizes, table_text_lines, det_models, detect_boxes=settings.OCR_ALL_PAGES) + cells, needs_ocr = get_cells(table_imgs, table_boxes, img_sizes, table_text_lines, det_models, detect_boxes=settings.OCR_ALL_PAGES, detector_batch_size=get_detector_batch_size()) tqdm.disable = False # This will redo OCR if OCR is forced, since we need to redetect bounding boxes, etc. - table_rec = recognize_tables(table_imgs, cells, needs_ocr, rec_models) + table_rec = recognize_tables(table_imgs, cells, needs_ocr, rec_models, table_rec_batch_size=get_batch_size(), ocr_batch_size=get_ocr_batch_size()) cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, img_sizes)] table_md = [formatter("markdown", cell)[0] for cell in cells] diff --git a/poetry.lock b/poetry.lock index bb66b083..b8312c4a 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1935,13 +1935,13 @@ dill = ">=0.3.8" [[package]] name = "narwhals" -version = "1.9.4" +version = "1.10.0" description = "Extremely lightweight compatibility layer between dataframe libraries" optional = false python-versions = ">=3.8" files = [ - {file = "narwhals-1.9.4-py3-none-any.whl", hash = "sha256:cdd16f73268a3f0d3327aa9e4c6ab25a0b277629d6710bef58e86f40e57e5cc9"}, - {file = "narwhals-1.9.4.tar.gz", hash = "sha256:5de1f2d7bfbe555573d945fe1d760469a05784f3e69b7bc1b5b1303aae7946a1"}, + {file = "narwhals-1.10.0-py3-none-any.whl", hash = "sha256:c83a378960651c391e5f3d68af3a821eda74c9713073518fe0c39aefc5ad8f8e"}, + {file = "narwhals-1.10.0.tar.gz", hash = "sha256:a380e64110c3169c4b0b592c5b64ae6dc4cce76e9d3c56edc608a8ae5994cfc1"}, ] [package.extras] @@ -2045,13 +2045,13 @@ files = [ [[package]] name = "networkx" -version = "3.4.1" +version = "3.4.2" description = "Python package for creating and manipulating graphs and networks" optional = false python-versions = ">=3.10" files = [ - {file = "networkx-3.4.1-py3-none-any.whl", hash = "sha256:e30a87b48c9a6a7cc220e732bffefaee585bdb166d13377734446ce1a0620eed"}, - {file = "networkx-3.4.1.tar.gz", hash = "sha256:f9df45e85b78f5bd010993e897b4f1fdb242c11e015b101bd951e5c0e29982d8"}, + {file = "networkx-3.4.2-py3-none-any.whl", hash = "sha256:df5d4365b724cf81b8c6a7312509d0c22386097011ad1abe274afd5e9d3bbc5f"}, + {file = "networkx-3.4.2.tar.gz", hash = "sha256:307c3669428c5362aab27c8a1260aa8f47c4e91d3891f48be0141738d8d053e1"}, ] [package.extras] @@ -4217,13 +4217,13 @@ dev = ["hypothesis (>=6.70.0)", "pytest (>=7.1.0)"] [[package]] name = "tabled-pdf" -version = "0.1.1" +version = "0.1.2" description = "Detect and recognize tables in PDFs and images." optional = false python-versions = "<4.0,>=3.10" files = [ - {file = "tabled_pdf-0.1.1-py3-none-any.whl", hash = "sha256:ec40cef5d5348127ccfbbf519ec83a8f54511986944cbf391521af316904789b"}, - {file = "tabled_pdf-0.1.1.tar.gz", hash = "sha256:1db0518881473fe33f402c59555d3d460ceed86a92b5fe0ecb07841f82760c5c"}, + {file = "tabled_pdf-0.1.2-py3-none-any.whl", hash = "sha256:21b39b097f6055884b4f1f21f66daf75d7f27588dbc06875ad7b5b7eccb60a05"}, + {file = "tabled_pdf-0.1.2.tar.gz", hash = "sha256:d443eca41cd126f311527507cfc5a7a45c9a69efce2ec61beb91f9caeb44de39"}, ] [package.dependencies] @@ -5076,4 +5076,4 @@ propcache = ">=0.2.0" [metadata] lock-version = "2.0" python-versions = "^3.10" -content-hash = "b5016a59dd59840e90f8e59b8d019868648d447905122c690c37dbda8369d726" +content-hash = "5dba5f1eb047fda1b8efc029069486ccec132e44a88a3aa8533b0dd5e7b06186" diff --git a/pyproject.toml b/pyproject.toml index 31295321..b6b6cb22 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -1,6 +1,6 @@ [tool.poetry] name = "marker-pdf" -version = "0.3.1" +version = "0.3.2" description = "Convert PDF to markdown with high speed and accuracy." authors = ["Vik Paruchuri "] readme = "README.md" @@ -36,7 +36,7 @@ surya-ocr = "^0.6.6" filetype = "^1.2.0" regex = "^2024.4.28" pdftext = "^0.3.17" -tabled-pdf = "^0.1.1" +tabled-pdf = "^0.1.2" [tool.poetry.group.dev.dependencies] jupyter = "^1.0.0"