Skip to content

Commit

Permalink
Merge pull request #307 from VikParuchuri/dev
Browse files Browse the repository at this point in the history
Fix OOM errors
  • Loading branch information
VikParuchuri authored Oct 21, 2024
2 parents 361f9b5 + 8cbf79c commit 93a3ca6
Show file tree
Hide file tree
Showing 5 changed files with 26 additions and 15 deletions.
2 changes: 1 addition & 1 deletion marker/debug/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ def get_font_path() -> str:

if not os.path.exists(font_path):
os.makedirs(os.path.dirname(font_path), exist_ok=True)
font_dl_path = f"{settings.RECOGNITION_FONT_DL_BASE}/{os.path.basename(font_path)}"
font_dl_path = f"{settings.FONT_DL_BASE}/{os.path.basename(font_path)}"
with requests.get(font_dl_path, stream=True) as r, open(font_path, 'wb') as f:
r.raise_for_status()
for chunk in r.iter_content(chunk_size=8192):
Expand Down
1 change: 1 addition & 0 deletions marker/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -79,6 +79,7 @@ def TORCH_DEVICE_MODEL(self) -> str:

# Table models
SURYA_TABLE_DPI: int = 192
TABLE_REC_BATCH_SIZE: Optional[int] = None

# Headings
HEADING_LEVEL_COUNT: int = 4
Expand Down
14 changes: 12 additions & 2 deletions marker/tables/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,20 @@
from marker.schema.block import Line, Span, Block
from marker.schema.page import Page
from typing import List
from marker.ocr.recognition import get_batch_size as get_ocr_batch_size
from marker.ocr.detection import get_batch_size as get_detector_batch_size

from marker.settings import settings


def get_batch_size():
    """Return the batch size to use for table recognition.

    An explicit ``settings.TABLE_REC_BATCH_SIZE`` always wins. When it is
    ``None``, a per-device default is chosen from
    ``settings.TORCH_DEVICE_MODEL``.
    """
    override = settings.TABLE_REC_BATCH_SIZE
    if override is not None:
        return override

    # Per-device defaults. "cuda" currently matches the fallback value, but
    # it is kept as its own entry so it can be tuned independently.
    fallback = 6
    per_device = {"cuda": 6}
    return per_device.get(settings.TORCH_DEVICE_MODEL, fallback)


def get_table_boxes(pages: List[Page], doc: PdfDocument, fname):
table_imgs = []
table_counts = []
Expand Down Expand Up @@ -83,11 +93,11 @@ def format_tables(pages: List[Page], doc: PdfDocument, fname: str, detection_mod
# Don't look at table cell detection tqdm output
tqdm.disable = True
table_imgs, table_boxes, table_counts, table_text_lines, img_sizes = get_table_boxes(pages, doc, fname)
cells, needs_ocr = get_cells(table_imgs, table_boxes, img_sizes, table_text_lines, det_models, detect_boxes=settings.OCR_ALL_PAGES)
cells, needs_ocr = get_cells(table_imgs, table_boxes, img_sizes, table_text_lines, det_models, detect_boxes=settings.OCR_ALL_PAGES, detector_batch_size=get_detector_batch_size())
tqdm.disable = False

# This will redo OCR if OCR is forced, since we need to redetect bounding boxes, etc.
table_rec = recognize_tables(table_imgs, cells, needs_ocr, rec_models)
table_rec = recognize_tables(table_imgs, cells, needs_ocr, rec_models, table_rec_batch_size=get_batch_size(), ocr_batch_size=get_ocr_batch_size())
cells = [assign_rows_columns(tr, im_size) for tr, im_size in zip(table_rec, img_sizes)]
table_md = [formatter("markdown", cell)[0] for cell in cells]

Expand Down
20 changes: 10 additions & 10 deletions poetry.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

4 changes: 2 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
[tool.poetry]
name = "marker-pdf"
version = "0.3.1"
version = "0.3.2"
description = "Convert PDF to markdown with high speed and accuracy."
authors = ["Vik Paruchuri <[email protected]>"]
readme = "README.md"
Expand Down Expand Up @@ -36,7 +36,7 @@ surya-ocr = "^0.6.6"
filetype = "^1.2.0"
regex = "^2024.4.28"
pdftext = "^0.3.17"
tabled-pdf = "^0.1.1"
tabled-pdf = "^0.1.2"

[tool.poetry.group.dev.dependencies]
jupyter = "^1.0.0"
Expand Down

0 comments on commit 93a3ca6

Please sign in to comment.