Skip to content

Commit

Permalink
Fix bugs with tables
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Jan 27, 2025
1 parent e147ae6 commit 4ca60f4
Show file tree
Hide file tree
Showing 3 changed files with 35 additions and 7 deletions.
10 changes: 8 additions & 2 deletions marker/processors/table.py
Original file line number Diff line number Diff line change
Expand Up @@ -135,7 +135,9 @@ def __call__(self, document: Document):
for block in page.contained_blocks(document, self.block_types):
intersections = matrix_intersection_area([c.polygon.bbox for c in child_contained_blocks], [block.polygon.bbox])
for child, intersection in zip(child_contained_blocks, intersections):
if intersection > 0.95 and child.id in page.structure:
# Adjust this to percentage of the child block that is enclosed by the table
intersection_pct = intersection / max(child.polygon.area, 1)
if intersection_pct > 0.95 and child.id in page.structure:
page.structure.remove(child.id)

def finalize_cell_text(self, cell: SuryaTableCell):
Expand Down Expand Up @@ -284,7 +286,11 @@ def assign_pdftext_lines(self, extract_blocks: list, filepath: str):
table_idx = 0
for block in extract_blocks:
if block["page_id"] == pnum:
block["table_text_lines"] = page_tables[table_idx]
table_text = page_tables[table_idx]
if len(table_text) == 0:
block["ocr_block"] = True # Re-OCR the block if pdftext didn't find any text
else:
block["table_text_lines"] = page_tables[table_idx]
table_idx += 1
assert table_idx == len(page_tables), "Number of tables and table inputs must match"

Expand Down
9 changes: 4 additions & 5 deletions marker/schema/groups/page.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,11 +120,10 @@ def compute_line_block_intersections(self, provider_outputs: List[ProviderOutput
continue

max_intersection = intersection_line.argmax()
if intersection_matrix[line_idx, max_intersection] > 0:
max_intersections[line_idx] = (
intersection_matrix[line_idx, max_intersection],
blocks[max_intersection].id
)
max_intersections[line_idx] = (
intersection_matrix[line_idx, max_intersection],
blocks[max_intersection].id
)
return max_intersections

def replace_block(self, block: Block, new_block: Block):
Expand Down
23 changes: 23 additions & 0 deletions tests/processors/test_table_processor.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,3 +40,26 @@ def test_avoid_double_ocr(pdf_document, detection_model, recognition_model, tabl
table_output = renderer(pdf_document)
assert "Participants" in table_output.markdown


@pytest.mark.filename("multicol-blocks.pdf")
@pytest.mark.config({"page_range": [3]})
def test_overlap_blocks(pdf_document, detection_model, recognition_model, table_rec_model):
    """Check that a known text snippet survives table processing.

    The snippet must be present in the first loaded page's raw text both
    before and after ``TableProcessor`` runs on the document.
    """
    expected_snippet = "Cascading, and the Auxiliary Problem Principle"
    first_page = pdf_document.pages[0]

    # Sanity check: the snippet is there before any table processing.
    assert expected_snippet in first_page.raw_text(pdf_document)

    table_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
    table_processor(pdf_document)

    # The processor must not have dropped the block that holds the snippet.
    assert expected_snippet in first_page.raw_text(pdf_document)


@pytest.mark.filename("pres.pdf")
@pytest.mark.config({"page_range": [4]})
def test_ocr_table(pdf_document, detection_model, recognition_model, table_rec_model):
    """Check that a known table value appears in the rendered markdown.

    Runs ``TableProcessor`` on the document, renders it with
    ``MarkdownRenderer``, and expects the value ``1.2E-38`` in the output.
    NOTE(review): presumably this covers a table whose text has to come
    from OCR -- confirm against the fixture PDF.
    """
    table_processor = TableProcessor(detection_model, recognition_model, table_rec_model)
    table_processor(pdf_document)

    markdown_renderer = MarkdownRenderer()
    rendered = markdown_renderer(pdf_document)
    rendered_markdown = rendered.markdown
    assert "1.2E-38" in rendered_markdown

0 comments on commit 4ca60f4

Please sign in to comment.