Skip to content

Commit

Permalink
Add order processor
Browse files Browse the repository at this point in the history
  • Loading branch information
VikParuchuri committed Feb 3, 2025
1 parent d487f46 commit 277f2db
Show file tree
Hide file tree
Showing 8 changed files with 60 additions and 14 deletions.
10 changes: 5 additions & 5 deletions benchmarks/overall/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -23,9 +23,9 @@ def marker_scoring_func(model_dict, sample, gt_markdown, use_llm=False, **kwargs
pdf_bytes = sample["pdf"] # This is a single page PDF
start = time.time()
marker_md = get_marker_markdown(model_dict, pdf_bytes, use_llm)
marker_md = clean_input(marker_md)
marker_md_clean = clean_input(marker_md)
total = time.time() - start
scores = score_blocks(gt_markdown, marker_md)
scores = score_blocks(gt_markdown, marker_md_clean)
scores["time"] = total
scores["markdown"] = marker_md
return scores
Expand All @@ -41,8 +41,8 @@ def mathpix_scoring_func(model_dict, sample, gt_markdown, mathpix_ds=None, **kwa
if not data:
raise ValueError(f"Could not find data for uuid {uuid}")

mathpix_md = clean_input(data["md"])
scores = score_blocks(gt_markdown, mathpix_md)
mathpix_md_clean = clean_input(data["md"])
scores = score_blocks(gt_markdown, mathpix_md_clean)
scores["time"] = data["time"]
scores["markdown"] = mathpix_md
scores["markdown"] = data["md"]
return scores
2 changes: 1 addition & 1 deletion benchmarks/overall/overall.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def get_method_scores(ds, model_dict, max_rows=None, score_func=marker_scoring_f
"averages_by_type": averages_by_type,
"averages_by_block_type": averages_by_block_type,
"average_time": avg_time,
"average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores)
"average_score": sum([bench_scores[k]["overall_score"] for k in bench_scores]) / len(bench_scores),
}

def print_scores(scores: Dict[str, FullResult], out_path: Path, default_method="marker"):
Expand Down
14 changes: 11 additions & 3 deletions benchmarks/overall/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import markdown2
from playwright.sync_api import sync_playwright

from benchmarks.overall.clean import convert_to_md, clean_input
from benchmarks.overall.schema import FullResult

def convert_to_html(md: str):
Expand Down Expand Up @@ -90,7 +91,13 @@ def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> da

ds_rows = defaultdict(dict)
for idx in full_idxs:
row = ds[idx] # img, gt_blocks, classification, language, uuid
row = ds[idx]
ds_rows[idx].update({
"img": row["img"],
"classification": row["classification"],
"language": row["language"],
"uuid": row["uuid"]
})
for method in all_scores:
method_row = all_scores[method]["raw_scores"][idx]
ds_rows[idx].update({
Expand All @@ -99,10 +106,11 @@ def build_dataset(ds: datasets.Dataset, all_scores: Dict[str, FullResult]) -> da
f"{method}_image": markdown_to_image(method_row["markdown"]),
f"{method}_time": method_row["time"]
})
gt_md = "\n\n".join([clean_input(convert_to_md(block)) for block in json.loads(row["gt_blocks"])])
gt_html = [block["html"] for block in json.loads(row["gt_blocks"]) if len(block["html"]) > 0]
gt_md = "\n\n".join([convert_to_md(block) for block in gt_html])
ds_rows[idx].update({
"gt_markdown": gt_md,
"gt_image": markdown_to_image(gt_md)
"gt_markdown_image": markdown_to_image(gt_md)
})
out_dataset = datasets.Dataset.from_list([ds_rows[k] for k in full_idxs])
return out_dataset
Expand Down
1 change: 1 addition & 0 deletions benchmarks/overall/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ class FullResult(TypedDict):
averages_by_block_type: Dict[str, List[float]]
average_time: float
average_score: float
gt_markdown: List[str]
2 changes: 2 additions & 0 deletions marker/converters/pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,7 @@
from marker.schema.registry import register_block_class
from marker.util import strings_to_classes
from marker.processors.llm.llm_handwriting import LLMHandwritingProcessor
from marker.processors.order import OrderProcessor


class PdfConverter(BaseConverter):
Expand All @@ -59,6 +60,7 @@ class PdfConverter(BaseConverter):
"Enable higher quality processing with LLMs.",
] = False
default_processors: Tuple[BaseProcessor, ...] = (
OrderProcessor,
BlockquoteProcessor,
CodeProcessor,
DocumentTOCProcessor,
Expand Down
23 changes: 18 additions & 5 deletions marker/processors/order.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
from statistics import mean
from collections import defaultdict

from marker.processors import BaseProcessor
from marker.schema import BlockTypes
Expand All @@ -13,41 +14,53 @@ class OrderProcessor(BaseProcessor):

def __call__(self, document: Document):
for page in document.pages:
# Skip OCRed pages
if page.text_extraction_method != "pdftext":
continue

# Skip pages without layout slicing
if not page.layout_sliced:
continue

block_idxs = {}
block_idxs = defaultdict(int)
for block_id in page.structure:
block = document.get_block(block_id)
spans = block.contained_blocks(document, (BlockTypes.Span, ))
if len(spans) == 0:
continue

# Avg span position in original PDF
block_idxs[block_id] = (spans[0].minimum_position + spans[-1].maximum_position) / 2

for block_id in page.structure:
if block_id in block_idxs and block_idxs[block_id] > 0:
# Already assigned block id via span position
if block_idxs[block_id] > 0:
continue

block = document.get_block(block_id)
prev_block = document.get_prev_block(block)
next_block = document.get_next_block(block)

block_idx_add = 0
if prev_block:
block_idx_add = 1

while prev_block and prev_block.id not in block_idxs:
prev_block = document.get_prev_block(prev_block)
block_idx_add += 1

if not prev_block:
block_idx_add = -1
while next_block and next_block.id not in block_idxs:
next_block = document.get_next_block(next_block)
block_idx_add -= 1

if not next_block and not prev_block:
block_idxs[block_id] = 0
pass
elif prev_block:
block_idxs[block_id] = block_idxs[prev_block.id] + 1
block_idxs[block_id] = block_idxs[prev_block.id] + block_idx_add
else:
block_idxs[block_id] = block_idxs[next_block.id] - 1
block_idxs[block_id] = block_idxs[next_block.id] + block_idx_add

page.structure = sorted(page.structure, key=lambda x: block_idxs[x])

3 changes: 3 additions & 0 deletions marker/schema/polygon.py
Original file line number Diff line number Diff line change
Expand Up @@ -126,6 +126,9 @@ def center_distance(self, other: PolygonBox, x_weight: float = 1, y_weight: floa
else:
return abs(self.center[0] - other.center[0]) * x_weight + abs(self.center[1] - other.center[1]) * y_weight

def tl_distance(self, other: PolygonBox) -> float:
    """
    Return the straight-line distance between the first bbox point of two boxes.

    Only ``bbox[0]`` and ``bbox[1]`` of each box are read (presumably the
    top-left x and y, as the method name suggests - confirm against the
    ``bbox`` definition on the class).

    Args:
        other: The box to measure against; must expose ``bbox``.

    Returns:
        The Euclidean distance between the two points.
    """
    # Function-scope import keeps this method self-contained within the file.
    import math

    # hypot avoids the intermediate overflow/underflow that squaring by hand can hit.
    return math.hypot(self.bbox[0] - other.bbox[0], self.bbox[1] - other.bbox[1])

def rescale(self, old_size, new_size):
# Point is in x, y format
page_width, page_height = old_size
Expand Down
19 changes: 19 additions & 0 deletions marker/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,3 +80,22 @@ def matrix_intersection_area(boxes1: List[List[float]], boxes2: List[List[float]
height = np.maximum(0, max_y - min_y)

return width * height # Shape: (N, M)


def matrix_distance(boxes1: List[List[float]], boxes2: List[List[float]]) -> np.ndarray:
    """
    Compute the pairwise Euclidean distance between the centers of two sets of boxes.

    Each box is a sequence of four numbers; the center is the mean of the first
    two and the last two entries (presumably [x0, y0, x1, y1] - confirm against
    callers).

    Args:
        boxes1: N boxes.
        boxes2: M boxes.

    Returns:
        An (N, M) array where entry [i, j] is the distance between the center
        of boxes1[i] and the center of boxes2[j]. If either input is empty the
        result is an all-zero array with the matching empty dimension.
    """
    # Empty inputs cannot be sliced as (K, 4) arrays, so return an empty-shaped result directly.
    if len(boxes2) == 0:
        return np.zeros((len(boxes1), 0))
    if len(boxes1) == 0:
        return np.zeros((0, len(boxes2)))

    boxes1 = np.asarray(boxes1)  # Shape: (N, 4)
    boxes2 = np.asarray(boxes2)  # Shape: (M, 4)

    boxes1_centers = (boxes1[:, :2] + boxes1[:, 2:]) / 2  # Shape: (N, 2)
    boxes2_centers = (boxes2[:, :2] + boxes2[:, 2:]) / 2  # Shape: (M, 2)

    # Insert singleton axes so the subtraction broadcasts to every (i, j) pair.
    boxes1_centers = boxes1_centers[:, np.newaxis, :]  # Shape: (N, 1, 2)
    boxes2_centers = boxes2_centers[np.newaxis, :, :]  # Shape: (1, M, 2)

    distances = np.linalg.norm(boxes1_centers - boxes2_centers, axis=2)  # Shape: (N, M)
    return distances

0 comments on commit 277f2db

Please sign in to comment.