Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

enhancing recovery_to_doc #14396

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 33 additions & 6 deletions ppstructure/recovery/recovery_to_doc.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,18 +97,38 @@ def sorted_layout_boxes(res, w):
res[0]["layout"] = "single"
return res

# Sort boxes by y coordinate (top to bottom), then x coordinate (left to right)
sorted_boxes = sorted(res, key=lambda x: (x["bbox"][1], x["bbox"][0]))
_boxes = list(sorted_boxes)

new_res = []
res_left = []
res_right = []
res_left = [] # Buffer for left column boxes
res_right = [] # Buffer for right column boxes
i = 0

while True:
if i >= num_boxes:
break

# Calculate width ratio of text box relative to page width
# Used to determine if box spans across columns (indicating single column)
box_width = _boxes[i]["bbox"][2] - _boxes[i]["bbox"][0]
width_ratio = box_width / w

# If box width > 60% of page width, treat as single column
if width_ratio > 0.6:
new_res += res_left
new_res += res_right
_boxes[i]["layout"] = "single"
new_res.append(_boxes[i])
res_left = []
res_right = []
i += 1
continue

# Handle the last box
if i == num_boxes - 1:
# Check if last box spans columns and is below previous box
if (
_boxes[i]["bbox"][1] > _boxes[i - 1]["bbox"][3]
and _boxes[i]["bbox"][0] < w / 2
Expand All @@ -119,27 +139,32 @@ def sorted_layout_boxes(res, w):
_boxes[i]["layout"] = "single"
new_res.append(_boxes[i])
else:
if _boxes[i]["bbox"][2] > w / 2:
# Classify as left or right column based on position
if _boxes[i]["bbox"][2] > w * 0.6:
_boxes[i]["layout"] = "double"
res_right.append(_boxes[i])
new_res += res_left
new_res += res_right
elif _boxes[i]["bbox"][0] < w / 2:
elif _boxes[i]["bbox"][0] < w * 0.4:
_boxes[i]["layout"] = "double"
res_left.append(_boxes[i])
new_res += res_left
new_res += res_right
res_left = []
res_right = []
break
elif _boxes[i]["bbox"][0] < w / 4 and _boxes[i]["bbox"][2] < 3 * w / 4:

# Left column criteria: starts before 30% of page width, ends before 55%
elif _boxes[i]["bbox"][0] < w * 0.3 and _boxes[i]["bbox"][2] < w * 0.55:
_boxes[i]["layout"] = "double"
res_left.append(_boxes[i])
i += 1
elif _boxes[i]["bbox"][0] > w / 4 and _boxes[i]["bbox"][2] > w / 2:
# Right column criteria: starts after 45% of page width, ends after 70%
elif _boxes[i]["bbox"][0] > w * 0.45 and _boxes[i]["bbox"][2] > w * 0.7:
_boxes[i]["layout"] = "double"
res_right.append(_boxes[i])
i += 1
# If neither left nor right column criteria met, treat as single column
else:
new_res += res_left
new_res += res_right
Expand All @@ -148,6 +173,8 @@ def sorted_layout_boxes(res, w):
res_left = []
res_right = []
i += 1

# Append any remaining boxes from left/right columns
if res_left:
new_res += res_left
if res_right:
Expand Down
4 changes: 4 additions & 0 deletions pytest.ini
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
[pytest]
filterwarnings =
ignore:pkg_resources is deprecated as an API:DeprecationWarning
addopts = -v
Binary file added tests/test_files/double_column.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file added tests/test_files/single_column.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
1 change: 0 additions & 1 deletion tests/test_paddleocr.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,3 @@
# -*- encoding: utf-8 -*-
from pathlib import Path
from typing import Any

Expand Down
121 changes: 121 additions & 0 deletions tests/test_recovery_to_doc.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,121 @@
import os
import sys
import cv2
from docx import Document
from paddleocr import PPStructure

# Absolute path of the directory that contains this test module; the test
# fixtures are looked up relative to it.
current_dir = os.path.dirname(os.path.abspath(__file__))
# Put the parent of the tests directory on sys.path before importing from
# `ppstructure` (presumably the repository root -- confirm against the layout).
sys.path.append(os.path.abspath(os.path.join(current_dir, "..")))

from ppstructure.recovery.recovery_to_doc import sorted_layout_boxes, convert_info_docx


def test_double_column_structure(tmp_path):
    """
    End-to-end check of the double-column path.

    Runs PPStructure on the double-column fixture, orders the boxes with
    ``sorted_layout_boxes``, checks that boxes tagged ``"double"`` exist on
    both halves of the page, and finally confirms that ``convert_info_docx``
    writes a non-empty ``.docx`` file into a temporary directory.
    """
    img_path = os.path.join(current_dir, "./test_files/double_column.png")
    assert os.path.exists(img_path), "Required test image file not found"

    img = cv2.imread(img_path)
    assert img is not None, "Image loading failed"

    # Layout analysis on the raw image.
    structure_engine = PPStructure(show_log=True)
    result = structure_engine(img)
    assert (
        result is not None and len(result) > 0
    ), "Structure analysis produced no results"

    # Only the width is needed by the sorter; the height is discarded.
    _, img_w = img.shape[:2]
    sorted_results = sorted_layout_boxes(result, img_w)

    # Keep only the regions the sorter tagged as part of a two-column layout.
    double_column_boxes = [
        region for region in sorted_results if region.get("layout") == "double"
    ]
    assert len(double_column_boxes) >= 2, "Double-column layout not properly detected"

    def _center_x(region):
        # Horizontal midpoint of the region's bounding box.
        return (region["bbox"][0] + region["bbox"][2]) / 2

    # Split the tagged regions by which half of the page their centre lies in.
    page_middle = img_w / 2
    left_column = [r for r in double_column_boxes if _center_x(r) < page_middle]
    right_column = [
        r for r in double_column_boxes if not (_center_x(r) < page_middle)
    ]

    assert len(left_column) > 0, "Left column content not detected"
    assert len(right_column) > 0, "Right column content not detected"

    # Convert to docx inside pytest's per-test temporary directory.
    img_name = "test_double_column"
    output_dir = str(tmp_path / "double_column_test")
    os.makedirs(output_dir, exist_ok=True)
    convert_info_docx(img, sorted_results, output_dir, img_name)

    docx_path = os.path.join(output_dir, f"{img_name}_ocr.docx")
    assert os.path.exists(docx_path), "Document generation failed"

    # The generated file must open and contain at least one paragraph.
    generated = Document(docx_path)
    assert len(generated.paragraphs) > 0, "Generated document contains no content"


def test_single_column_structure(tmp_path):
    """
    End-to-end check of the single-column path.

    Runs PPStructure on the single-column fixture, orders the boxes with
    ``sorted_layout_boxes``, asserts that every text box wider than 60% of
    the page is tagged ``"single"`` (and that at least one such box exists),
    then confirms that ``convert_info_docx`` writes a non-empty ``.docx``.
    """
    img_path = os.path.join(current_dir, "./test_files/single_column.jpg")
    assert os.path.exists(img_path), f"Test image {img_path} not found"

    img = cv2.imread(img_path)
    assert img is not None, f"Failed to load image {img_path}"

    structure_engine = PPStructure(show_log=True)
    result = structure_engine(img)
    assert result is not None and len(result) > 0, "Layout analysis result is empty"

    # Only the width is needed by the sorter; the height is discarded.
    _, img_w = img.shape[:2]
    sorted_results = sorted_layout_boxes(result, img_w)

    # Collect wide text boxes, checking each one's layout tag on the way.
    single_column_boxes = []
    for region in sorted_results:
        x_min, x_max = region["bbox"][0], region["bbox"][2]
        width_ratio = (x_max - x_min) / img_w
        # Skip anything that is not a text box spanning >60% of the page width.
        if not (width_ratio > 0.6 and region["type"] == "text"):
            continue
        assert (
            region.get("layout") == "single"
        ), f"Wide text box ({width_ratio:.2f} of page width) not marked as single column"
        single_column_boxes.append(region)

    assert len(single_column_boxes) > 0, "No single column text boxes detected"

    # Convert to docx inside pytest's per-test temporary directory.
    img_name = "test_single_column"
    output_dir = str(tmp_path / "single_column_test")
    os.makedirs(output_dir, exist_ok=True)
    convert_info_docx(img, sorted_results, output_dir, img_name)

    docx_path = os.path.join(output_dir, f"{img_name}_ocr.docx")
    assert os.path.exists(docx_path), "Document not generated"

    # The generated file must open and contain at least one paragraph.
    generated = Document(docx_path)
    assert len(generated.paragraphs) > 0, "Generated document is empty"
Loading