Skip to content

Commit

Permalink
[REF]: further reduce object loops
Browse files Browse the repository at this point in the history
Get all image, char and text objects in one pass.
  • Loading branch information
bosd committed Nov 9, 2024
1 parent 3f27b37 commit 06f48a1
Show file tree
Hide file tree
Showing 5 changed files with 82 additions and 75 deletions.
28 changes: 20 additions & 8 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
from .parsers import Stream
from .utils import TemporaryDirectory
from .utils import download_url
from .utils import get_char_and_text_objects
from .utils import get_image_char_and_text_objects
from .utils import get_page_layout
from .utils import get_rotation
from .utils import is_url
Expand Down Expand Up @@ -119,7 +119,7 @@ def _get_pages(self, pages):

def _save_page(
self, filepath: StrByteType | Path, page: int, temp: str, **layout_kwargs
):
): # -> layout, dimensions, list[LTImage], list[LTChar], list[LTTextLineHorizontal], list[LTTextLineVertical]
"""Saves specified page from PDF into a temporary directory.
Parameters
Expand Down Expand Up @@ -156,7 +156,9 @@ def _save_page(
outfile.write(f)
layout, dimensions = get_page_layout(fpath, **layout_kwargs)
# fix rotated PDF
chars, horizontal_text, vertical_text = get_char_and_text_objects(layout)
images, chars, horizontal_text, vertical_text = get_image_char_and_text_objects(
layout
)
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
Expand All @@ -176,9 +178,12 @@ def _save_page(
outfile.write(f)
# Only recompute layout and dimension after rotating the pdf
layout, dimensions = get_page_layout(fpath, **layout_kwargs)
images, chars, horizontal_text, vertical_text = (
get_image_char_and_text_objects(layout)
)
instream.close()
return layout, dimensions
return layout, dimensions
return layout, dimensions, images, chars, horizontal_text, vertical_text
return layout, dimensions, images, chars, horizontal_text, vertical_text

def parse(
self,
Expand Down Expand Up @@ -268,12 +273,19 @@ def _parse_page(
List of tables found in PDF.
"""
layout, dimensions = self._save_page(
self.filepath, page, tempdir, **layout_kwargs
layout, dimensions, images, chars, horizontal_text, vertical_text = (
self._save_page(self.filepath, page, tempdir, **layout_kwargs)
)
page_path = os.path.join(tempdir, f"page-{page}.pdf")
parser.prepare_page_parse(
page_path, layout, dimensions, page, layout_kwargs=layout_kwargs
page_path,
layout,
dimensions,
page,
images,
horizontal_text,
vertical_text,
layout_kwargs=layout_kwargs,
)
tables = parser.extract_tables()
return tables
21 changes: 14 additions & 7 deletions camelot/parsers/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,8 +10,6 @@
from ..utils import bbox_from_str
from ..utils import compute_accuracy
from ..utils import compute_whitespace
from ..utils import get_image_and_text_objects
from ..utils import get_page_layout
from ..utils import get_table_index
from ..utils import text_in_bbox

Expand Down Expand Up @@ -63,17 +61,26 @@ def table_bboxes(self):
"""
return sorted(self.table_bbox_parses.keys(), key=lambda x: x[1], reverse=True)

def prepare_page_parse(self, filename, layout, dimensions, page_idx, layout_kwargs):
def prepare_page_parse(
self,
filename,
layout,
dimensions,
page_idx,
images,
horizontal_text,
vertical_text,
layout_kwargs,
):
"""Prepare the page for parsing."""
self.filename = filename
self.layout_kwargs = layout_kwargs
self.layout = layout
self.dimensions = dimensions
self.page = page_idx
self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
self.images, self.horizontal_text, self.vertical_text = (
get_image_and_text_objects(self.layout)
)
self.images = images
self.horizontal_text = horizontal_text
self.vertical_text = vertical_text
self.pdf_width, self.pdf_height = self.dimensions
self.rootname, __ = os.path.splitext(self.filename)

Expand Down
39 changes: 35 additions & 4 deletions camelot/parsers/hybrid.py
Original file line number Diff line number Diff line change
Expand Up @@ -92,7 +92,17 @@ def __init__(
debug=debug,
)

def prepare_page_parse(self, filename, layout, dimensions, page_idx, layout_kwargs):
def prepare_page_parse(
self,
filename,
layout,
dimensions,
page_idx,
images,
horizontal_text,
vertical_text,
layout_kwargs,
):
"""Call this method to prepare the page parsing .
Parameters
Expand All @@ -109,13 +119,34 @@ def prepare_page_parse(self, filename, layout, dimensions, page_idx, layout_kwar
[description]
"""
super().prepare_page_parse(
filename, layout, dimensions, page_idx, layout_kwargs
filename,
layout,
dimensions,
page_idx,
images,
horizontal_text,
vertical_text,
layout_kwargs,
)
self.network_parser.prepare_page_parse(
filename, layout, dimensions, page_idx, layout_kwargs
filename,
layout,
dimensions,
page_idx,
images,
horizontal_text,
vertical_text,
layout_kwargs,
)
self.lattice_parser.prepare_page_parse(
filename, layout, dimensions, page_idx, layout_kwargs
filename,
layout,
dimensions,
page_idx,
images,
horizontal_text,
vertical_text,
layout_kwargs,
)

def _generate_columns_and_rows(self, bbox, table_idx):
Expand Down
66 changes: 12 additions & 54 deletions camelot/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -1399,55 +1399,6 @@ def get_page_layout(
return layout, dim


def get_char_and_text_objects(
    layout: LTContainer[LTItem],
) -> tuple[list[LTChar], list[LTTextLineHorizontal], list[LTTextLineVertical]]:
    """Collect characters and text lines from a PDFMiner layout tree.

    Walks ``layout`` and sorts every object it meets into one of three
    lists. A bare ``LTChar`` goes straight to the character list. A
    horizontal or vertical text line is recorded in its own list and the
    characters ``get_char_objects`` finds in it are added to the character
    list. Any other ``LTContainer`` is descended into recursively.

    Parameters
    ----------
    layout : object
        PDFMiner LTContainer object
        ( LTPage, LTTextLineHorizontal, LTTextLineVertical).

    Returns
    -------
    result : tuple
        List of LTChar objects, list of LTTextLineHorizontal objects
        and list of LTTextLineVertical objects, in that order.

    """
    chars = []
    h_lines = []
    v_lines = []

    try:
        for item in layout:
            if isinstance(item, LTChar):
                chars.append(item)
                continue
            if isinstance(item, LTTextLineHorizontal):
                h_lines.append(item)
                chars.extend(get_char_objects(item))
                continue
            if isinstance(item, LTTextLineVertical):
                v_lines.append(item)
                chars.extend(get_char_objects(item))
                continue
            if isinstance(item, LTContainer):
                # Generic container: merge whatever the subtree yields into
                # the matching accumulator.
                nested = get_char_and_text_objects(item)
                for bucket, found in zip((chars, h_lines, v_lines), nested):
                    bucket.extend(found)
    except AttributeError:
        # An AttributeError raised anywhere in the walk ends it quietly;
        # whatever was gathered up to that point is still returned.
        pass
    return chars, h_lines, v_lines


def get_char_objects(layout: LTContainer[Any]) -> list[LTChar]:
"""Get charachter objects from a pdf layout.
Expand Down Expand Up @@ -1477,9 +1428,11 @@ def get_char_objects(layout: LTContainer[Any]) -> list[LTChar]:
return char


def get_image_and_text_objects(
def get_image_char_and_text_objects(
layout: LTContainer[LTItem],
) -> tuple[list[LTImage], list[LTTextLineHorizontal], list[LTTextLineVertical]]:
) -> tuple[
list[LTImage], list[LTChar], list[LTTextLineHorizontal], list[LTTextLineVertical]
]:
"""Parse a PDF layout to get objects.
Recursively parses pdf layout to get a list of
Expand All @@ -1499,6 +1452,7 @@ def get_image_and_text_objects(
"""
image = []
char = []
horizontal_text = []
vertical_text = []

Expand All @@ -1510,13 +1464,17 @@ def get_image_and_text_objects(
horizontal_text.append(_object)
elif isinstance(_object, LTTextLineVertical):
vertical_text.append(_object)
if isinstance(_object, LTChar):
char.append(_object)
elif isinstance(_object, LTContainer):
child_image, child_horizontal_text, child_vertical_text = (
get_image_and_text_objects(_object)
child_image, child_char, child_horizontal_text, child_vertical_text = (
get_image_char_and_text_objects(_object)
)
image.extend(child_image)
child_char = get_char_objects(_object)
char.extend(child_char)
horizontal_text.extend(child_horizontal_text)
vertical_text.extend(child_vertical_text)
except AttributeError:
pass
return image, horizontal_text, vertical_text
return image, char, horizontal_text, vertical_text
3 changes: 1 addition & 2 deletions pypdf_table_extraction/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,9 +16,8 @@
from camelot.utils import find_columns_boundaries # noqa F401
from camelot.utils import flag_font_size # noqa F401
from camelot.utils import flavor_to_kwargs # noqa F401
from camelot.utils import get_char_and_text_objects # noqa F401
from camelot.utils import get_char_objects # noqa F401
from camelot.utils import get_image_and_text_objects # noqa F401
from camelot.utils import get_image_char_and_text_objects # noqa F401
from camelot.utils import get_index_closest_point # noqa F401
from camelot.utils import get_page_layout # noqa F401
from camelot.utils import get_rotation # noqa F401
Expand Down

0 comments on commit 06f48a1

Please sign in to comment.