[REF]: further reduce object loops

Get all image, char and text objects in a single pass over the layout.
py-pdf · Nov 9, 2024 · 06f48a1 · 06f48a1
1 parent 3f27b37
commit 06f48a1
Show file tree

Hide file tree

Showing 5 changed files with 82 additions and 75 deletions.
diff --git a/camelot/handlers.py b/camelot/handlers.py
@@ -18,7 +18,7 @@
 from .parsers import Stream
 from .utils import TemporaryDirectory
 from .utils import download_url
-from .utils import get_char_and_text_objects
+from .utils import get_image_char_and_text_objects
 from .utils import get_page_layout
 from .utils import get_rotation
 from .utils import is_url
@@ -119,7 +119,7 @@ def _get_pages(self, pages):
 
     def _save_page(
         self, filepath: StrByteType | Path, page: int, temp: str, **layout_kwargs
-    ):
+    ):  # -> (layout, dimensions, images, chars, horizontal_text, vertical_text)
         """Saves specified page from PDF into a temporary directory.
 
         Parameters
@@ -156,7 +156,9 @@ def _save_page(
             outfile.write(f)
         layout, dimensions = get_page_layout(fpath, **layout_kwargs)
         # fix rotated PDF
-        chars, horizontal_text, vertical_text = get_char_and_text_objects(layout)
+        images, chars, horizontal_text, vertical_text = get_image_char_and_text_objects(
+            layout
+        )
         rotation = get_rotation(chars, horizontal_text, vertical_text)
         if rotation != "":
             fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
@@ -176,9 +178,12 @@ def _save_page(
                 outfile.write(f)
             # Only recompute layout and dimension after rotating the pdf
             layout, dimensions = get_page_layout(fpath, **layout_kwargs)
+            images, chars, horizontal_text, vertical_text = (
+                get_image_char_and_text_objects(layout)
+            )
             instream.close()
-            return layout, dimensions
-        return layout, dimensions
+            return layout, dimensions, images, chars, horizontal_text, vertical_text
+        return layout, dimensions, images, chars, horizontal_text, vertical_text
 
     def parse(
         self,
@@ -268,12 +273,19 @@ def _parse_page(
             List of tables found in PDF.
 
         """
-        layout, dimensions = self._save_page(
-            self.filepath, page, tempdir, **layout_kwargs
+        layout, dimensions, images, chars, horizontal_text, vertical_text = (
+            self._save_page(self.filepath, page, tempdir, **layout_kwargs)
         )
         page_path = os.path.join(tempdir, f"page-{page}.pdf")
         parser.prepare_page_parse(
-            page_path, layout, dimensions, page, layout_kwargs=layout_kwargs
+            page_path,
+            layout,
+            dimensions,
+            page,
+            images,
+            horizontal_text,
+            vertical_text,
+            layout_kwargs=layout_kwargs,
         )
         tables = parser.extract_tables()
         return tables
diff --git a/camelot/parsers/base.py b/camelot/parsers/base.py
@@ -10,8 +10,6 @@
 from ..utils import bbox_from_str
 from ..utils import compute_accuracy
 from ..utils import compute_whitespace
-from ..utils import get_image_and_text_objects
-from ..utils import get_page_layout
 from ..utils import get_table_index
 from ..utils import text_in_bbox
 
@@ -63,17 +61,26 @@ def table_bboxes(self):
         """
         return sorted(self.table_bbox_parses.keys(), key=lambda x: x[1], reverse=True)
 
-    def prepare_page_parse(self, filename, layout, dimensions, page_idx, layout_kwargs):
+    def prepare_page_parse(
+        self,
+        filename,
+        layout,
+        dimensions,
+        page_idx,
+        images,
+        horizontal_text,
+        vertical_text,
+        layout_kwargs,
+    ):
         """Prepare the page for parsing."""
         self.filename = filename
         self.layout_kwargs = layout_kwargs
         self.layout = layout
         self.dimensions = dimensions
         self.page = page_idx
-        self.layout, self.dimensions = get_page_layout(filename, **layout_kwargs)
-        self.images, self.horizontal_text, self.vertical_text = (
-            get_image_and_text_objects(self.layout)
-        )
+        self.images = images
+        self.horizontal_text = horizontal_text
+        self.vertical_text = vertical_text
         self.pdf_width, self.pdf_height = self.dimensions
         self.rootname, __ = os.path.splitext(self.filename)
 

diff --git a/camelot/parsers/hybrid.py b/camelot/parsers/hybrid.py
@@ -92,7 +92,17 @@ def __init__(
             debug=debug,
         )
 
-    def prepare_page_parse(self, filename, layout, dimensions, page_idx, layout_kwargs):
+    def prepare_page_parse(
+        self,
+        filename,
+        layout,
+        dimensions,
+        page_idx,
+        images,
+        horizontal_text,
+        vertical_text,
+        layout_kwargs,
+    ):
         """Call this method to prepare the page parsing .
 
         Parameters
@@ -109,13 +119,34 @@ def prepare_page_parse(self, filename, layout, dimensions, page_idx, layout_kwar
             [description]
         """
         super().prepare_page_parse(
-            filename, layout, dimensions, page_idx, layout_kwargs
+            filename,
+            layout,
+            dimensions,
+            page_idx,
+            images,
+            horizontal_text,
+            vertical_text,
+            layout_kwargs,
         )
         self.network_parser.prepare_page_parse(
-            filename, layout, dimensions, page_idx, layout_kwargs
+            filename,
+            layout,
+            dimensions,
+            page_idx,
+            images,
+            horizontal_text,
+            vertical_text,
+            layout_kwargs,
         )
         self.lattice_parser.prepare_page_parse(
-            filename, layout, dimensions, page_idx, layout_kwargs
+            filename,
+            layout,
+            dimensions,
+            page_idx,
+            images,
+            horizontal_text,
+            vertical_text,
+            layout_kwargs,
         )
 
     def _generate_columns_and_rows(self, bbox, table_idx):

diff --git a/camelot/utils.py b/camelot/utils.py
@@ -1399,55 +1399,6 @@ def get_page_layout(
         return layout, dim
 
 
-def get_char_and_text_objects(
-    layout: LTContainer[LTItem],
-) -> tuple[list[LTChar], list[LTTextLineHorizontal], list[LTTextLineVertical]]:
-    """Parse a pdf layout to get text objects.
-
-    Recursively parses pdf layout to get a list of
-    PDFMiner LTChar, LTTextLineHorizontal, LTTextLineVertical objects.
-
-    Parameters
-    ----------
-    layout : object
-        PDFMiner LTContainer object
-            ( LTPage, LTTextLineHorizontal, LTTextLineVertical).
-
-    Returns
-    -------
-    result : tuple
-        Include List of LTChar objects, list of LTTextLineHorizontal objects
-        and list of LTTextLineVertical objects
-
-    """
-    char = []
-    horizontal_text = []
-    vertical_text = []
-
-    try:
-        for _object in layout:
-            if isinstance(_object, LTChar):
-                char.append(_object)
-            elif isinstance(_object, LTTextLineHorizontal):
-                horizontal_text.append(_object)
-                child_char = get_char_objects(_object)
-                char.extend(child_char)
-            elif isinstance(_object, LTTextLineVertical):
-                vertical_text.append(_object)
-                child_char = get_char_objects(_object)
-                char.extend(child_char)
-            elif isinstance(_object, LTContainer):
-                child_char, child_horizontal_text, child_vertical_text = (
-                    get_char_and_text_objects(_object)
-                )
-                char.extend(child_char)
-                horizontal_text.extend(child_horizontal_text)
-                vertical_text.extend(child_vertical_text)
-    except AttributeError:
-        pass
-    return char, horizontal_text, vertical_text
-
-
 def get_char_objects(layout: LTContainer[Any]) -> list[LTChar]:
     """Get charachter objects from a pdf layout.
 
@@ -1477,9 +1428,11 @@ def get_char_objects(layout: LTContainer[Any]) -> list[LTChar]:
     return char
 
 
-def get_image_and_text_objects(
+def get_image_char_and_text_objects(
     layout: LTContainer[LTItem],
-) -> tuple[list[LTImage], list[LTTextLineHorizontal], list[LTTextLineVertical]]:
+) -> tuple[
+    list[LTImage], list[LTChar], list[LTTextLineHorizontal], list[LTTextLineVertical]
+]:
     """Parse a PDF layout to get objects.
 
     Recursively parses pdf layout to get a list of
@@ -1499,6 +1452,7 @@ def get_image_and_text_objects(
 
     """
     image = []
+    char = []
     horizontal_text = []
     vertical_text = []
 
@@ -1510,13 +1464,17 @@ def get_image_and_text_objects(
                 horizontal_text.append(_object)
             elif isinstance(_object, LTTextLineVertical):
                 vertical_text.append(_object)
+            if isinstance(_object, LTChar):
+                char.append(_object)
             elif isinstance(_object, LTContainer):
-                child_image, child_horizontal_text, child_vertical_text = (
-                    get_image_and_text_objects(_object)
+                child_image, child_char, child_horizontal_text, child_vertical_text = (
+                    get_image_char_and_text_objects(_object)
                 )
                 image.extend(child_image)
+                child_char = get_char_objects(_object)
+                char.extend(child_char)
                 horizontal_text.extend(child_horizontal_text)
                 vertical_text.extend(child_vertical_text)
     except AttributeError:
         pass
-    return image, horizontal_text, vertical_text
+    return image, char, horizontal_text, vertical_text
diff --git a/pypdf_table_extraction/utils.py b/pypdf_table_extraction/utils.py
@@ -16,9 +16,8 @@
 from camelot.utils import find_columns_boundaries  # noqa F401
 from camelot.utils import flag_font_size  # noqa F401
 from camelot.utils import flavor_to_kwargs  # noqa F401
-from camelot.utils import get_char_and_text_objects  # noqa F401
 from camelot.utils import get_char_objects  # noqa F401
-from camelot.utils import get_image_and_text_objects  # noqa F401
+from camelot.utils import get_image_char_and_text_objects  # noqa F401
 from camelot.utils import get_index_closest_point  # noqa F401
 from camelot.utils import get_page_layout  # noqa F401
 from camelot.utils import get_rotation  # noqa F401