feat(table_of_contents): implement initial version (#52)

alphanome-ai · Nov 27, 2023 · 3496b21 · 3496b21
1 parent d824b84
commit 3496b21
Show file tree

Hide file tree

Showing 6 changed files with 144 additions and 0 deletions.
diff --git a/sec_parser/processing_engine/core.py b/sec_parser/processing_engine/core.py
@@ -32,6 +32,9 @@
     SupplementaryTextClassifier,
 )
 from sec_parser.processing_steps.table_classifier import TableClassifier
+from sec_parser.processing_steps.table_of_contents_classifier import (
+    TableOfContentsClassifier,
+)
 from sec_parser.processing_steps.text_classifier import TextClassifier
 from sec_parser.processing_steps.text_element_merger import TextElementMerger
 from sec_parser.processing_steps.title_classifier import TitleClassifier
@@ -46,6 +49,7 @@
     NotYetClassifiedElement,
     TextElement,
 )
+from sec_parser.semantic_elements.table_element.table_element import TableElement
 
 if TYPE_CHECKING:  # pragma: no cover
     from sec_parser.processing_engine.html_tag import HtmlTag
@@ -167,6 +171,7 @@ def get_default_steps(
             EmptyElementClassifier(types_to_process={NotYetClassifiedElement}),
             TopLevelSectionManagerFor10Q(types_to_process={NotYetClassifiedElement}),
             TableClassifier(types_to_process={NotYetClassifiedElement}),
+            TableOfContentsClassifier(types_to_process={TableElement}),
             TextClassifier(types_to_process={NotYetClassifiedElement}),
             HighlightedTextClassifier(types_to_process={TextElement}),
             SupplementaryTextClassifier(

diff --git a/sec_parser/processing_engine/html_tag.py b/sec_parser/processing_engine/html_tag.py
@@ -21,6 +21,7 @@
 from sec_parser.utils.bs4_.has_tag_children import has_tag_children
 from sec_parser.utils.bs4_.has_text_outside_tags import has_text_outside_tags
 from sec_parser.utils.bs4_.is_unary_tree import is_unary_tree
+from sec_parser.utils.bs4_.table_check_data_cell import check_table_contains_text_page
 from sec_parser.utils.bs4_.table_to_markdown import TableToMarkdown
 from sec_parser.utils.bs4_.text_styles_metrics import compute_text_styles_metrics
 from sec_parser.utils.bs4_.without_tags import without_tags
@@ -293,6 +294,9 @@ def get_approx_table_metrics(self) -> ApproxTableMetrics:
             self._approx_table_metrics = get_approx_table_metrics(self._bs4)
         return self._approx_table_metrics
 
+    def is_table_of_content(self) -> bool:
+        """
+        Report whether this tag looks like a table of contents.
+
+        Delegates the decision to `check_table_contains_text_page`,
+        passing it the wrapped bs4 tag.
+        """
+        looks_like_toc = check_table_contains_text_page(self._bs4)
+        return looks_like_toc
+
     def table_to_markdown(self) -> str:
         if self._markdown_table is None:
             self._markdown_table = TableToMarkdown(self._bs4).convert()

diff --git a/sec_parser/processing_steps/table_of_contents_classifier.py b/sec_parser/processing_steps/table_of_contents_classifier.py
@@ -0,0 +1,51 @@
+from __future__ import annotations
+
+from typing import TYPE_CHECKING
+
+from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
+    AbstractElementwiseProcessingStep,
+    ElementProcessingContext,
+)
+from sec_parser.semantic_elements.table_element.table_of_contents_element import (
+    TableOfContentsElement,
+)
+
+if TYPE_CHECKING:  # pragma: no cover
+    from sec_parser.semantic_elements.abstract_semantic_element import (
+        AbstractSemanticElement,
+    )
+
+
+class TableOfContentsClassifier(AbstractElementwiseProcessingStep):
+    """
+    TableOfContentsClassifier class for converting elements into
+    TableOfContentsElement instances.
+
+    Each element is inspected via `html_tag.is_table_of_content()`. Elements
+    for which that check succeeds are replaced with a TableOfContentsElement
+    created from them; every other element is returned unchanged.
+    """
+
+    def __init__(
+        self,
+        *,
+        types_to_process: set[type[AbstractSemanticElement]] | None = None,
+        types_to_exclude: set[type[AbstractSemanticElement]] | None = None,
+    ) -> None:
+        """
+        Forward both type filters, unchanged, to the base processing step.
+        """
+        super().__init__(
+            types_to_process=types_to_process,
+            types_to_exclude=types_to_exclude,
+        )
+
+    def _process_element(
+        self,
+        element: AbstractSemanticElement,
+        _: ElementProcessingContext,
+    ) -> AbstractSemanticElement:
+        """
+        Return a TableOfContentsElement if the element's HTML tag is a
+        table of contents, otherwise return the element itself.
+
+        The processing context is not used by this step, hence the `_` name.
+        """
+        # Guard clause: anything that is not a table of contents passes through.
+        if not element.html_tag.is_table_of_content():
+            return element
+
+        return TableOfContentsElement.create_from_element(
+            element,
+            log_origin=self.__class__.__name__,
+        )
diff --git a/sec_parser/semantic_elements/table_element/table_of_contents_element.py b/sec_parser/semantic_elements/table_element/table_of_contents_element.py
@@ -0,0 +1,7 @@
+from __future__ import annotations
+
+from sec_parser.semantic_elements.table_element.table_element import TableElement
+
+
+class TableOfContentsElement(TableElement):
+    """
+    The TableOfContentsElement class represents a table element that has
+    been identified as a table of contents.
+    """
diff --git a/sec_parser/utils/bs4_/table_check_data_cell.py b/sec_parser/utils/bs4_/table_check_data_cell.py
@@ -0,0 +1,29 @@
+import bs4
+
+from sec_parser.utils.bs4_.get_single_table import get_single_table
+
+
+def check_table_contains_text_page(bs4_tag: bs4.Tag) -> bool:
+    """
+    Decide whether the given bs4 tag holds a table of contents.
+
+    The table is obtained with `get_single_table`, and each of its <td>
+    tags (data cells) is examined: the cell's text, with surrounding
+    whitespace removed, is passed to `is_page_data_cell`.
+
+    Returns True as soon as one data cell matches, and False when no
+    data cell does.
+    """
+    data_cells = get_single_table(bs4_tag).find_all("td")
+    for data_cell in data_cells:
+        cell_text = data_cell.text.strip()
+        if is_page_data_cell(cell_text):
+            return True
+    return False
+
+def is_page_data_cell(data_cell_text: str) -> bool:
+    """
+    Tell whether the given cell text is a "page" column label.
+
+    The comparison ignores letter case but is otherwise exact: the
+    lowercased text must equal "page", "page no." or "page number".
+    Surrounding whitespace is not removed here.
+    """
+    page_labels = ("page", "page no.", "page number")
+    lowered_text = data_cell_text.lower()
+    return any(lowered_text == label for label in page_labels)
diff --git a/tests/unit/processing_steps/test_table_of_contents_classifier.py b/tests/unit/processing_steps/test_table_of_contents_classifier.py
@@ -0,0 +1,48 @@
+import pytest
+
+from sec_parser.processing_steps.table_of_contents_classifier import TableOfContentsClassifier
+from sec_parser.semantic_elements.table_element.table_of_contents_element import TableOfContentsElement
+from tests.unit._utils import assert_elements
+from tests.unit.processing_steps._utils import parse_initial_semantic_elements
+
+
+@pytest.mark.parametrize(
+    ("name", "html_str", "expected_elements"),
+    values := [
+        (
+            "simple",
+            """
+                <div>
+                <div>
+                <table>
+                    <tr>
+                        <td>Row 1 content</td>
+                    </tr>
+                    <tr>
+                        <td>Page</td>
+                    </tr>
+                </table>
+                </div>
+                </div>
+            """,
+            [
+                {"type": TableOfContentsElement, "tag": "div"},
+            ],
+        ),
+    ],
+    ids=[case[0] for case in values],
+)
+def test_table_of_contents_classifier(name, html_str, expected_elements):
+    """
+    Verify that TableOfContentsClassifier turns the elements produced by
+    `parse_initial_semantic_elements` into the expected elements.
+
+    `name` only serves as the parametrized test id.
+    """
+    # Arrange
+    initial_elements = parse_initial_semantic_elements(html_str)
+
+    # Act
+    classified_elements = TableOfContentsClassifier().process(initial_elements)
+
+    # Assert
+    assert_elements(classified_elements, expected_elements)