Skip to content

Commit

Permalink
feat(table_of_contents): implement initial version (#52)
Browse files Browse the repository at this point in the history
  • Loading branch information
deenaawny-github-account authored Nov 27, 2023
1 parent d824b84 commit 3496b21
Show file tree
Hide file tree
Showing 6 changed files with 144 additions and 0 deletions.
5 changes: 5 additions & 0 deletions sec_parser/processing_engine/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,9 @@
SupplementaryTextClassifier,
)
from sec_parser.processing_steps.table_classifier import TableClassifier
from sec_parser.processing_steps.table_of_contents_classifier import (
TableOfContentsClassifier,
)
from sec_parser.processing_steps.text_classifier import TextClassifier
from sec_parser.processing_steps.text_element_merger import TextElementMerger
from sec_parser.processing_steps.title_classifier import TitleClassifier
Expand All @@ -46,6 +49,7 @@
NotYetClassifiedElement,
TextElement,
)
from sec_parser.semantic_elements.table_element.table_element import TableElement

if TYPE_CHECKING: # pragma: no cover
from sec_parser.processing_engine.html_tag import HtmlTag
Expand Down Expand Up @@ -167,6 +171,7 @@ def get_default_steps(
EmptyElementClassifier(types_to_process={NotYetClassifiedElement}),
TopLevelSectionManagerFor10Q(types_to_process={NotYetClassifiedElement}),
TableClassifier(types_to_process={NotYetClassifiedElement}),
TableOfContentsClassifier(types_to_process={TableElement}),
TextClassifier(types_to_process={NotYetClassifiedElement}),
HighlightedTextClassifier(types_to_process={TextElement}),
SupplementaryTextClassifier(
Expand Down
4 changes: 4 additions & 0 deletions sec_parser/processing_engine/html_tag.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,6 +21,7 @@
from sec_parser.utils.bs4_.has_tag_children import has_tag_children
from sec_parser.utils.bs4_.has_text_outside_tags import has_text_outside_tags
from sec_parser.utils.bs4_.is_unary_tree import is_unary_tree
from sec_parser.utils.bs4_.table_check_data_cell import check_table_contains_text_page
from sec_parser.utils.bs4_.table_to_markdown import TableToMarkdown
from sec_parser.utils.bs4_.text_styles_metrics import compute_text_styles_metrics
from sec_parser.utils.bs4_.without_tags import without_tags
Expand Down Expand Up @@ -293,6 +294,9 @@ def get_approx_table_metrics(self) -> ApproxTableMetrics:
self._approx_table_metrics = get_approx_table_metrics(self._bs4)
return self._approx_table_metrics

def is_table_of_content(self) -> bool:
    """
    Report whether this tag passes the table-of-contents check.

    The decision is delegated entirely to
    `check_table_contains_text_page`, which is handed the bs4 object
    stored on this instance.
    """
    looks_like_toc = check_table_contains_text_page(self._bs4)
    return looks_like_toc

def table_to_markdown(self) -> str:
if self._markdown_table is None:
self._markdown_table = TableToMarkdown(self._bs4).convert()
Expand Down
51 changes: 51 additions & 0 deletions sec_parser/processing_steps/table_of_contents_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
from __future__ import annotations

from typing import TYPE_CHECKING

from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import (
AbstractElementwiseProcessingStep,
ElementProcessingContext,
)
from sec_parser.semantic_elements.table_element.table_of_contents_element import (
TableOfContentsElement,
)

if TYPE_CHECKING: # pragma: no cover
from sec_parser.semantic_elements.abstract_semantic_element import (
AbstractSemanticElement,
)


class TableOfContentsClassifier(AbstractElementwiseProcessingStep):
    """
    Processing step that reclassifies elements as TableOfContentsElement.

    Each processed element is asked, through its `html_tag`, whether it is
    a table of contents. An element that qualifies is replaced by a
    TableOfContentsElement created from it; every other element is
    returned unchanged.
    """

    def __init__(
        self,
        *,
        types_to_process: set[type[AbstractSemanticElement]] | None = None,
        types_to_exclude: set[type[AbstractSemanticElement]] | None = None,
    ) -> None:
        """
        Initialise the step.

        Both keyword arguments are forwarded unchanged to the base class
        constructor; this class adds no configuration of its own.
        """
        super().__init__(
            types_to_process=types_to_process,
            types_to_exclude=types_to_exclude,
        )

    def _process_element(
        self,
        element: AbstractSemanticElement,
        _: ElementProcessingContext,
    ) -> AbstractSemanticElement:
        """
        Classify a single element.

        Returns a TableOfContentsElement created from `element` when the
        element's HTML tag reports that it is a table of contents;
        otherwise returns `element` itself. The processing context is
        not used.
        """
        # Guard clause: anything that is not a table of contents passes
        # through untouched.
        if not element.html_tag.is_table_of_content():
            return element

        return TableOfContentsElement.create_from_element(
            element,
            log_origin=self.__class__.__name__,
        )
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from __future__ import annotations

from sec_parser.semantic_elements.table_element.table_element import TableElement


class TableOfContentsElement(TableElement):
    """
    A TableElement subtype with no members of its own.

    It declares no attributes or methods; everything is inherited from
    TableElement, so the class exists only as a distinct type.
    """
29 changes: 29 additions & 0 deletions sec_parser/utils/bs4_/table_check_data_cell.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
import bs4

from sec_parser.utils.bs4_.get_single_table import get_single_table


def check_table_contains_text_page(bs4_tag: bs4.Tag) -> bool:
    """
    Decide whether the given bs4 tag is a table of contents.

    The table obtained from `bs4_tag` via `get_single_table` is scanned
    one <td> tag (data cell) at a time. As soon as a cell's stripped
    text is accepted by `is_page_data_cell` as the word "page", the
    function returns True. If no <td> tag qualifies, it returns False.
    """
    single_table = get_single_table(bs4_tag)

    for data_cell in single_table.find_all("td"):
        cell_text = data_cell.text.strip()
        if is_page_data_cell(cell_text):
            # One matching cell is enough; stop scanning.
            return True

    return False

def is_page_data_cell(data_cell_text: str) -> bool:
    """
    Determine whether the given text expresses the word "page".

    The comparison is case-insensitive and whitespace-tolerant: leading
    and trailing whitespace is ignored, and any run of internal
    whitespace (including non-breaking spaces and line breaks) counts
    as a single space.

    Returns True if the normalized text is exactly "page", "page no."
    or "page number"; otherwise returns False (including for an empty
    string).
    """
    # str.split() with no separator splits on every Unicode whitespace
    # character and discards leading/trailing runs, so re-joining with a
    # single space yields a canonical form such as "page no.".
    normalized = " ".join(data_cell_text.split()).lower()
    return normalized in {"page", "page no.", "page number"}
48 changes: 48 additions & 0 deletions tests/unit/processing_steps/test_table_of_contents_classifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
import pytest

from sec_parser.processing_steps.table_of_contents_classifier import TableOfContentsClassifier
from sec_parser.semantic_elements.table_element.table_of_contents_element import TableOfContentsElement
from tests.unit._utils import assert_elements
from tests.unit.processing_steps._utils import parse_initial_semantic_elements


@pytest.mark.parametrize(
    ("name", "html_str", "expected_elements"),
    values := [
        (
            "simple",
            # The HTML literal is kept flush-left so its content stays
            # exactly as written.
            """
<div>
<div>
<table>
<tr>
<td>Row 1 content</td>
</tr>
<tr>
<td>Page</td>
</tr>
</table>
</div>
</div>
""",
            [
                {"type": TableOfContentsElement, "tag": "div"},
            ],
        ),
    ],
    ids=[case[0] for case in values],
)
def test_table_of_contents_classifier(name, html_str, expected_elements):
    """
    Verify that TableOfContentsClassifier transforms the elements produced
    by `parse_initial_semantic_elements` into the expected elements.

    `name` only labels the parametrized case (it feeds `ids`) and is not
    used in the test body.
    """
    # Arrange: parse the HTML and build a classifier with default filters.
    initial_elements = parse_initial_semantic_elements(html_str)
    classifier = TableOfContentsClassifier()

    # Act: run the classification step over the parsed elements.
    classified = classifier.process(initial_elements)

    # Assert: the output matches the expected type/tag description.
    assert_elements(classified, expected_elements)

0 comments on commit 3496b21

Please sign in to comment.