-
Notifications
You must be signed in to change notification settings - Fork 48
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
feat(table_of_contents): implement initial version (#52)
- Loading branch information
1 parent
d824b84
commit 3496b21
Showing
6 changed files
with
144 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
51 changes: 51 additions & 0 deletions
51
sec_parser/processing_steps/table_of_contents_classifier.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,51 @@ | ||
from __future__ import annotations | ||
|
||
from typing import TYPE_CHECKING | ||
|
||
from sec_parser.processing_steps.abstract_classes.abstract_elementwise_processing_step import ( | ||
AbstractElementwiseProcessingStep, | ||
ElementProcessingContext, | ||
) | ||
from sec_parser.semantic_elements.table_element.table_of_contents_element import ( | ||
TableOfContentsElement, | ||
) | ||
|
||
if TYPE_CHECKING: # pragma: no cover | ||
from sec_parser.semantic_elements.abstract_semantic_element import ( | ||
AbstractSemanticElement, | ||
) | ||
|
||
|
||
class TableOfContentsClassifier(AbstractElementwiseProcessingStep):
    """
    Processing step that turns table-of-contents candidates into
    TableOfContentsElement instances.

    Each element handed to the step is inspected on its own: when its HTML
    tag reports being a table of contents, the element is replaced by a
    TableOfContentsElement built from it; every other element is passed
    through untouched.
    """

    def __init__(
        self,
        *,
        types_to_process: set[type[AbstractSemanticElement]] | None = None,
        types_to_exclude: set[type[AbstractSemanticElement]] | None = None,
    ) -> None:
        # Both filters are forwarded unchanged to the base processing step.
        super().__init__(
            types_to_exclude=types_to_exclude,
            types_to_process=types_to_process,
        )

    def _process_element(
        self,
        element: AbstractSemanticElement,
        _: ElementProcessingContext,
    ) -> AbstractSemanticElement:
        """
        Return a TableOfContentsElement created from `element` when its HTML
        tag identifies as a table of contents; otherwise return `element`
        itself. The processing context is not used.
        """
        # Strict identity check: only the literal True triggers replacement.
        if element.html_tag.is_table_of_content() is not True:
            return element

        return TableOfContentsElement.create_from_element(
            element,
            log_origin=self.__class__.__name__,
        )
7 changes: 7 additions & 0 deletions
7
sec_parser/semantic_elements/table_element/table_of_contents_element.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
from __future__ import annotations | ||
|
||
from sec_parser.semantic_elements.table_element.table_element import TableElement | ||
|
||
|
||
class TableOfContentsElement(TableElement):
    """
    Marker subclass of TableElement for tables identified as a table of
    contents. It adds no attributes or behavior of its own.
    """
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,29 @@ | ||
import bs4 | ||
|
||
from sec_parser.utils.bs4_.get_single_table import get_single_table | ||
|
||
|
||
def check_table_contains_text_page(bs4_tag: bs4.Tag) -> bool:
    """
    Decide whether the given bs4 tag looks like a table of contents.

    The table is obtained from `bs4_tag` via `get_single_table`, and each of
    its <td> data cells is checked: the stripped cell text is passed to
    `is_page_data_cell`.

    Returns True as soon as one <td> cell is recognized as a "page" cell,
    and False when no such cell exists.
    """
    single_table = get_single_table(bs4_tag)

    for data_cell in single_table.find_all("td"):
        if is_page_data_cell(data_cell.text.strip()):
            return True

    return False
|
||
def is_page_data_cell(data_cell_text: str) -> bool:
    """
    Determine whether the given cell text expresses the word "page".

    The comparison is case-insensitive and whitespace-tolerant: leading and
    trailing whitespace is ignored and every internal run of whitespace
    (including non-breaking spaces, which commonly come from `&nbsp;` in
    HTML) is treated as a single space. After that normalization the text
    must be exactly one of "page", "page no." or "page number".

    Returns True if the normalized text is one of those labels, otherwise
    False (including for empty or whitespace-only text).
    """
    # str.split() without arguments splits on any Unicode whitespace, so
    # "Page\xa0No." and "  page   number " normalize to the plain labels.
    normalized_text = " ".join(data_cell_text.split()).lower()
    return normalized_text in {"page", "page no.", "page number"}
48 changes: 48 additions & 0 deletions
48
tests/unit/processing_steps/test_table_of_contents_classifier.py
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,48 @@ | ||
import pytest | ||
|
||
from sec_parser.processing_steps.table_of_contents_classifier import TableOfContentsClassifier | ||
from sec_parser.semantic_elements.table_element.table_of_contents_element import TableOfContentsElement | ||
from tests.unit._utils import assert_elements | ||
from tests.unit.processing_steps._utils import parse_initial_semantic_elements | ||
|
||
|
||
@pytest.mark.parametrize(
    ("name", "html_str", "expected_elements"),
    values := [
        (
            "simple",
            """
                <div>
                    <div>
                        <table>
                            <tr>
                                <td>Row 1 content</td>
                            </tr>
                            <tr>
                                <td>Page</td>
                            </tr>
                        </table>
                    </div>
                </div>
            """,
            [
                {"type": TableOfContentsElement, "tag": "div"},
            ],
        ),
    ],
    ids=[case[0] for case in values],
)
def test_table_of_contents_classifier(name, html_str, expected_elements):
    """
    Verify that TableOfContentsClassifier transforms the semantic elements
    returned by `parse_initial_semantic_elements` into the expected elements.
    """
    # Arrange
    initial_elements = parse_initial_semantic_elements(html_str)
    classifier = TableOfContentsClassifier()

    # Act
    actual_elements = classifier.process(initial_elements)

    # Assert
    assert_elements(actual_elements, expected_elements)