From 9734bf360c3b9e775dc3d5d1e2a7d4f85a9b9122 Mon Sep 17 00:00:00 2001 From: "Cimon Lucas (LCM)" Date: Sun, 20 Aug 2023 15:22:02 +0200 Subject: [PATCH] Fixing test_iss1601 --- pypdf/_merger.py | 14 ++++++++------ pypdf/generic/_data_structures.py | 3 ++- tests/__init__.py | 17 +++++++++++++++++ tests/test_utils.py | 20 ++++++++++++++++++++ tests/test_writer.py | 17 +++++++---------- 5 files changed, 54 insertions(+), 17 deletions(-) diff --git a/pypdf/_merger.py b/pypdf/_merger.py index 924f18495..7db7262c2 100644 --- a/pypdf/_merger.py +++ b/pypdf/_merger.py @@ -24,6 +24,7 @@ # CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) # ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE # POSSIBILITY OF SUCH DAMAGE. +from perfs import exec_time import warnings from io import BytesIO, FileIO, IOBase @@ -321,12 +322,13 @@ def write(self, fileobj: Union[Path, StrByteType]) -> None: # Add pages to the PdfWriter # The commented out line below was replaced with the two lines below it # to allow PdfMerger to work with PyPdf 1.13 - for page in self.pages: - self.output.add_page(page.pagedata) - pages_obj = cast(Dict[str, Any], self.output._pages.get_object()) - page.out_pagedata = self.output.get_reference( - pages_obj[PA.KIDS][-1].get_object() - ) + with exec_time("for page in self.pages:"): + for page in self.pages: + self.output.add_page(page.pagedata) + pages_obj = cast(Dict[str, Any], self.output._pages.get_object()) + page.out_pagedata = self.output.get_reference( + pages_obj[PA.KIDS][-1].get_object() + ) # Once all pages are added, create outline items to point at those pages self._write_dests() diff --git a/pypdf/generic/_data_structures.py b/pypdf/generic/_data_structures.py index e2b63078e..4212e28ae 100644 --- a/pypdf/generic/_data_structures.py +++ b/pypdf/generic/_data_structures.py @@ -988,6 +988,7 @@ class ContentStream(DecodedStreamObject): * when .set_data() is called, ._operations is set to None * when .operations is set, ._data is set to None """ + def __init__( self, stream: Any, @@ -1232,7 +1233,7 @@ def write_to_stream( self, stream: StreamType, encryption_key: Union[None, str, bytes] = None ) -> None: if not self._data and self._operations: - self.get_data() # this ensures ._data is rebuilt for ContentStream + self.get_data() # this ensures ._data is rebuilt super().write_to_stream(stream, encryption_key) diff --git a/tests/__init__.py b/tests/__init__.py index 6b4305bec..0e86c1a8f 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -87,3 +87,20 @@ def get_object(self) -> "DummyObj": def get_reference(self, obj): return IndirectObject(idnum=1, generation=1, pdf=self) + + +def is_sublist(child_list, parent_list): + """ + Check if child_list is a sublist of parent_list, with respect to + * elements order + * elements repetition + + Elements are compared using `==` + """ + if len(child_list) == 0: + return True + if len(parent_list) == 0: + return False + if parent_list[0] == child_list[0]: + return is_sublist(child_list[1:], parent_list[1:]) + return is_sublist(child_list, parent_list[1:]) diff --git a/tests/test_utils.py b/tests/test_utils.py index b87001773..47c04214a 100644 --- a/tests/test_utils.py +++ b/tests/test_utils.py @@ -25,6 +25,8 @@ ) from pypdf.errors import DeprecationError, PdfReadError, PdfStreamError +from . import is_sublist + TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent RESOURCE_ROOT = PROJECT_ROOT / "resources" @@ -351,3 +353,21 @@ def test_parse_datetime_err(): parse_iso8824_date("D:20210408T054711Z") assert ex.value.args[0] == "Can not convert date: D:20210408T054711Z" assert parse_iso8824_date("D:20210408054711").tzinfo is None + + +def test_is_sublist(): + # Basic checks: + assert is_sublist([0, 1], [0, 1, 2]) is True + assert is_sublist([0, 2], [0, 1, 2]) is True + assert is_sublist([1, 2], [0, 1, 2]) is True + assert is_sublist([0, 3], [0, 1, 2]) is False + # Ensure order is checked: + assert is_sublist([1, 0], [0, 1, 2]) is False + # Ensure duplicates are handled: + assert is_sublist([0, 1, 1], [0, 1, 1, 2]) is True + assert is_sublist([0, 1, 1], [0, 1, 2]) is False + # Edge cases with empty lists: + assert is_sublist([], [0, 1, 2]) is True + assert is_sublist([0, 1], []) is False + # Self-sublist edge case: + assert is_sublist([0, 1, 2], [0, 1, 2]) is True diff --git a/tests/test_writer.py b/tests/test_writer.py index 5be8ac497..741b51a28 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -28,7 +28,7 @@ TextStringObject, ) -from . import get_data_from_url +from . import get_data_from_url, is_sublist TESTS_ROOT = Path(__file__).parent.resolve() PROJECT_ROOT = TESTS_ROOT.parent @@ -1238,23 +1238,20 @@ def test_iss1601(): url = "https://github.com/py-pdf/pypdf/files/10579503/badges-38.pdf" name = "badge-38.pdf" reader = PdfReader(BytesIO(get_data_from_url(url, name=name))) + original_cs_operations = ContentStream(reader.pages[0].get_contents(), reader).operations writer = PdfWriter() page_1 = writer.add_blank_page( reader.pages[0].mediabox[2], reader.pages[0].mediabox[3] ) page_1.merge_transformed_page(reader.pages[0], Transformation()) - assert ( - ContentStream(reader.pages[0].get_contents(), reader).get_data() - in page_1.get_contents().get_data() - ) + page_1_cs_operations = page_1.get_contents().operations + assert is_sublist(original_cs_operations, page_1_cs_operations) page_1 = writer.add_blank_page( reader.pages[0].mediabox[2], reader.pages[0].mediabox[3] ) page_1.merge_page(reader.pages[0]) - assert ( - ContentStream(reader.pages[0].get_contents(), reader).get_data() - in page_1.get_contents().get_data() - ) + page_1_cs_operations = page_1.get_contents().operations + assert is_sublist(original_cs_operations, page_1_cs_operations) def test_attachments(): @@ -1539,7 +1536,7 @@ def test_watermark(): @pytest.mark.enable_socket() -@pytest.mark.timeout(2) # this was a lot slower before PR #2086 +@pytest.mark.timeout(4) # this was a lot slower before PR #2086 def test_watermarking_speed(): url = "https://github.com/py-pdf/pypdf/files/11985889/bg.pdf" name = "bgwatermark.pdf"