diff --git a/pyhanko/pdf_utils/reader.py b/pyhanko/pdf_utils/reader.py index 87ae6fa5..d77ac415 100644 --- a/pyhanko/pdf_utils/reader.py +++ b/pyhanko/pdf_utils/reader.py @@ -14,7 +14,7 @@ import re from collections import defaultdict from io import BytesIO -from typing import Dict, Generator, Optional, Set, Tuple, Union +from typing import BinaryIO, Dict, Generator, Optional, Set, Tuple, Union from . import generic, misc from .crypt import ( @@ -38,7 +38,6 @@ logger = logging.getLogger(__name__) - __all__ = [ 'PdfFileReader', 'HistoricalResolver', @@ -132,6 +131,41 @@ def process_data_at_eof(stream) -> int: return startxref +def _read_header_version(stream: BinaryIO) -> Tuple[int, int]: + stream.seek(0) + input_version = None + header = misc.read_until_whitespace(stream, maxchars=20) + # match ignores trailing chars + m = header_regex.match(header) + if m is not None: + major = int(m.group(1)) + minor = int(m.group(2)) + input_version = (major, minor) + if input_version is None: + raise PdfReadError('Illegal PDF header') + return input_version + + +def _read_xrefs_and_trailer( + stream: BinaryIO, handler_ref: PdfHandler, strict: bool +) -> Tuple[XRefCache, XRefBuilder]: + # start at the end to read the trailer & xref table + stream.seek(-1, os.SEEK_END) + # This needs to be recorded for incremental update purposes + last_startxref = process_data_at_eof(stream) + + # Read the xref table + xref_builder = XRefBuilder( + handler=handler_ref, + stream=stream, + strict=strict, + last_startxref=last_startxref, + ) + xref_sections = xref_builder.read_xrefs() + xref_cache = XRefCache(handler_ref, xref_sections) + return xref_cache, xref_builder + + class PdfFileReader(PdfHandler): """Class implementing functionality to read a PDF file and cache certain data about it.""" @@ -151,20 +185,29 @@ def __init__(self, stream, strict: bool = True): problems and also causes some correctable problems to be fatal. Defaults to ``True``. """ - self.security_handler: Optional[SecurityHandler] = None + self._security_handler: Optional[SecurityHandler] = None self.strict = strict self.resolved_objects: Dict[Tuple[int, int], generic.PdfObject] = {} self._header_version = None self._input_version = None self._historical_resolver_cache: Dict[int, HistoricalResolver] = {} self.stream = stream - self.xrefs, self.trailer = self.read() - encrypt_dict = self._get_encryption_params() - if encrypt_dict is not None: - self.security_handler = SecurityHandler.build(encrypt_dict) + # first, read the header & PDF version number + # (version number can be overridden in the document catalog later) + self._header_version = _read_header_version(stream) + self.xrefs, xref_builder = _read_xrefs_and_trailer(stream, self, strict) + self.last_startxref = xref_builder.last_startxref + self.trailer = xref_builder.trailer + self.has_xref_stream = xref_builder.has_xref_stream self._embedded_signatures = None + @property + def security_handler(self): + if self.encrypt_dict and not self._security_handler: + self._security_handler = SecurityHandler.build(self.encrypt_dict) + return self._security_handler + def _xmp_meta_view(self) -> Optional[DocumentMetadata]: try: from pyhanko.pdf_utils.metadata import xmp_xml @@ -280,15 +323,25 @@ def _get_object_from_stream(self, idnum, stmnum, idx): else: return generic.NullObject() - def _get_encryption_params(self) -> Optional[generic.DictionaryObject]: + @property + def encrypt_dict(self) -> Optional[generic.DictionaryObject]: try: encrypt_ref = self.trailer.raw_get('/Encrypt') except KeyError: return None if isinstance(encrypt_ref, generic.IndirectObject): - return self.get_object(encrypt_ref.reference, never_decrypt=True) + encrypt_dict = self.get_object( + encrypt_ref.reference, never_decrypt=True + ) + elif not self.strict: + encrypt_dict = encrypt_ref else: - return encrypt_ref + raise misc.PdfReadError( + "Encryption settings must be an indirect reference" + ) + if not isinstance(encrypt_dict, generic.DictionaryObject): + raise misc.PdfReadError("Encryption settings must be a dictionary") + return encrypt_dict @property def trailer_view(self) -> generic.DictionaryObject: @@ -475,40 +528,6 @@ def cache_indirect_object(self, generation, idnum, obj): self.resolved_objects[(generation, idnum)] = obj return obj - def read(self): - # first, read the header & PDF version number - # (version number can be overridden in the document catalog later) - stream = self.stream - stream.seek(0) - input_version = None - header = misc.read_until_whitespace(stream, maxchars=20) - # match ignores trailing chars - m = header_regex.match(header) - if m is not None: - major = int(m.group(1)) - minor = int(m.group(2)) - input_version = (major, minor) - if input_version is None: - raise PdfReadError('Illegal PDF header') - self._header_version = input_version - - # start at the end: - stream.seek(-1, os.SEEK_END) - - # This needs to be recorded for incremental update purposes - self.last_startxref = last_startxref = process_data_at_eof(stream) - # Read the xref table - xref_builder = XRefBuilder( - handler=self, - stream=stream, - strict=self.strict, - last_startxref=last_startxref, - ) - xref_sections = xref_builder.read_xrefs() - xref_cache = XRefCache(self, xref_sections) - self.has_xref_stream = xref_builder.has_xref_stream - return xref_cache, xref_builder.trailer - def decrypt(self, password: Union[str, bytes]) -> AuthResult: """ When using an encrypted PDF file with the standard PDF encryption diff --git a/pyhanko/pdf_utils/xref.py b/pyhanko/pdf_utils/xref.py index 5b8d0c65..b43eda7f 100644 --- a/pyhanko/pdf_utils/xref.py +++ b/pyhanko/pdf_utils/xref.py @@ -642,7 +642,7 @@ def __init__( self.sections: List[XRefSection] = [] self.trailer = TrailerDictionary() - self.trailer.container_ref = generic.TrailerReference(self) + self.trailer.container_ref = generic.TrailerReference(handler) self.has_xref_stream = False def _read_xref_stream_object(self): diff --git a/pyhanko_tests/data/pdf/malformed-encrypt-dict1.pdf b/pyhanko_tests/data/pdf/malformed-encrypt-dict1.pdf new file mode 100644 index 00000000..e246c192 Binary files /dev/null and b/pyhanko_tests/data/pdf/malformed-encrypt-dict1.pdf differ diff --git a/pyhanko_tests/data/pdf/malformed-encrypt-dict2.pdf b/pyhanko_tests/data/pdf/malformed-encrypt-dict2.pdf new file mode 100644 index 00000000..53d377e6 Binary files /dev/null and b/pyhanko_tests/data/pdf/malformed-encrypt-dict2.pdf differ diff --git a/pyhanko_tests/test_crypt.py b/pyhanko_tests/test_crypt.py index 3c74c3a9..b0de97a5 100644 --- a/pyhanko_tests/test_crypt.py +++ b/pyhanko_tests/test_crypt.py @@ -493,7 +493,8 @@ def test_pubkey_unsupported_filter(delete_subfilter): out = BytesIO() w.write(out) with pytest.raises(misc.PdfReadError): - PdfFileReader(out) + # noinspection PyStatementEffect + PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data def test_pubkey_encryption_block_cfs_s4(): @@ -505,7 +506,8 @@ def test_pubkey_encryption_block_cfs_s4(): out = BytesIO() w.write(out) with pytest.raises(misc.PdfReadError): - PdfFileReader(out) + # noinspection PyStatementEffect + PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data def test_pubkey_encryption_s5_requires_cfs(): @@ -518,7 +520,8 @@ def test_pubkey_encryption_s5_requires_cfs(): out = BytesIO() w.write(out) with pytest.raises(misc.PdfReadError): - PdfFileReader(out) + # noinspection PyStatementEffect + PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data def test_pubkey_encryption_dict_errors(): @@ -1433,7 +1436,8 @@ def test_legacy_o_u_values(entry): w.write(out) with pytest.raises(misc.PdfError, match="be 32 bytes long"): - PdfFileReader(out) + # noinspection PyStatementEffect + PdfFileReader(out).root['/Pages']['/Kids'][0]['/Content'].data def test_key_length_constraint(): @@ -1529,3 +1533,27 @@ def test_add_crypt_filter_to_stream_without_security_handler(): dummy_stream = generic.StreamObject(stream_data=b"1001") with pytest.raises(misc.PdfStreamError, match="no security handler"): dummy_stream.add_crypt_filter() + + +@pytest.mark.parametrize( + "fname,strict", + [ + ("malformed-encrypt-dict1.pdf", True), + ("malformed-encrypt-dict2.pdf", True), + ("malformed-encrypt-dict2.pdf", False), + ], +) +def test_malformed_crypt(fname, strict): + with open(os.path.join(PDF_DATA_DIR, fname), 'rb') as inf: + r = PdfFileReader(inf, strict=strict) + with pytest.raises(misc.PdfReadError, match='Encryption settings'): + r.encrypt_dict + + +def test_tolerate_direct_encryption_dict_in_nonstrict(): + fname = 'malformed-encrypt-dict1.pdf' + with open(os.path.join(PDF_DATA_DIR, fname), 'rb') as inf: + r = PdfFileReader(inf, strict=False) + r.decrypt('ownersecret') + data = r.root['/Pages']['/Kids'][0]['/Contents'].data + assert b'Hello' in data