diff --git a/pyhanko/pdf_utils/crypt/_util.py b/pyhanko/pdf_utils/crypt/_util.py index 01e7cf5f..59f955ef 100644 --- a/pyhanko/pdf_utils/crypt/_util.py +++ b/pyhanko/pdf_utils/crypt/_util.py @@ -15,7 +15,8 @@ def aes_cbc_decrypt(key, data, iv, use_padding=True): decryptor = cipher.decryptor() plaintext = decryptor.update(data) + decryptor.finalize() - if use_padding: + # we tolerate empty messages that don't have padding + if use_padding and len(plaintext) > 0: unpadder = padding.PKCS7(128).unpadder() return unpadder.update(plaintext) + unpadder.finalize() else: diff --git a/pyhanko/pdf_utils/writer.py b/pyhanko/pdf_utils/writer.py index 924be241..96061b50 100644 --- a/pyhanko/pdf_utils/writer.py +++ b/pyhanko/pdf_utils/writer.py @@ -4,6 +4,7 @@ for the original license. """ +import logging import os import typing from typing import Dict, Iterable, List, Optional, Set, Tuple, Union, cast @@ -53,6 +54,8 @@ 'copy_into_new_writer', ] +logger = logging.getLogger(__name__) + # TODO move this to content.py? def init_xobject_dictionary( @@ -755,88 +758,13 @@ def import_object( a new instance. """ - return self._import_object(obj, {}, obj_stream) - - def _import_object( - self, obj: generic.PdfObject, reference_map: dict, obj_stream - ) -> generic.PdfObject: - # TODO check the spec for guidance on fonts. Do font identifiers have - # to be globally unique? - - # TODO deal with container_ref - - if isinstance(obj, generic.DecryptedObjectProxy): - obj = obj.decrypted - if isinstance(obj, generic.IndirectObject): - try: - return reference_map[obj.reference] - except KeyError: - refd = obj.get_object() - # Add a placeholder to reserve the reference value. - # This ensures correct behaviour in recursive calls - # with self-references. - new_ido = self.allocate_placeholder() - reference_map[obj.reference] = new_ido - imported = self._import_object(refd, reference_map, obj_stream) - - # if the imported object is a bare reference and/or a stream - # object, we can't put it into an object stream. - if isinstance(imported, OBJSTREAM_FORBIDDEN): - obj_stream = None - - # fill in the placeholder - self.add_object( - imported, obj_stream=obj_stream, idnum=new_ido.idnum - ) - return new_ido - elif isinstance(obj, generic.DictionaryObject): - raw_dict = { - k: self._import_object(v, reference_map, obj_stream) - for k, v in obj.items() - if k != '/Metadata' - } - try: - # make sure to import metadata streams as such - meta_ref = obj.get_value_as_reference('/Metadata') - # ensure a MetadataStream object ends up in the cache - meta_ref.get_pdf_handler().get_object( - meta_ref, as_metadata_stream=True - ) - # ...then import the reference - raw_dict['/Metadata'] = self._import_object( - generic.IndirectObject( - meta_ref.idnum, meta_ref.generation, meta_ref.pdf - ), - reference_map, - obj_stream, - ) - except (KeyError, IndirectObjectExpected): - pass - - if isinstance(obj, generic.StreamObject): - stm_cls = generic.StreamObject - # again, make sure to import metadata streams as such - try: - # noinspection PyUnresolvedReferences - from pyhanko.pdf_utils.metadata import xmp_xml - - if isinstance(obj, xmp_xml.MetadataStream): - stm_cls = xmp_xml.MetadataStream - except ImportError: # pragma: nocover - pass - # In the vast majority of use cases, I'd expect the content - # to be available in encoded form by default. - # By initialising the stream object in this way, we avoid - # a potentially costly decoding operation. - return stm_cls(raw_dict, encoded_data=obj.encoded_data) - else: - return generic.DictionaryObject(raw_dict) - elif isinstance(obj, generic.ArrayObject): - return generic.ArrayObject( - self._import_object(v, reference_map, obj_stream) for v in obj - ) - else: - return obj + importer = _ObjectImporter( + source=obj.get_container_ref().get_pdf_handler(), + target=self, + obj_stream=obj_stream, + reference_map={}, + ) + return importer.import_object(obj) def import_page_as_xobject( self, other: PdfHandler, page_ix=0, inherit_filters=True @@ -1222,6 +1150,138 @@ def _populate_trailer(self, trailer): super()._populate_trailer(trailer) +class _ObjectImporter: + + def __init__( + self, + source: PdfHandler, + target: BasePdfFileWriter, + reference_map: Dict[generic.Reference, generic.IndirectObject], + obj_stream: Optional[ObjectStream], + ): + self.source = source + self.target = target + self.obj_stream = obj_stream + self.queued_references: List[ + Tuple[generic.Reference, generic.Reference] + ] = [] + self.reference_map = reference_map + + def import_object(self, obj: generic.PdfObject) -> generic.PdfObject: + result = self._ingest(obj) + + while self.queued_references: + source_ref, target_ref = self.queued_references.pop() + source_obj = source_ref.get_object() + imported = self._ingest(source_obj) + + # if the imported object is a bare reference and/or a stream + # object, we can't put it into an object stream. + if isinstance(imported, OBJSTREAM_FORBIDDEN): + obj_stream = None + else: + obj_stream = self.obj_stream + + # fill in the placeholder + self.target.add_object( + imported, obj_stream=obj_stream, idnum=target_ref.idnum + ) + + return result + + def _ingest(self, obj: generic.PdfObject): + if isinstance(obj, generic.DecryptedObjectProxy): + obj = obj.decrypted + if isinstance(obj, generic.IndirectObject): + return self.process_reference(obj.reference) + elif isinstance(obj, generic.DictionaryObject): + raw_dict = { + k: self._ingest(v) for k, v in obj.items() if k != '/Metadata' + } + try: + # make sure to import metadata streams as such + meta_ref = obj.get_value_as_reference('/Metadata') + # ensure a MetadataStream object ends up in the cache + meta_ref.get_pdf_handler().get_object( + meta_ref, as_metadata_stream=True + ) + # ...then import the reference + raw_dict['/Metadata'] = self.process_reference(meta_ref) + except (KeyError, IndirectObjectExpected): + pass + + if isinstance(obj, generic.StreamObject): + stm_cls = generic.StreamObject + # again, make sure to import metadata streams as such + try: + # noinspection PyUnresolvedReferences + from pyhanko.pdf_utils.metadata import xmp_xml + + if isinstance(obj, xmp_xml.MetadataStream): + stm_cls = xmp_xml.MetadataStream + except ImportError: # pragma: nocover + pass + # In the vast majority of use cases, I'd expect the content + # to be available in encoded form by default. + # By initialising the stream object in this way, we avoid + # a potentially costly decoding operation. + return stm_cls(raw_dict, encoded_data=obj.encoded_data) + else: + return generic.DictionaryObject(raw_dict) + elif isinstance(obj, generic.ArrayObject): + return generic.ArrayObject(self._ingest(v) for v in obj) + else: + return obj + + def process_reference(self, ref: generic.Reference) -> generic.PdfObject: + try: + return self.reference_map[ref] + except KeyError: + # Add a placeholder to reserve the reference value. + new_ido = self.target.allocate_placeholder() + self.reference_map[ref] = new_ido + self.queued_references.append((ref, new_ido.reference)) + return new_ido + + def preprocess_signature_data(self): + # Signature /Contents is never encrypted => ensure we respect that + # (even though the import operation is guaranteed to break the signature + # there are valid use cases for stripping the encryption on such files, + # e.g. for downstream processing) + from ..sign.fields import enumerate_sig_fields + + signature_dict_refs = [ + field_value.reference + for fq_name, field_value, field_ref in enumerate_sig_fields( + self.source, filled_status=True + ) + # this is the case in all valid PDFs + if isinstance(field_value, generic.IndirectObject) + ] + if signature_dict_refs: + logger.warning( + "Source document contains filled signature fields--the copy " + "operation will invalidate them." + ) + for ref in signature_dict_refs: + sig_dict = ref.get_object() + assert isinstance(sig_dict, generic.DictionaryObject) + raw_dict = { + k: self._ingest(v) + for k, v in sig_dict.items() + if k != '/Contents' + } + raw_dict['/Contents'] = generic.ByteStringObject( + sig_dict.raw_get( + '/Contents', decrypt=generic.EncryptedObjAccess.RAW + ).original_bytes + ) + self.reference_map[ref] = self.target.add_object( + generic.DictionaryObject(raw_dict), + obj_stream=None, + ) + + def copy_into_new_writer( input_handler: PdfHandler, writer_kwargs: Optional[dict] = None ) -> PdfFileWriter: @@ -1254,16 +1314,25 @@ def copy_into_new_writer( w = PdfFileWriter(init_page_tree=False, **writer_kwargs) input_root_ref = input_handler.root_ref output_root_ref = w.root_ref - # call _import_object in such a way that we translate the input handler's + # call _ObjectImporter in such a way that we translate the input handler's # root to the new writer's root. # From a technical PoV this doesn't matter, but it makes the output file # somewhat "cleaner" (i.e. it doesn't leave an orphaned document catalog # cluttering up the file) - new_root_dict = w._import_object( - input_handler.root, - reference_map={input_root_ref: output_root_ref}, + importer = _ObjectImporter( + source=input_handler, + target=w, + reference_map={ + input_root_ref: generic.IndirectObject( + idnum=output_root_ref.idnum, + generation=output_root_ref.generation, + pdf=w, + ) + }, obj_stream=None, ) + importer.preprocess_signature_data() + new_root_dict = importer.import_object(input_handler.root) # override the old root ref ix = (output_root_ref.generation, output_root_ref.idnum) w.objects[ix] = new_root_dict @@ -1278,9 +1347,13 @@ def copy_into_new_writer( except KeyError: info_dict = None if info_dict is not None: - imported_info = w._import_object( - info_dict, reference_map={}, obj_stream=None + importer = _ObjectImporter( + source=input_handler, + target=w, + reference_map={}, + obj_stream=None, ) + imported_info = importer.import_object(info_dict) w._info = w.add_object(imported_info) return w diff --git a/pyhanko_tests/data/pdf/minimal-aes256-empty-encrypted-string.pdf b/pyhanko_tests/data/pdf/minimal-aes256-empty-encrypted-string.pdf new file mode 100644 index 00000000..e5bfe2ba Binary files /dev/null and b/pyhanko_tests/data/pdf/minimal-aes256-empty-encrypted-string.pdf differ diff --git a/pyhanko_tests/data/pdf/signed-encrypted-pubkey-with-catalog-ref.pdf b/pyhanko_tests/data/pdf/signed-encrypted-pubkey-with-catalog-ref.pdf new file mode 100644 index 00000000..db188395 Binary files /dev/null and b/pyhanko_tests/data/pdf/signed-encrypted-pubkey-with-catalog-ref.pdf differ diff --git a/pyhanko_tests/test_crypt.py b/pyhanko_tests/test_crypt.py index 4709637c..b384289c 100644 --- a/pyhanko_tests/test_crypt.py +++ b/pyhanko_tests/test_crypt.py @@ -1569,3 +1569,19 @@ def test_tolerate_direct_encryption_dict_in_nonstrict(): r.decrypt('ownersecret') data = r.root['/Pages']['/Kids'][0]['/Contents'].data assert b'Hello' in data + + +def test_tolerate_empty_encrypted_string(): + with open( + os.path.join(PDF_DATA_DIR, 'minimal-aes256-empty-encrypted-string.pdf'), + 'rb', + ) as inf: + r = PdfFileReader(inf) + r.decrypt('secret') + obj = r.root.raw_get('/Blah', decrypt=generic.EncryptedObjAccess.PROXY) + assert isinstance(obj, generic.DecryptedObjectProxy) + decrypted = obj.decrypted + assert isinstance( + decrypted, (generic.TextStringObject, generic.ByteStringObject) + ) + assert decrypted.original_bytes == b"" diff --git a/pyhanko_tests/test_sign_encrypted.py b/pyhanko_tests/test_sign_encrypted.py index b535be64..19586088 100644 --- a/pyhanko_tests/test_sign_encrypted.py +++ b/pyhanko_tests/test_sign_encrypted.py @@ -5,12 +5,14 @@ from pyhanko.pdf_utils.incremental_writer import IncrementalPdfFileWriter from pyhanko.pdf_utils.reader import PdfFileReader +from pyhanko.pdf_utils.writer import copy_into_new_writer from pyhanko.sign import signers from pyhanko.sign.diff_analysis import ModificationLevel from pyhanko.sign.signers.pdf_signer import ( DSSContentSettings, SigDSSPlacementPreference, ) +from pyhanko.sign.validation import validate_pdf_signature from pyhanko_tests.samples import ( MINIMAL_AES256, MINIMAL_ONE_FIELD_AES256, @@ -18,11 +20,13 @@ MINIMAL_PUBKEY_ONE_FIELD_AES256, MINIMAL_PUBKEY_ONE_FIELD_RC4, MINIMAL_RC4, + PDF_DATA_DIR, PUBKEY_SELFSIGNED_DECRYPTER, ) from pyhanko_tests.signing_commons import ( DUMMY_HTTP_TS, FROM_CA, + SIMPLE_V_CONTEXT, live_testing_vc, val_trusted, ) @@ -175,3 +179,50 @@ def test_sign_encrypted_with_post_sign(requests_mock, password, file): assert status.modification_level == ModificationLevel.LTA_UPDATES assert len(r.embedded_regular_signatures) == 1 assert len(r.embedded_timestamp_signatures) == 1 + + +def test_copy_encrypted_signed_file(): + w = IncrementalPdfFileWriter(BytesIO(MINIMAL_ONE_FIELD_AES256)) + w.encrypt("ownersecret") + out = signers.sign_pdf( + w, + signers.PdfSignatureMetadata(), + signer=FROM_CA, + existing_fields_only=True, + ) + + r = PdfFileReader(out) + r.decrypt("ownersecret") + w = copy_into_new_writer(r) + out2 = BytesIO() + w.write(out2) + + r = PdfFileReader(out2) + assert not r.encrypted + s = r.embedded_signatures[0] + s.compute_integrity_info() + status = validate_pdf_signature(s, SIMPLE_V_CONTEXT(), skip_diff=True) + assert not status.intact + + +def test_copy_file_with_mdp_signature_and_backref(): + # This file has /Data in a signature reference dictionary + # pointing back to the root (which is sometimes still seen in + # FieldMDP signatures generated by Acrobat, among others) + + fname = f"{PDF_DATA_DIR}/signed-encrypted-pubkey-with-catalog-ref.pdf" + with open(fname, 'rb') as inf: + + r = PdfFileReader(inf) + r.decrypt_pubkey(PUBKEY_SELFSIGNED_DECRYPTER) + + w = copy_into_new_writer(r) + out2 = BytesIO() + w.write(out2) + + r = PdfFileReader(out2) + assert not r.encrypted + s = r.embedded_signatures[0] + s.compute_integrity_info() + status = validate_pdf_signature(s, SIMPLE_V_CONTEXT(), skip_diff=True) + assert not status.intact diff --git a/pyhanko_tests/test_utils.py b/pyhanko_tests/test_utils.py index 1d0a8897..b87d7ecd 100644 --- a/pyhanko_tests/test_utils.py +++ b/pyhanko_tests/test_utils.py @@ -2108,3 +2108,42 @@ def test_merge_resource_conflict(): } ), ) + + +def test_copy_deep_object_graph(): + f = BytesIO(MINIMAL) + w = IncrementalPdfFileWriter(f) + cur_obj = w.root['/Blah'] = generic.DictionaryObject() + w.update_root() + for i in range(4000): + next_obj = generic.DictionaryObject() + cur_obj[f'/Blah_{i}'] = generic.ArrayObject([w.add_object(next_obj)]) + cur_obj = next_obj + w.write_in_place() + + r = PdfFileReader(f) + new_w = writer.copy_into_new_writer(r) + out = BytesIO() + new_w.write(out) + + new_r = PdfFileReader(out) + assert len(new_r.root['/Blah']['/Blah_0'][0]['/Blah_1'][0]['/Blah_2']) == 1 + + +def test_copy_root_reference(): + f = BytesIO(MINIMAL) + w = IncrementalPdfFileWriter(f) + arr = generic.ArrayObject( + [generic.IndirectObject(w.root_ref.idnum, w.root_ref.generation, w)] + ) + w.root['/Blah'] = w.add_object(arr) + w.update_root() + w.write_in_place() + + r = PdfFileReader(f) + new_w = writer.copy_into_new_writer(r) + out = BytesIO() + new_w.write(out) + + new_r = PdfFileReader(out) + assert new_r.root['/Blah'].raw_get(0).idnum == w.root_ref.idnum