Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use latest version of objects from object streams (#1) #169

Open
wants to merge 5 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -65,3 +65,9 @@ coverage.xml

# Sphinx documentation
docs/_build/

# Vim
*.sw[op]

# pyenv
.python-version
5 changes: 5 additions & 0 deletions pdfrw/objects/pdfdict.py
Original file line number Diff line number Diff line change
Expand Up @@ -186,6 +186,11 @@ def itervalues(self):
for key, value in self.iteritems():
yield value

def update_indirect(self, other):
    ''' Merge every key/value pair of *other* into this PdfDict,
        storing values as-is (indirect references are kept intact
        rather than being resolved through real_value()).
    '''
    for k, v in iteritems(other):
        self[k] = v

def values(self):
return list((value for key, value in self.iteritems()))

Expand Down
62 changes: 47 additions & 15 deletions pdfrw/pdfreader.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,8 @@
from . import crypt
from .py23_diffs import convert_load, convert_store, iteritems

_PAGE_TREE_MAX_DEPTH = 50000


class PdfReader(PdfDict):

Expand Down Expand Up @@ -182,6 +184,12 @@ def loadindirect(self, key, PdfDict=PdfDict,
result = self.indirect_objects.get(key)
if not isinstance(result, PdfIndirect):
return result

# If the object was loaded from an object stream, return it
result = self.loaded_object_stream_objs.get(key)
if result is not None:
return result

source = self.source
offset = int(self.source.obj_offsets.get(key, '0'))
if not offset:
Expand Down Expand Up @@ -314,14 +322,25 @@ def load_stream_objects(self, object_streams):
sobj = func(objsource)

key = (num, 0)
self.indirect_objects[key] = sobj
if key in self.deferred_objects:
self.deferred_objects.remove(key)

# Mark the object as indirect, and
# add it to the list of streams if it starts a stream
sobj.indirect = key

# We call load_stream_objects on the most recent stream objects
# in the file first, so we don't want to clobber already-stored
# objects.
if key not in self.loaded_object_stream_objs:
self.loaded_object_stream_objs[key] = sobj

if key in self.indirect_objects:
continue

self.indirect_objects[key] = sobj

if key in self.deferred_objects:
self.deferred_objects.remove(key)

def findxref(self, fdata):
''' Find the cross reference section at the end of a file
'''
Expand Down Expand Up @@ -473,18 +492,26 @@ def readpages(self, node):

try:
result = []
stack = [node]
stack = [(node, 0)]
append = result.append
pop = stack.pop
while stack:
node = pop()
node, depth = pop()

# Guard against infinite loops in the page tree
if depth >= _PAGE_TREE_MAX_DEPTH:
log.error('Page tree exceeded max depth')
return []

nodetype = node[typename]
if nodetype == pagename:
append(node)
elif nodetype == pagesname:
stack.extend(reversed(node[kidname]))
stack.extend(
(n, depth + 1) for n in reversed(node[kidname])
)
elif nodetype == catalogname:
stack.append(node[pagesname])
stack.append((node[pagesname], depth + 1))
else:
log.error('Expected /Page or /Pages dictionary, got %s' %
repr(node))
Expand Down Expand Up @@ -601,6 +628,7 @@ def __init__(self, fname=None, fdata=None, decompress=False,
private = self.private
private.indirect_objects = {}
private.deferred_objects = set()
private.loaded_object_stream_objs = {}
private.special = {'<<': self.readdict,
'[': self.readarray,
'endobj': self.empty_obj,
Expand All @@ -617,14 +645,14 @@ def __init__(self, fname=None, fdata=None, decompress=False,
while 1:
source.obj_offsets = {}
trailer, is_stream = self.parsexref(source)
xref_list.append((source.obj_offsets, trailer, is_stream))
prev = trailer.Prev
if prev is None:
token = source.next()
if token != 'startxref' and not xref_list:
source.warning('Expected "startxref" '
'at end of xref table')
break
xref_list.append((source.obj_offsets, trailer, is_stream))
source.floc = int(prev)

# Handle document encryption
Expand All @@ -644,18 +672,22 @@ def __init__(self, fname=None, fdata=None, decompress=False,

self._parse_encrypt_info(source, password, trailer)

if is_stream:
self.load_stream_objects(trailer.object_streams)

while xref_list:
later_offsets, later_trailer, is_stream = xref_list.pop()
# Go through all trailers from earliest to latest and make sure the
# trailer object contains the latest information.
for later_offsets, later_trailer, is_stream in reversed(xref_list):
source.obj_offsets.update(later_offsets)
if is_stream:
trailer.update(later_trailer)
self.load_stream_objects(later_trailer.object_streams)
trailer.update_indirect(later_trailer)
else:
trailer = later_trailer

# Go through all trailers from latest to earliest and load their
# object streams.
while xref_list:
_, later_trailer, is_stream = xref_list.pop(0)
if is_stream:
self.load_stream_objects(later_trailer.object_streams)

trailer.Prev = None

if (trailer.Version and
Expand Down
6 changes: 1 addition & 5 deletions pdfrw/tokens.py
Original file line number Diff line number Diff line change
Expand Up @@ -82,8 +82,6 @@ def _gettoks(self, startloc, intern=intern,
'''
fdata = self.fdata
current = self.current = [(startloc, startloc)]
cache = {}
get_cache = cache.get
while 1:
for match in findtok(fdata, current[0][1]):
current[0] = tokspan = match.span()
Expand Down Expand Up @@ -141,9 +139,7 @@ def _gettoks(self, startloc, intern=intern,
self.exception(('Tokenizer logic incorrect -- '
'should never get here'))

newtok = get_cache(token)
if newtok is None:
newtok = cache[token] = toktype(token)
newtok = toktype(token)
yield newtok
if current[0] is not tokspan:
break
Expand Down
Loading