Skip to content

Commit

Permalink
process shape partly out of page; ignore replacement character `\ufff…
Browse files Browse the repository at this point in the history
…d`; fix empty font name issue; #256
  • Loading branch information
dothinking committed Jan 22, 2024
1 parent 9742d8d commit 346b850
Show file tree
Hide file tree
Showing 3 changed files with 50 additions and 48 deletions.
2 changes: 2 additions & 0 deletions pdf2docx/font/Fonts.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,8 @@ def extract(cls, fitz_doc):
fonts = []
for xref in xrefs:
basename, ext, _, buffer = fitz_doc.extract_font(xref)
if not basename: continue

basename = decode(basename)
name = cls._normalized_font_name(basename)

Expand Down
82 changes: 40 additions & 42 deletions pdf2docx/shape/Shapes.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,4 @@
# -*- coding: utf-8 -*-

'''A group of ``Shape`` instances.
'''
'''A group of ``Shape`` instances.'''

from .Shape import Shape, Stroke, Fill, Hyperlink
from ..common.share import RectType
Expand All @@ -26,30 +23,24 @@ def restore(self, raws:list):
shape = Fill(raw)
# add to list
self.append(shape)

return self


def _update_bbox(self, shape:Shape):
def _update_bbox(self, e:Shape):
''' override. Do nothing.'''
pass


@property
def strokes(self):
''' Stroke Shapes, including table border, text underline and strike-through.
Cache it once calculated since it doesn't change generally.
'''
''' Stroke Shapes, including table border, text underline and strike-through.'''
instances = list(filter(
lambda shape: isinstance(shape, Stroke), self._instances))
return Shapes(instances)


@property
def fillings(self):
''' Fill Shapes, including cell shading and highlight.
Cache it once calculated since it doesn't change generally.
'''
''' Fill Shapes, including cell shading and highlight.'''
# white bg-color is by default, so ignore those fillings
instances = list(filter(
lambda shape: isinstance(shape, Fill) and \
Expand All @@ -72,21 +63,24 @@ def table_strokes(self):
lambda shape: shape.has_potential_type(RectType.BORDER), self._instances))
return ElementCollection(instances)


@property
def table_fillings(self):
'''Potential table shadings.'''
instances = list(filter(
lambda shape: shape.has_potential_type(RectType.SHADING), self._instances))
return ElementCollection(instances)



@property
def text_style_shapes(self):
'''Potential text style based shapes, e.g. underline, strike-through, highlight and hyperlink.'''
f = lambda shape: shape.has_potential_type(RectType.HIGHLIGHT) or \
shape.has_potential_type(RectType.UNDERLINE) or \
shape.has_potential_type(RectType.STRIKE) or \
shape.has_potential_type(RectType.HYPERLINK)
'''Potential text style based shapes,
e.g. underline, strike-through, highlight and hyperlink.'''
def f(shape):
return shape.has_potential_type(RectType.HIGHLIGHT) or \
shape.has_potential_type(RectType.UNDERLINE) or \
shape.has_potential_type(RectType.STRIKE) or \
shape.has_potential_type(RectType.HYPERLINK)
instances = set(filter(f, self._instances))
return ElementCollection(instances)

Expand All @@ -101,19 +95,24 @@ def clean_up(self, max_border_width:float, shape_min_dimension:float):
Args:
max_border_width (float): The max border width.
shape_min_dimension (float): Ignore shape if both width and height is lower than this value.
shape_min_dimension (float): Ignore shape if both width and height
are lower than this value.
"""
if not self._instances: return

# remove small shapes or shapes out of page
# remove small shapes or shapes out of page; and
# update bbox in case part of the shape is out of page
page_bbox = self.parent.bbox
f = lambda shape: shape.bbox.intersects(page_bbox) and \
max(shape.bbox.width, shape.bbox.height)>=shape_min_dimension
cleaned_shapes = list(filter(f, self._instances)) # type: list[Shape]
cleaned_shapes = [] # type: list[Shape]
for s in self:
if max(s.bbox.width, s.bbox.height)<shape_min_dimension: continue # small shapes
bbox_in_page = s.bbox.intersect(page_bbox)
if bbox_in_page.is_empty: continue # shapes out of page
cleaned_shapes.append(s.update_bbox(bbox_in_page)) # ignore out of page part

# merge normal shapes if same filling color
merged_shapes = self._merge_shapes(cleaned_shapes)

# convert Fill instance to Stroke if looks like stroke
shapes = []
for shape in merged_shapes:
Expand All @@ -126,7 +125,7 @@ def clean_up(self, max_border_width:float, shape_min_dimension:float):

# detect semantic type
self._parse_semantic_type()


def assign_to_tables(self, tables:list):
"""Add Shape to associated cells of given tables.
Expand All @@ -136,7 +135,7 @@ def assign_to_tables(self, tables:list):
"""
if not tables: return

# assign shapes to table region
# assign shapes to table region
shapes_in_tables = [[] for _ in tables] # type: list[list[Shape]]
shapes = [] # type: list[Shape]
for shape in self._instances:
Expand All @@ -154,7 +153,7 @@ def assign_to_tables(self, tables:list):
# not possible in current table, then check next table
elif not table.bbox.intersects(shape.bbox):
continue

# Now, this shape belongs to previous layout
else:
shapes.append(shape)
Expand All @@ -169,10 +168,11 @@ def assign_to_tables(self, tables:list):


def plot(self, page):
'''Plot shapes for debug purpose. Different colors are used to display the shapes in detected
semantic types, e.g. yellow for text based shape (stroke, underline and highlight). Due to
overlaps between Stroke and Fill related groups, some shapes are plot twice.
'''Plot shapes for debug purpose.
Different colors are used to display the shapes in detected semantic types, e.g.
yellow for text based shape (stroke, underline and highlight). Due to overlaps
between Stroke and Fill related groups, some shapes are plotted twice.
Args:
page (fitz.Page): pdf page.
'''
Expand Down Expand Up @@ -201,10 +201,10 @@ def _merge_shapes(shapes):
# shapes excluding hyperlink first
normal_shapes = list(filter(
lambda shape: not shape.is_determined, shapes))

# group by color and connectivity (with margin considered)
f = lambda a, b: \
a.color==b.color and a.bbox.intersects(b.get_expand_bbox(constants.TINY_DIST))
def f(a, b):
return a.color==b.color and a.bbox.intersects(b.get_expand_bbox(constants.TINY_DIST))
groups = Collection(normal_shapes).group(f)

merged_shapes = []
Expand All @@ -215,22 +215,21 @@ def _merge_shapes(shapes):
merged_shapes.append(group[0].update_bbox(group.bbox))
else:
merged_shapes.extend(group)

# add hyperlinks back
hyperlinks = filter(lambda shape: shape.equal_to_type(RectType.HYPERLINK), shapes)
merged_shapes.extend(hyperlinks)

return merged_shapes


def _parse_semantic_type(self):
''' Detect shape type based on the position to text blocks.
''' Detect shape type based on the position to text blocks.
.. note::
Stroke shapes are grouped on connectivity to each other, but in some cases,
Stroke shapes are grouped on connectivity to each other, but in some cases,
the gap between borders and underlines/strikes is very small, which leads
to an incorrect table structure. So, it's required to distinguish them in
advance, though we needn't to ensure 100% accuracy. They are finally determined
advance, though we needn't ensure 100% accuracy. They are finally determined
when parsing table structure and text format.
'''
# blocks in page (the original blocks without any further processing)
Expand All @@ -240,4 +239,3 @@ def _parse_semantic_type(self):
# check positions between shapes and text blocks
for shape in self._instances:
shape.parse_semantic_type(blocks)

14 changes: 8 additions & 6 deletions pdf2docx/text/TextSpan.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,8 @@ def __init__(self, raw:dict=None):

# filter empty chars
chars = [Char(c) for c in raw.get('chars', [])] # type: list[Char]
self.chars = [char for char in chars if char.c!='']
# ignore replacement character, see issue#256
self.chars = [char for char in chars if char.c not in ('', '\ufffd')]
self._text = raw.get('text', '') # not an original key from PyMuPDF

# font metrics
Expand Down Expand Up @@ -85,7 +86,7 @@ def text(self):
def text(self, value):
'''Set span text directly in case no chars are stored, e.g. restored from json.'''
self._text = value

def cal_bbox(self):
'''Calculate bbox based on contained instances.'''
bbox = fitz.Rect()
Expand Down Expand Up @@ -306,7 +307,8 @@ def _parse_text_format(self, rect:Shape, horizontal:bool=True):
# highlight: both the rect height and overlap must be large enough
if h_rect >= 0.5*h_span:
# In general, highlight color isn't white
if rect.color != rgb_value((1,1,1)) and self.get_main_bbox(rect, constants.FACTOR_MAJOR):
if rect.color != rgb_value((1,1,1)) and \
self.get_main_bbox(rect, constants.FACTOR_MAJOR):
rect.type = RectType.HIGHLIGHT

# near to bottom of span? yes, underline
Expand Down Expand Up @@ -400,7 +402,7 @@ def _set_text_format(self, docx_run):
# font name
font_name = self.font
docx_run.font.name = font_name
docx_run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) # set font for chinese characters
docx_run._element.rPr.rFonts.set(qn('w:eastAsia'), font_name) # for CJK characters
docx_run.font.color.rgb = RGBColor(*rgb_component(self.color))

# font size
Expand All @@ -419,8 +421,8 @@ def _set_text_format(self, docx_run):
for style in self.style:

t = style['type']
# Built-in method is provided to set highlight in python-docx, but supports only limited colors;
# so, set character shading instead if out of highlight color scope
# Built-in method is provided to set highlight in python-docx, but supports only
# limited colors; so, set character shading instead if out of highlight color scope.
if t==RectType.HIGHLIGHT.value:
docx.set_char_shading(docx_run, style['color'])

Expand Down

0 comments on commit 346b850

Please sign in to comment.