Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Jules - various #295

Merged
merged 8 commits into from
Jun 7, 2024
3 changes: 1 addition & 2 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# files
*.pyc
*.jp*g
*.docx
layout.json
.vscode/

Expand All @@ -20,4 +19,4 @@ diff.png
build/
dist/
*egg-info/
pdf2docx*.rst
pdf2docx*.rst
2 changes: 1 addition & 1 deletion pdf2docx/common/Block.py
Original file line number Diff line number Diff line change
Expand Up @@ -141,4 +141,4 @@ def make_docx(self, *args, **kwargs):
Raises:
NotImplementedError
"""
raise NotImplementedError
raise NotImplementedError
53 changes: 53 additions & 0 deletions pdf2docx/common/share.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,3 +255,56 @@ def inner(*args, **kwargs):
return objects
return inner
return wrapper

def is_list_item(text, bullets=True, numbers=True):
    '''Check whether `text` is a list-item marker (a bullet or a number).

    Args:
        text (str): Candidate marker text. Empty or ``None`` never matches.
        bullets (bool or collection of int): If ``True``, match against an
            internal set of bullet characters. If falsy, bullets are not
            considered. Otherwise it is taken as a collection of integer
            Unicode code points to match against.
        numbers (bool): If true, a non-empty `text` consisting entirely of
            the digits 0-9 is also accepted.

    Returns:
        str or None: `text` if it is a single bullet character (with
        `bullets` enabled) or is made only of digits 0-9 (with `numbers`
        enabled); otherwise ``None``. When the internal bullet set is used,
        the private-use bullet U+F0B7 is returned as U+2022 (BULLET).
    '''
    # Empty or missing text can never be a list marker; bailing out here also
    # keeps the digits check below from accepting an empty string.
    if not text:
        return None

    # A bullet marker is exactly one character.
    if bullets and isinstance(text, str) and len(text) == 1:
        if bullets is True:
            bullet_codes = (
                # From https://en.wikipedia.org/wiki/Bullet_(typography).
                0x2022,  # BULLET
                0x2023,  # TRIANGULAR BULLET
                0x2043,  # HYPHEN BULLET
                0x204c,  # BLACK LEFTWARDS BULLET
                0x204d,  # BLACK RIGHTWARDS BULLET
                0x2219,  # BULLET OPERATOR (mathematical dot product)
                0x25c9,  # FISHEYE, used in Japan as a bullet (tainome)
                0x25cb,  # WHITE CIRCLE
                0x25cf,  # BLACK CIRCLE / small black circle bullet
                0x25d8,  # INVERSE BULLET
                0x25e6,  # WHITE BULLET
                0x2619,  # REVERSED ROTATED FLORAL HEART BULLET
                0x2765,  # ROTATED HEAVY BLACK HEART BULLET
                0x2767,  # ROTATED FLORAL HEART BULLET
                0x29be,  # CIRCLED WHITE BULLET
                0x29bf,  # CIRCLED BULLET

                # Additional.
                0x25aa,  # Black small square, square bullet.
                0xf0b7,  # Private-use character; seems to be used by
                         # libreoffice for bullets.
            )
        else:
            bullet_codes = bullets

        code = ord(text)
        if code in bullet_codes:
            # Normalise the private-use bullet to a real BULLET so that the
            # result is displayable; only done for the internal set.
            if bullets is True and code == 0xf0b7:
                return chr(0x2022)
            return text

    if numbers:
        # Explicit 0-9 rather than str.isdigit(), which also accepts other
        # Unicode digit characters.
        digits = '0123456789'

        def as_char(item):
            # NOTE(review): items wrapped in a list are unwrapped to their
            # first element, as the previous implementation did - presumably
            # for non-string callers; confirm whether this is still needed.
            return item[0] if isinstance(item, list) else item

        if all(as_char(item) in digits for item in text):
            return text

    return None
8 changes: 7 additions & 1 deletion pdf2docx/converter.py
Original file line number Diff line number Diff line change
Expand Up @@ -106,7 +106,9 @@ def default_settings(self):
'extract_stream_table' : False, # don't consider stream table when extracting tables
'parse_lattice_table' : True, # whether parse lattice table or not; may destroy the layout if set False
'parse_stream_table' : True, # whether parse stream table or not; may destroy the layout if set False
'delete_end_line_hyphen' : False # delete hyphen at the end of a line
'delete_end_line_hyphen' : False, # delete hyphen at the end of a line
'raw_exceptions' : False, # Don't swallow exceptions
'list_not_table' : True, # Avoid treating bullet list as table.
}

# -----------------------------------------------------------------------
Expand Down Expand Up @@ -182,6 +184,8 @@ def parse_pages(self, **kwargs):
try:
page.parse(**kwargs)
except Exception as e:
if kwargs['raw_exceptions']:
raise
if not kwargs['debug'] and kwargs['ignore_page_error']:
logging.error('Ignore page %d due to parsing page error: %s', pid, e)
else:
Expand Down Expand Up @@ -224,6 +228,8 @@ def make_docx(self, filename_or_stream=None, **kwargs):
try:
page.make_docx(docx_file)
except Exception as e:
if kwargs['raw_exceptions']:
raise
if not kwargs['debug'] and kwargs['ignore_page_error']:
logging.error('Ignore page %d due to making page error: %s', pid, e)
else:
Expand Down
15 changes: 9 additions & 6 deletions pdf2docx/layout/Blocks.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
from docx.shared import Pt
from ..common import constants
from ..common.Collection import ElementCollection
from ..common.share import (BlockType, lower_round, rgb_value)
from ..common.share import (BlockType, lower_round, rgb_value, is_list_item)
from ..common.Block import Block
from ..common.docx import (reset_paragraph_format, delete_paragraph)
from ..text.TextBlock import TextBlock
Expand Down Expand Up @@ -176,7 +176,7 @@ def assign_to_tables(self, tables:list):
self.reset(blocks)


def collect_stream_lines(self, potential_shadings:list, line_separate_threshold:float):
def collect_stream_lines(self, potential_shadings:list, line_separate_threshold:float, **kwargs):
'''Collect elements in Line level (line or table bbox), which may contained in a stream table region.

Table may exist on the following conditions:
Expand Down Expand Up @@ -230,11 +230,14 @@ def close_table():
bbox = row.bbox

# flow layout or not?
if not row.is_flow_layout(line_separate_threshold, cell_layout=cell_layout):
table_lines.extend([sub_line(block) for block in row])

else:
if row.is_flow_layout(line_separate_threshold, cell_layout=cell_layout):
close_table()
elif kwargs.get('list_not_table') and is_list_item(row[0].text):
# Don't interpret list-style bullet characters/numbers as
# indicating a table.
close_table()
else:
table_lines.extend([sub_line(block) for block in row])

# contained in shading or not?
for block in row:
Expand Down
2 changes: 2 additions & 0 deletions pdf2docx/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ def convert(pdf_file:str,
cv.convert(docx_file, start, end, pages, **kwargs)
except Exception as e:
logging.error(e)
if kwargs['raw_exceptions']:
raise
finally:
cv.close()

Expand Down
12 changes: 11 additions & 1 deletion pdf2docx/page/RawPageFitz.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
A wrapper of PyMuPDF Page as page engine.
'''

import fitz
import logging
from .RawPage import RawPage
from ..image.ImagesExtractor import ImagesExtractor
Expand All @@ -22,6 +23,7 @@ def extract_raw_dict(self, **settings):
if not self.page_engine: return raw_dict

# actual page size
# `self.page_engine` is the `fitz.Page`.
*_, w, h = self.page_engine.rect # always reflecting page rotation
raw_dict.update({ 'width' : w, 'height': h })
self.width, self.height = w, h
Expand Down Expand Up @@ -59,7 +61,15 @@ def _preprocess_text(self, **settings):
if ocr==1: raise SystemExit("OCR feature is planned but not implemented yet.")

# all text blocks no matter hidden or not
raw = self.page_engine.get_text('rawdict', flags=64)
sort = settings.get('sort')
raw = self.page_engine.get_text(
'rawdict',
flags=0
| fitz.TEXT_MEDIABOX_CLIP
| fitz.TEXT_CID_FOR_UNKNOWN_UNICODE
,
sort=sort,
)
text_blocks = raw.get('blocks', [])

# potential UnicodeDecodeError issue when trying to filter hidden text:
Expand Down
14 changes: 11 additions & 3 deletions pdf2docx/table/Cell.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,8 +25,11 @@ def text(self):
'''Text contained in this cell.'''
if not self: return None
# NOTE: a sub-table may exist in this cell
return '\n'.join([block.text if block.is_text_block else '<NEST TABLE>'
for block in self.blocks])
# fixme: prev code did `if block.is_text_block`, but sometimes
# there is no `is_text_block` member; would be good to ensure
# this member is always present and avoid use of `hasattr()`.
return '\n'.join([block.text if hasattr(block, 'text') else '<NEST TABLE>'
for block in self.blocks])


@property
Expand Down Expand Up @@ -75,7 +78,12 @@ def make_docx(self, table, indexes):
docx_cell = table.cell(i, j)
if n_row*n_col!=1:
_cell = table.cell(i+n_row-1, j+n_col-1)
docx_cell.merge(_cell)
try:
docx_cell.merge(_cell)
except Exception as e:
def show(c):
return f'[_tc.top={c._tc.top} _tc.bottom={c._tc.bottom}]'
raise Exception(f'Failed to merge docx_cell={show(docx_cell)} _cell={show(_cell)}. {i=} {j=} {n_row=} {n_col=}') from e

# ---------------------
# cell width (cell height is set by row height)
Expand Down
2 changes: 1 addition & 1 deletion pdf2docx/table/TablesConstructor.py
Original file line number Diff line number Diff line change
Expand Up @@ -379,4 +379,4 @@ def _inner_borders(lines:Lines, outer_borders:tuple):
borders_ = TablesConstructor._inner_borders(rows_lines[j], (top, bottom, left, right))
borders.extend(borders_)

return borders
return borders
12 changes: 11 additions & 1 deletion pdf2docx/text/Lines.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
from ..common.Collection import ElementCollection
from ..common.share import TextAlignment
from ..common import constants
from ..common.share import is_list_item


class Lines(ElementCollection):
Expand All @@ -33,6 +34,11 @@ def restore(self, raws:list):
return self


def text(self):
    '''Newline-joined text of all contained lines; intended for debugging.'''
    line_texts = (item.text for item in self)
    return '\n'.join(line_texts)


@property
def image_spans(self):
'''Get all ImageSpan instances.'''
Expand Down Expand Up @@ -72,8 +78,12 @@ def split_vertically_by_text(self, line_break_free_space_ratio:float, new_paragr
end_of_sen = row[-1].text.strip().endswith(punc)
w = row[-1].bbox[2]-row[0].bbox[0]

if 0 and is_list_item(row[0].text[0]):
# Treat bullet list items as separate paragraphs.
start_of_para = True

# end of a sentence and free space at the end -> end of paragraph
if end_of_sen and w/W <= 1.0-line_break_free_space_ratio:
elif end_of_sen and w/W <= 1.0-line_break_free_space_ratio:
end_of_para = True

# start of sentence and free space at the start -> start of paragraph
Expand Down
2 changes: 1 addition & 1 deletion pdf2docx/text/TextBlock.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,4 +468,4 @@ def external_alignment():
if alignment==TextAlignment.LEFT or alignment==TextAlignment.JUSTIFY:
self.first_line_space = rows[0][0].bbox[idx0] - rows[1][0].bbox[idx0]

return alignment
return alignment
18 changes: 5 additions & 13 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,19 +28,11 @@ def load_long_description(fname):

def load_requirements(fname):
    '''Load requirements from a requirements file.

    Args:
        fname (str): Path to a text file with one requirement per line.

    Returns:
        list: Requirement strings in file order, with surrounding whitespace
        (including the trailing newline) removed. Blank lines and lines
        starting with ``#`` are skipped.
    '''
    with open(fname, encoding='utf-8') as f:
        stripped = (line.strip() for line in f)
        # Blank lines and comments are not requirement specifiers, so they
        # are kept out of the returned list.
        return [entry for entry in stripped
                if entry and not entry.startswith('#')]


setup(
Expand Down
Binary file added test/samples/demo-whisper_2_3.pdf
Binary file not shown.
Binary file added test/samples/pdf2docx-lists-bullets3.docx
Binary file not shown.
Loading
Loading