ArtifexSoftware · julian-smith-artifex-com · Jun 7, 2024 · May 3, 2024 · May 3, 2024 · May 3, 2024
diff --git a/.gitignore b/.gitignore
@@ -1,7 +1,6 @@
 # files
 *.pyc
 *.jp*g
-*.docx
 layout.json
 .vscode/
 
@@ -20,4 +19,4 @@ diff.png
 build/
 dist/
 *egg-info/
-pdf2docx*.rst
+pdf2docx*.rst
diff --git a/pdf2docx/common/Block.py b/pdf2docx/common/Block.py
@@ -141,4 +141,4 @@ def make_docx(self, *args, **kwargs):
         Raises:
             NotImplementedError
         """
-        raise NotImplementedError
+        raise NotImplementedError
diff --git a/pdf2docx/common/share.py b/pdf2docx/common/share.py
@@ -255,3 +255,56 @@ def inner(*args, **kwargs):
             return objects
         return inner
     return wrapper
+
+def is_list_item(text, bullets=True, numbers=True):
+    '''Returns `text` if `bullets` is true and `text` is a bullet character, or
+    `numbers` is true and `text` is not empty and consists entirely of digits
+    0-9. Otherwise (including empty or None `text`) returns None.
+
+    If `bullets` is True we use an internal list of bullet characters;
+    otherwise it should be a list of integer Unicode values. With the internal
+    list, the private-use bullet U+F0B7 is returned as U+2022 (BULLET).
+    '''
+    if not text:
+        return None
+    if bullets is True:
+        bullets2 = (
+                # From https://en.wikipedia.org/wiki/Bullet_(typography).
+                0x2022, # BULLET (&bull;, &bullet;)
+                0x2023, # TRIANGULAR BULLET
+                0x2043, # HYPHEN BULLET (&hybull;)
+                0x204c, # BLACK LEFTWARDS BULLET
+                0x204d, # BLACK RIGHTWARDS BULLET
+                0x2219, # BULLET OPERATOR for use in mathematical notation primarily as a dot product instead of interpunct.
+                0x25c9, # FISHEYE used in Japan as a bullet, and called tainome.
+                0x25cb, # WHITE CIRCLE (&cir;)
+                0x25cf, # BLACK CIRCLE; bullet, black small circle.
+                0x25d8, # INVERSE BULLET
+                0x25e6, # WHITE BULLET
+                0x2619, # REVERSED ROTATED FLORAL HEART BULLET; see Fleuron (typography)
+                0x2765, # ROTATED HEAVY BLACK HEART BULLET
+                0x2767, # ROTATED FLORAL HEART BULLET; see Fleuron (typography)
+                0x29be, # CIRCLED WHITE BULLET (&olcir;)
+                0x29bf, # CIRCLED BULLET (&ofcir;)
+
+                # Additional.
+                0x25aa, # Black small square, square bullet.
+                0xf0b7, # "Private Use Character" but seems to be used by libreoffice for bullets.
+                )
+    else:
+        bullets2 = bullets
+    if bullets and len(text) == 1:
+        cc = ord(text[0])
+        if cc in bullets2:
+            if bullets is True and cc == 0xf0b7:
+                return chr(0x2022)
+            return text
+    if numbers:
+        for c in text:
+            if isinstance(c, list):
+                c = c[0]
+            if c not in '0123456789':
+                break
+        else:
+            return text
+    return None
diff --git a/pdf2docx/converter.py b/pdf2docx/converter.py
@@ -106,7 +106,9 @@ def default_settings(self):
             'extract_stream_table'           : False,  # don't consider stream table when extracting tables
             'parse_lattice_table'            : True,   # whether parse lattice table or not; may destroy the layout if set False
             'parse_stream_table'             : True,   # whether parse stream table or not; may destroy the layout if set False
-            'delete_end_line_hyphen'         : False   # delete hyphen at the end of a line
+            'delete_end_line_hyphen'         : False,  # delete hyphen at the end of a line
+            'raw_exceptions'                 : False,  # Don't swallow exceptions
+            'list_not_table'                 : True,   # Avoid treating bullet list as table.
         }
 
     # -----------------------------------------------------------------------
@@ -182,6 +184,8 @@ def parse_pages(self, **kwargs):
             try:
                 page.parse(**kwargs)
             except Exception as e:
+                if kwargs['raw_exceptions']:
+                    raise
                 if not kwargs['debug'] and kwargs['ignore_page_error']:
                     logging.error('Ignore page %d due to parsing page error: %s', pid, e)
                 else:
@@ -224,6 +228,8 @@ def make_docx(self, filename_or_stream=None, **kwargs):
             try:
                 page.make_docx(docx_file)
             except Exception as e:
+                if kwargs['raw_exceptions']:
+                    raise
                 if not kwargs['debug'] and kwargs['ignore_page_error']:
                     logging.error('Ignore page %d due to making page error: %s', pid, e)
                 else:

diff --git a/pdf2docx/layout/Blocks.py b/pdf2docx/layout/Blocks.py
@@ -8,7 +8,7 @@
 from docx.shared import Pt
 from ..common import constants
 from ..common.Collection import ElementCollection
-from ..common.share import (BlockType, lower_round, rgb_value)
+from ..common.share import (BlockType, lower_round, rgb_value, is_list_item)
 from ..common.Block import Block
 from ..common.docx import (reset_paragraph_format, delete_paragraph)
 from ..text.TextBlock import TextBlock
@@ -176,7 +176,7 @@ def assign_to_tables(self, tables:list):
         self.reset(blocks)
 
 
-    def collect_stream_lines(self, potential_shadings:list, line_separate_threshold:float):
+    def collect_stream_lines(self, potential_shadings:list, line_separate_threshold:float, **kwargs):
         '''Collect elements in Line level (line or table bbox), which may contained in a stream table region.
 
         Table may exist on the following conditions:
@@ -230,11 +230,14 @@ def close_table():
             bbox = row.bbox
 
             # flow layout or not?
-            if not row.is_flow_layout(line_separate_threshold, cell_layout=cell_layout): 
-                table_lines.extend([sub_line(block) for block in row])
-
-            else:
+            if row.is_flow_layout(line_separate_threshold, cell_layout=cell_layout):
                 close_table()
+            elif kwargs.get('list_not_table') and is_list_item(row[0].text):
+                 # Don't interpret list-style bullet characters/numbers as
+                 # indicating a table.
+                 close_table()
+            else:
+                table_lines.extend([sub_line(block) for block in row])
 
             # contained in shading or not?
             for block in row:

diff --git a/pdf2docx/main.py b/pdf2docx/main.py
@@ -41,6 +41,8 @@ def convert(pdf_file:str,
             cv.convert(docx_file, start, end, pages, **kwargs)
         except Exception as e:
             logging.error(e)
+            if kwargs.get('raw_exceptions'):  # option may be unset by the caller
+                raise
         finally:
             cv.close()
 

diff --git a/pdf2docx/page/RawPageFitz.py b/pdf2docx/page/RawPageFitz.py
@@ -4,6 +4,7 @@
 A wrapper of PyMuPDF Page as page engine.
 '''
 
+import fitz
 import logging
 from .RawPage import RawPage
 from ..image.ImagesExtractor import ImagesExtractor
@@ -22,6 +23,7 @@ def extract_raw_dict(self, **settings):
         if not self.page_engine: return raw_dict
 
         # actual page size
+        # `self.page_engine` is the `fitz.Page`.
         *_, w, h = self.page_engine.rect # always reflecting page rotation
         raw_dict.update({ 'width' : w, 'height': h })
         self.width, self.height = w, h
@@ -59,7 +61,15 @@ def _preprocess_text(self, **settings):
         if ocr==1: raise SystemExit("OCR feature is planned but not implemented yet.")
 
         # all text blocks no matter hidden or not
-        raw = self.page_engine.get_text('rawdict', flags=64)
+        sort = settings.get('sort')
+        raw = self.page_engine.get_text(
+                'rawdict',
+                flags=0
+                    | fitz.TEXT_MEDIABOX_CLIP
+                    | fitz.TEXT_CID_FOR_UNKNOWN_UNICODE
+                    ,
+                sort=sort,
+                )
         text_blocks = raw.get('blocks', [])
 
         # potential UnicodeDecodeError issue when trying to filter hidden text:

diff --git a/pdf2docx/table/Cell.py b/pdf2docx/table/Cell.py
@@ -25,8 +25,11 @@ def text(self):
         '''Text contained in this cell.'''
         if not self: return None
         # NOTE: sub-table may exists in
-        return '\n'.join([block.text if block.is_text_block else '<NEST TABLE>'
-                                 for block in self.blocks])
+        # FIXME: the previous code tested `block.is_text_block`, but some
+        # blocks lack that attribute. Ensure it is always present so that
+        # this `hasattr()` check can be removed.
+        return '\n'.join([block.text if hasattr(block, 'text') else '<NEST TABLE>'
+                                for block in self.blocks])
 
 
     @property
@@ -75,7 +78,12 @@ def make_docx(self, table, indexes):
         docx_cell = table.cell(i, j)
         if n_row*n_col!=1:
             _cell = table.cell(i+n_row-1, j+n_col-1)
-            docx_cell.merge(_cell)
+            try:
+                docx_cell.merge(_cell)
+            except Exception as e:
+                def show(c):
+                    return f'[_tc.top={c._tc.top} _tc.bottom={c._tc.bottom}]'
+                raise Exception(f'Failed to merge docx_cell={show(docx_cell)} _cell={show(_cell)}. {i=} {j=} {n_row=} {n_col=}') from e
 
         # ---------------------
         # cell width (cell height is set by row height)

diff --git a/pdf2docx/table/TablesConstructor.py b/pdf2docx/table/TablesConstructor.py
@@ -379,4 +379,4 @@ def _inner_borders(lines:Lines, outer_borders:tuple):
                 borders_ = TablesConstructor._inner_borders(rows_lines[j], (top, bottom, left, right))
                 borders.extend(borders_)
 
-        return borders
+        return borders
diff --git a/pdf2docx/text/Lines.py b/pdf2docx/text/Lines.py
@@ -11,6 +11,7 @@
 from ..common.Collection import ElementCollection
 from ..common.share import TextAlignment
 from ..common import constants
+from ..common.share import is_list_item
 
 
 class Lines(ElementCollection):
@@ -33,6 +34,11 @@ def restore(self, raws:list):
         return self
 
 
+    def text(self):
+        '''Debugging helper: newline-joined `text` of each contained line.'''
+        return '\n'.join(item.text for item in self)
+
+
     @property
     def image_spans(self):
         '''Get all ImageSpan instances.'''
@@ -72,8 +78,12 @@ def split_vertically_by_text(self, line_break_free_space_ratio:float, new_paragr
             end_of_sen = row[-1].text.strip().endswith(punc)
             w =  row[-1].bbox[2]-row[0].bbox[0]
 
+            if 0 and is_list_item(row[0].text[0]):
+                # Treat bullet list items as separate paragraphs.
+                start_of_para = True
+
             # end of a sentense and free space at the end -> end of paragraph
-            if end_of_sen and w/W <= 1.0-line_break_free_space_ratio:
+            elif end_of_sen and w/W <= 1.0-line_break_free_space_ratio:
                 end_of_para = True
 
             # start of sentence and free space at the start -> start of paragraph

diff --git a/pdf2docx/text/TextBlock.py b/pdf2docx/text/TextBlock.py
@@ -468,4 +468,4 @@ def external_alignment():
         if alignment==TextAlignment.LEFT or alignment==TextAlignment.JUSTIFY:
             self.first_line_space = rows[0][0].bbox[idx0] - rows[1][0].bbox[idx0]
 
-        return alignment
+        return alignment
diff --git a/setup.py b/setup.py
@@ -28,19 +28,11 @@ def load_long_description(fname):
 
 def load_requirements(fname):
     '''Load requirements.'''
-    try:
-        # pip >= 10.0
-        from pip._internal.req import parse_requirements
-    except ImportError:
-        # pip < 10.0
-        from pip.req import parse_requirements
-
-    reqs = parse_requirements(fname, session=False)
-    try:
-        requirements = [str(ir.requirement) for ir in reqs]
-    except AttributeError:
-        requirements = [str(ir.req) for ir in reqs]
-    return requirements
+    # Return one clean requirement specifier per entry: strip surrounding
+    # whitespace/newlines and skip blank lines and `#` comment lines.
+    with open(fname, encoding='utf-8') as f:
+        stripped = (line.strip() for line in f)
+        return [line for line in stripped if line and not line.startswith('#')]
 
 
 setup(

diff --git a/test/samples/demo-whisper_2_3.pdf b/test/samples/demo-whisper_2_3.pdf
diff --git a/test/samples/pdf2docx-lists-bullets3.docx b/test/samples/pdf2docx-lists-bullets3.docx