diff --git a/.gitignore b/.gitignore index 28d40da..18c25db 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,6 @@ # files *.pyc *.jp*g -*.docx layout.json .vscode/ @@ -20,4 +19,4 @@ diff.png build/ dist/ *egg-info/ -pdf2docx*.rst \ No newline at end of file +pdf2docx*.rst diff --git a/pdf2docx/common/Block.py b/pdf2docx/common/Block.py index 3b46f0e..bdb48c6 100644 --- a/pdf2docx/common/Block.py +++ b/pdf2docx/common/Block.py @@ -141,4 +141,4 @@ def make_docx(self, *args, **kwargs): Raises: NotImplementedError """ - raise NotImplementedError \ No newline at end of file + raise NotImplementedError diff --git a/pdf2docx/common/share.py b/pdf2docx/common/share.py index e67b6e6..44b7d40 100644 --- a/pdf2docx/common/share.py +++ b/pdf2docx/common/share.py @@ -255,3 +255,56 @@ def inner(*args, **kwargs): return objects return inner return wrapper + +def is_list_item(text, bullets=True, numbers=True): + '''Returns `text` if `bullets` is true and `text` is a bullet character, or + `numbers` is true and `text` is not empty and consists entirely of digits + 0-9. Otherwise returns None. + + If `bullets` is True we use an internal list of bullet characters; + otherwise it should be a list of integer Unicode values. + ''' + return False + if bullets is True: + bullets2 = ( + # From https://en.wikipedia.org/wiki/Bullet_(typography). + 0x2022, # BULLET (•, •) + 0x2023, # TRIANGULAR BULLET + 0x2043, # HYPHEN BULLET (⁃) + 0x204c, # BLACK LEFTWARDS BULLET + 0x204d, # BLACK RIGHTWARDS BULLET + 0x2219, # BULLET OPERATOR for use in mathematical notation primarily as a dot product instead of interpunct. + 0x25c9, # FISHEYE used in Japan as a bullet, and called tainome. + 0x25cb, # WHITE CIRCLE (○) + 0x25cf, # BLACK CIRCLE + 0x25cf, # Bullet, black small circle. + 0x25d8, # INVERSE BULLET + 0x25e6, # WHITE BULLET + 0x2619, # REVERSED ROTATED FLORAL HEART BULLET; see Fleuron (typography) + 0x2765, # ROTATED HEAVY BLACK HEART BULLET + 0x2767, # ROTATED FLORAL HEART BULLET; see Fleuron (typography) + 0x29be, # CIRCLED WHITE BULLET (⦾) + 0x29bf, # CIRCLED BULLET (⦿) + + # Additional. + 0x25aa, # Black small square, square bullet. + 0xf0b7, # "Private Use Character" but seems to be used by libreoffice for bullets. + ) + else: + bullets2 = bullets + if bullets: + if len(text)==1: + c = text[0] + cc = ord(c) + if cc in bullets2: + if bullets is True and cc == 0xf0b7: + return chr(0x2022) + return text + if numbers: + for c in text: + if isinstance(c, list): + c = c[0] + if c not in '0123456789': + break + else: + return text diff --git a/pdf2docx/converter.py b/pdf2docx/converter.py index 8da7e8a..337d9b3 100644 --- a/pdf2docx/converter.py +++ b/pdf2docx/converter.py @@ -106,7 +106,9 @@ def default_settings(self): 'extract_stream_table' : False, # don't consider stream table when extracting tables 'parse_lattice_table' : True, # whether parse lattice table or not; may destroy the layout if set False 'parse_stream_table' : True, # whether parse stream table or not; may destroy the layout if set False - 'delete_end_line_hyphen' : False # delete hyphen at the end of a line + 'delete_end_line_hyphen' : False, # delete hyphen at the end of a line + 'raw_exceptions' : False, # Don't swallow exceptions + 'list_not_table' : True, # Avoid treating bullet list as table. } # ----------------------------------------------------------------------- @@ -182,6 +184,8 @@ def parse_pages(self, **kwargs): try: page.parse(**kwargs) except Exception as e: + if kwargs['raw_exceptions']: + raise if not kwargs['debug'] and kwargs['ignore_page_error']: logging.error('Ignore page %d due to parsing page error: %s', pid, e) else: @@ -224,6 +228,8 @@ def make_docx(self, filename_or_stream=None, **kwargs): try: page.make_docx(docx_file) except Exception as e: + if kwargs['raw_exceptions']: + raise if not kwargs['debug'] and kwargs['ignore_page_error']: logging.error('Ignore page %d due to making page error: %s', pid, e) else: diff --git a/pdf2docx/layout/Blocks.py b/pdf2docx/layout/Blocks.py index a3ddff5..7f5e391 100644 --- a/pdf2docx/layout/Blocks.py +++ b/pdf2docx/layout/Blocks.py @@ -8,7 +8,7 @@ from docx.shared import Pt from ..common import constants from ..common.Collection import ElementCollection -from ..common.share import (BlockType, lower_round, rgb_value) +from ..common.share import (BlockType, lower_round, rgb_value, is_list_item) from ..common.Block import Block from ..common.docx import (reset_paragraph_format, delete_paragraph) from ..text.TextBlock import TextBlock @@ -176,7 +176,7 @@ def assign_to_tables(self, tables:list): self.reset(blocks) - def collect_stream_lines(self, potential_shadings:list, line_separate_threshold:float): + def collect_stream_lines(self, potential_shadings:list, line_separate_threshold:float, **kwargs): '''Collect elements in Line level (line or table bbox), which may contained in a stream table region. Table may exist on the following conditions: @@ -230,11 +230,14 @@ def close_table(): bbox = row.bbox # flow layout or not? - if not row.is_flow_layout(line_separate_threshold, cell_layout=cell_layout): - table_lines.extend([sub_line(block) for block in row]) - - else: + if row.is_flow_layout(line_separate_threshold, cell_layout=cell_layout): close_table() + elif kwargs.get('list_not_table') and is_list_item(row[0].text): + # Don't interpret list-style bullet characters/numbers as + # indicating a table. + close_table() + else: + table_lines.extend([sub_line(block) for block in row]) # contained in shading or not? for block in row: diff --git a/pdf2docx/main.py b/pdf2docx/main.py index e4b187b..717f6cf 100644 --- a/pdf2docx/main.py +++ b/pdf2docx/main.py @@ -41,6 +41,8 @@ def convert(pdf_file:str, cv.convert(docx_file, start, end, pages, **kwargs) except Exception as e: logging.error(e) + if kwargs['raw_exceptions']: + raise finally: cv.close() diff --git a/pdf2docx/page/RawPageFitz.py b/pdf2docx/page/RawPageFitz.py index 52dcaa9..4158e9a 100644 --- a/pdf2docx/page/RawPageFitz.py +++ b/pdf2docx/page/RawPageFitz.py @@ -4,6 +4,7 @@ A wrapper of PyMuPDF Page as page engine. ''' +import fitz import logging from .RawPage import RawPage from ..image.ImagesExtractor import ImagesExtractor @@ -22,6 +23,7 @@ def extract_raw_dict(self, **settings): if not self.page_engine: return raw_dict # actual page size + # `self.page_engine` is the `fitz.Page`. *_, w, h = self.page_engine.rect # always reflecting page rotation raw_dict.update({ 'width' : w, 'height': h }) self.width, self.height = w, h @@ -59,7 +61,15 @@ def _preprocess_text(self, **settings): if ocr==1: raise SystemExit("OCR feature is planned but not implemented yet.") # all text blocks no matter hidden or not - raw = self.page_engine.get_text('rawdict', flags=64) + sort = settings.get('sort') + raw = self.page_engine.get_text( + 'rawdict', + flags=0 + | fitz.TEXT_MEDIABOX_CLIP + | fitz.TEXT_CID_FOR_UNKNOWN_UNICODE + , + sort=sort, + ) text_blocks = raw.get('blocks', []) # potential UnicodeDecodeError issue when trying to filter hidden text: diff --git a/pdf2docx/table/Cell.py b/pdf2docx/table/Cell.py index e2d3aff..cff0124 100644 --- a/pdf2docx/table/Cell.py +++ b/pdf2docx/table/Cell.py @@ -25,8 +25,11 @@ def text(self): '''Text contained in this cell.''' if not self: return None # NOTE: sub-table may exists in - return '\n'.join([block.text if block.is_text_block else '' - for block in self.blocks]) + # fixme: prev code did `if block.is_text_block`, but sometimes + # there is no `is_text_block` member; would be good to ensure + # this member is always present and avoid use of `hasattr()`. + return '\n'.join([block.text if hasattr(block, 'text') else '' + for block in self.blocks]) @property @@ -75,7 +78,12 @@ def make_docx(self, table, indexes): docx_cell = table.cell(i, j) if n_row*n_col!=1: _cell = table.cell(i+n_row-1, j+n_col-1) - docx_cell.merge(_cell) + try: + docx_cell.merge(_cell) + except Exception as e: + def show(c): + return f'[_tc.top={c._tc.top} _tc.bottom={c._tc.bottom}]' + raise Exception(f'Failed to merge docx_cell={show(docx_cell)} _cell={show(_cell)}. {i=} {j=} {n_row=} {n_col=}') from e # --------------------- # cell width (cell height is set by row height) diff --git a/pdf2docx/table/TablesConstructor.py b/pdf2docx/table/TablesConstructor.py index 546b7e5..3faffdd 100644 --- a/pdf2docx/table/TablesConstructor.py +++ b/pdf2docx/table/TablesConstructor.py @@ -379,4 +379,4 @@ def _inner_borders(lines:Lines, outer_borders:tuple): borders_ = TablesConstructor._inner_borders(rows_lines[j], (top, bottom, left, right)) borders.extend(borders_) - return borders \ No newline at end of file + return borders diff --git a/pdf2docx/text/Lines.py b/pdf2docx/text/Lines.py index c9223e6..20b5ae9 100644 --- a/pdf2docx/text/Lines.py +++ b/pdf2docx/text/Lines.py @@ -11,6 +11,7 @@ from ..common.Collection import ElementCollection from ..common.share import TextAlignment from ..common import constants +from ..common.share import is_list_item class Lines(ElementCollection): @@ -33,6 +34,11 @@ def restore(self, raws:list): return self + def text(self): + '''For debugging.''' + return '\n'.join([line.text for line in self]) + + @property def image_spans(self): '''Get all ImageSpan instances.''' @@ -72,8 +78,12 @@ def split_vertically_by_text(self, line_break_free_space_ratio:float, new_paragr end_of_sen = row[-1].text.strip().endswith(punc) w = row[-1].bbox[2]-row[0].bbox[0] + if 0 and is_list_item(row[0].text[0]): + # Treat bullet list items as separate paragraphs. + start_of_para = True + # end of a sentense and free space at the end -> end of paragraph - if end_of_sen and w/W <= 1.0-line_break_free_space_ratio: + elif end_of_sen and w/W <= 1.0-line_break_free_space_ratio: end_of_para = True # start of sentence and free space at the start -> start of paragraph diff --git a/pdf2docx/text/TextBlock.py b/pdf2docx/text/TextBlock.py index 46cd531..af111e1 100644 --- a/pdf2docx/text/TextBlock.py +++ b/pdf2docx/text/TextBlock.py @@ -468,4 +468,4 @@ def external_alignment(): if alignment==TextAlignment.LEFT or alignment==TextAlignment.JUSTIFY: self.first_line_space = rows[0][0].bbox[idx0] - rows[1][0].bbox[idx0] - return alignment \ No newline at end of file + return alignment diff --git a/setup.py b/setup.py index c4c05f9..5de3bce 100644 --- a/setup.py +++ b/setup.py @@ -28,19 +28,11 @@ def load_long_description(fname): def load_requirements(fname): '''Load requirements.''' - try: - # pip >= 10.0 - from pip._internal.req import parse_requirements - except ImportError: - # pip < 10.0 - from pip.req import parse_requirements - - reqs = parse_requirements(fname, session=False) - try: - requirements = [str(ir.requirement) for ir in reqs] - except AttributeError: - requirements = [str(ir.req) for ir in reqs] - return requirements + ret = list() + with open(fname) as f: + for line in f: + ret.append(line) + return ret setup( diff --git a/test/samples/demo-whisper_2_3.pdf b/test/samples/demo-whisper_2_3.pdf new file mode 100644 index 0000000..0e6d1df Binary files /dev/null and b/test/samples/demo-whisper_2_3.pdf differ diff --git a/test/samples/pdf2docx-lists-bullets3.docx b/test/samples/pdf2docx-lists-bullets3.docx new file mode 100644 index 0000000..76466f0 Binary files /dev/null and b/test/samples/pdf2docx-lists-bullets3.docx differ diff --git a/test/test.py b/test/test.py index 2a174d3..bc7a7ff 100644 --- a/test/test.py +++ b/test/test.py @@ -1,36 +1,48 @@ ''' The test framework: pytest, pytest-cov. -To test the pdf conversion and converting quality, the idea is to convert generated docx to pdf, -then check the image similarity between source pdf page and converted pdf page. Considering the -converting quality from docx to pdf, a Windows-based command line tool `OfficeToPDF` is used, in -addition, an installation of Microsoft Word is required. - -To leverage the benefit of Github Action, the testing process is divided into three parts: - 1. Convert sample pdf to docx with `pdf2docx`. - 2. Convert generated docx to pdf for comparing. - 3. Convert page to image and compare similarity with `python-opencv`. - -Test scripts on Part One and Three are applied with two test class respectively in this module, -so they could be run separately with pytest command, e.g. - -- pytest -vs --no-header test.py::TestConversion for Part One -- pytest -vs --no-header test.py::TestQuality for Part Three - -Links on MS Word to PDF conversion: - - https://github.com/cognidox/OfficeToPDF/releases - - https://github.com/AndyCyberSec/pylovepdf - - https://www.e-iceblue.com/Tutorials/Java/Spire.Doc-for-Java/Program-Guide/Conversion/Convert-Word-to-PDF-in-Java.html +We have a set of PDF files as test inputs. + +For a test file foo.pdf, we convert it into a file foo.pdf.docx using pdf2docx. + +To check whether this has worked as expected, we use Python package docx2pdf +(which uses Word) on Windows, or Libreoffice command line on other platforms, +to convert foo.pdf.docx into foo.pdf.docx.pdf. + +We then compare foo.pdf.docx.pdf with the original foo.pdf file using opencv, +generating a similarity value. + +So on Windows we require Word is installed, and on other platforms we require +that Libreoffice is installed. + +If docx2pdf fails with `Object reference not set to an instance of an +object. Did not convert`, it might be necessary to follow the instructions at: + + https://stackoverflow.com/questions/24860351/object-reference-not-set-to-an-instance-of-an-object-did-not-convert + + In a Cmd window run: + DCOMCNFG + Then: + Console Root > Component Services > Computers > My Computer > DCOM Config > Microsoft Word 97 - 2003 Document + Then: Right click then properties then Identity tab and set a username and + password. ''' +import glob import os import io import numpy as np import cv2 as cv import fitz from pdf2docx import Converter, parse +import subprocess +import time +import shutil +import platform +import pytest +root_path = os.path.abspath(f'{__file__}/../..') script_path = os.path.abspath(__file__) # current script path test_dir = os.path.dirname(script_path) sample_path = os.path.join(test_dir, 'samples') @@ -99,6 +111,89 @@ def get_mssism(i1, i2, kernel=(15,15)): return np.mean(mssim[0:3]) +def run(command): + print(f'Running: {command}') + subprocess.run(command, shell=1, check=1) + + +def document_to(in_, out): + if platform.system() == 'Windows': + return word_to(in_, out) + else: + return libreoffice_to(in_, out) + + +_g_word_to_docx2pdf = False + +def word_to(in_, out): + global _g_word_to_docx2pdf + if not _g_word_to_docx2pdf: + run('pip install docx2pdf') + import docx2pdf + _g_word_to_docx2pdf = True + assert os.path.isfile(in_), f'Not a file: {in_=}' + run(f'docx2pdf {in_} {out}') + return + import docx2pdf + try: + docx2pdf.convert(in_, out) + except Exception as e: + print(f'docx2pdf.convert() raised exception: {e}') + raise + + + +def libreoffice_to(in_, out): + '''Converts file to pdf using libreoffice. Returns generated path + f'{in_}.pdf'.''' + # Libreoffice does not allow direct specification of the output path and + # goes wrong wtih paths with multiple '.' characters, so we work on a + # temporary. Also it does not return non-zero if it fails so we check + # mtime. + #print(f'{in_=} {out=}') + assert os.path.isfile(in_) + in_root, in_ext = os.path.splitext(in_) + _, out_ext = os.path.splitext(out) + out_dir = os.path.dirname(out) + temp = f'{out_dir}/_temp_libreoffice_to' + in2 = f'{temp}{in_ext}' + out2 = f'{temp}{out_ext}' + shutil.copy2(in_, in2) + try: + t = time.time() + #print(f'{in_=} {in2=} {in_ext=}') + run(f'libreoffice --convert-to {out_ext[1:]} --outdir {out_dir} {in2}') + os.rename(out2, out) + t_out = os.path.getmtime(out) + assert t_out >= t, f'libreoffice failed to update/create {out=}' + finally: + os.remove(in2) + if os.path.isfile(out2): + os.remove(out2) + + +def compare_pdf(pdf1, pdf2, num_pages=None): + #print(f'Comparing {pdf1=} {pdf2=}') + with fitz.Document(pdf1) as doc1, fitz.Document(pdf2) as doc2: + if num_pages: + n1 = num_pages + else: + n1 = len(doc1) + n2 = len(doc2) + if n1 != n2: + print(f'Differing numbers of pages: {n1=} {n2=}.') + return -1 + sidx = 0 + # Find average similarity. + for n in range(n1): + diff_png = f'{pdf2}.diff.{n}.png' + sidx_n = get_page_similarity(doc1[n], doc2[n], diff_png) + #print(f'Page {n}: {diff_png} {sidx_n=}.') + sidx += sidx_n + sidx /= n1 + #print(f'{sidx=}') + return sidx + class TestConversion: '''Test the converting process.''' @@ -129,135 +224,6 @@ def convert_by_io_stream(self, filename): docx_file = os.path.join(output_path, f'{filename}.docx') with open(docx_file, 'wb') as f: f.write(out_stream.getvalue()) - # ------------------------------------------ - # stream - # ------------------------------------------ - def test_io_stream(self): - '''test input/output file stream.''' - self.convert_by_io_stream('demo-text') - - # ------------------------------------------ - # layout: section - # ------------------------------------------ - def test_section(self): - '''test page layout: section and column.''' - self.convert('demo-section') - - def test_section_spacing(self): - '''test page layout: section vertical position.''' - self.convert('demo-section-spacing') - - # ------------------------------------------ - # text styles - # ------------------------------------------ - def test_blank_file(self): - '''test blank file without any texts or images.''' - self.convert('demo-blank') - - def test_text_format(self): - '''test text format, e.g. highlight, underline, strike-through.''' - self.convert('demo-text') - - def test_text_alignment(self): - '''test text alignment.''' - self.convert('demo-text-alignment') - - def test_unnamed_fonts(self): - '''test unnamed fonts which destroys span bbox, and accordingly line/block layout.''' - self.convert('demo-text-unnamed-fonts') - - def test_text_scaling(self): - '''test font size. In this case, the font size is set precisely with character scaling.''' - self.convert('demo-text-scaling') - - def test_text_hidden(self): - '''test hidden text, which is ignore by default.''' - self.convert('demo-text-hidden') - - # ------------------------------------------ - # image styles - # ------------------------------------------ - def test_image(self): - '''test inline-image.''' - self.convert('demo-image') - - def test_vector_graphic(self): - '''test vector graphic.''' - self.convert('demo-image-vector-graphic') - - def test_image_color_space(self): - '''test image color space.''' - self.convert('demo-image-colorspace') - - def test_image_floating(self): - '''test floating images.''' - self.convert('demo-image-floating') - - def test_image_rotation(self): - '''test rotating image due to pdf page rotation.''' - self.convert('demo-image-rotation') - - def test_image_overlap(self): - '''test images with both intersection and page rotation.''' - self.convert('demo-image-overlap') - - - # ------------------------------------------ - # table styles - # ------------------------------------------ - def test_table_bottom(self): - '''page break due to table at the end of page.''' - self.convert('demo-table-bottom') - - def test_table_format(self): - '''test table format, e.g. - - border and shading style - - vertical cell - - merged cell - - text format in cell - ''' - self.convert('demo-table') - - def test_stream_table(self): - '''test stream structure and shading.''' - self.convert('demo-table-stream') - - def test_table_shading(self): - '''test simulating shape with shading cell.''' - self.convert('demo-table-shading') - - def test_table_shading_highlight(self): - '''test distinguishing table shading and highlight.''' - self.convert('demo-table-shading-highlight') - - def test_lattice_table(self): - '''test lattice table with very close text underlines to table borders.''' - self.convert('demo-table-close-underline') - - def test_lattice_table_invoice(self): - '''test invoice sample file with lattice table, vector graphic.''' - self.convert('demo-table-lattice') - - def test_lattice_cell(self): - '''test generating stream borders for lattice table cell.''' - self.convert('demo-table-lattice-one-cell') - - def test_table_border_style(self): - '''test border style, e.g. width, color.''' - self.convert('demo-table-border-style') - - def test_table_align_borders(self): - '''aligning stream table borders to simplify table structure.''' - self.convert('demo-table-align-borders') - - def test_nested_table(self): - '''test nested tables.''' - self.convert('demo-table-nested') - - def test_path_transformation(self): - '''test path transformation. In this case, the (0,0) origin is out of the page.''' - self.convert('demo-path-transformation') - # ------------------------------------------ # table contents @@ -297,66 +263,103 @@ def test_multi_pages(self): assert os.path.isfile(docx_file) +# We make a separate pytest test for each sample file. + +def _find_paths(): + ret = list() + for path in glob.glob(f'{sample_path}/*.docx') + glob.glob(f'{sample_path}/*.pdf'): + path_leaf = os.path.basename(path) + if path_leaf.count('.') > 1: + continue + ret.append(os.path.relpath(path, root_path)) + return ret + +g_paths = _find_paths() + +# We create a separate pytest for each sample file, paramaterised using the +# path of the sample file relative to the pdf2docx directory. +# +# So one can run a specific test with: +# +# pytest pdf2docx/test/test.py::test_one[test/samples/demo-whisper_2_3.pdf] -class TestQuality: +@pytest.mark.parametrize('path', g_paths) +def test_one(path): '''Check the quality of converted docx. - Note the docx files must be converted to PDF files in advance. ''' - - INDEX_MAP = { + + # Where there are two values, they are (sidx_required_word, + # sidx_required_libreoffice). + # + docx_to_sidx_required = { 'demo-blank.pdf': 1.0, 'demo-image-cmyk.pdf': 0.90, 'demo-image-transparent.pdf': 0.90, - 'demo-image-vector-graphic.pdf': 0.89, + 'demo-image-vector-graphic.pdf': (0.89, 0.68), 'demo-image.pdf': 0.90, - 'demo-image-rotation.pdf': 0.90, - 'demo-image-overlap.pdf': 0.90, - 'demo-path-transformation.pdf': 0.90, - 'demo-section-spacing.pdf': 0.90, - 'demo-section.pdf': 0.70, + 'demo-image-rotation.pdf': (0.90, 0.82), + 'demo-image-overlap.pdf': (0.90, 0.70), + 'demo-path-transformation.pdf': (0.89, 0.60), + 'demo-section-spacing.pdf': (0.90, 0.86), + 'demo-section.pdf': (0.70, 0.45), 'demo-table-align-borders.pdf': 0.49, - 'demo-table-border-style.pdf': 0.90, + 'demo-table-border-style.pdf': (0.90, 0.89), 'demo-table-bottom.pdf': 0.90, - 'demo-table-close-underline.pdf': 0.58, - 'demo-table-lattice-one-cell.pdf': 0.79, - 'demo-table-lattice.pdf': 0.75, + 'demo-table-close-underline.pdf': (0.57, 0.49), + 'demo-table-lattice-one-cell.pdf': (0.79, 0.75), + 'demo-table-lattice.pdf': (0.75, 0.59), 'demo-table-nested.pdf': 0.84, - 'demo-table-shading-highlight.pdf': 0.55, - 'demo-table-shading.pdf': 0.80, + 'demo-table-shading-highlight.pdf': (0.55, 0.45), + 'demo-table-shading.pdf': (0.80, 0.60), 'demo-table-stream.pdf': 0.55, - 'demo-table.pdf': 0.90, - 'demo-text-alignment.pdf': 0.90, - 'demo-text-scaling.pdf': 0.80, - 'demo-text-unnamed-fonts.pdf': 0.80, + 'demo-table.pdf': (0.90, 0.75), + 'demo-text-alignment.pdf': (0.90, 0.86), + 'demo-text-scaling.pdf': (0.80, 0.65), + 'demo-text-unnamed-fonts.pdf': (0.80, 0.77), 'demo-text-hidden.pdf': 0.90, - 'demo-text.pdf': 0.80 + 'demo-text.pdf': 0.80, + 'pdf2docx-lists-bullets3.docx': (0.98, 0.99), } - def setup(self): - '''create output path if not exist.''' - if not os.path.exists(output_path): os.mkdir(output_path) - - - def test_quality(self): - '''Convert page to image and compare similarity.''' - for filename in os.listdir(output_path): - if not filename.endswith('pdf'): continue - - source_pdf_file = os.path.join(sample_path, filename) - target_pdf_file = os.path.join(output_path, filename) - - # open pdf - source_pdf = fitz.open(source_pdf_file) - target_pdf = fitz.open(target_pdf_file) - - # compare page count - if len(source_pdf)>1: continue # one page sample only - assert len(target_pdf)==1, f"\nThe page count of {filename} is incorrect." - - # compare the first page - diff_png = os.path.join(output_path, f'{filename[:-4]}.png') - sidx = get_page_similarity(target_pdf[0], source_pdf[0], diff_png) - threshold = TestQuality.INDEX_MAP.get(filename, 0.10) - print(f'Checking {filename}: {sidx} v.s. {threshold}') - assert sidx>=threshold, 'Significant difference might exist since similarity index is lower than threshold.' - + print(f'# Looking at: {path}') + if os.path.basename(path) == 'demo-whisper_2_3.pdf': + print(f'Ignoring {path=} because known to fail.') + return + path = f'{root_path}/{path}' + path_leaf = os.path.basename(path) + _, ext = os.path.splitext(path) + if ext == '.docx': + pdf = f'{path}.pdf' + document_to(path, pdf) + else: + pdf = path + docx2 = f'{pdf}.docx' + pages = None + if os.path.basename(path) == 'demo-whisper_2_3.pdf': + pages = [25, 26, 27] + else: + with fitz.Document(pdf) as doc: + if len(doc) > 1: + print(f'Not testing because more than one page: {path}') + return + #print(f'Calling parse() {pdf=} {docx2=}') + parse(pdf, docx2, pages=pages, raw_exceptions=True) + assert os.path.isfile(docx2) + pdf2 = f'{docx2}.pdf' + document_to(docx2, pdf2) + sidx = compare_pdf(pdf, pdf2, num_pages=1) + + sidx_required = docx_to_sidx_required.get(path_leaf) + if sidx_required: + if isinstance(sidx_required, tuple): + sr_word, sr_libreoffice = sidx_required + sidx_required = sr_word if platform.system() == 'Windows' else sr_libreoffice + + #print(f'{path=}: {sidx_required=} {sidx=}.') + if sidx < sidx_required: + print(f'{sidx=} too low - should be >= {sidx_required=}') + print(f' {pdf}') + print(f' {pdf2}') + assert 0 + else: + print(f'# No sidx_required available for {path_leaf=}.')