Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multi parameter for page level parameters #42

Draft
wants to merge 4 commits into
base: main
Choose a base branch
from
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
27 changes: 23 additions & 4 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ class PDFHandler:

"""

def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None):
def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None, multi={}):
if is_url(filepath):
filepath = download_url(filepath)
self.filepath: Union[StrByteType, Path] = filepath
Expand All @@ -51,6 +51,7 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
if sys.version_info[0] < 3:
self.password = self.password.encode("ascii")
self.pages = self._get_pages(pages)
self.multi = multi

def _get_pages(self, pages):
"""Converts pages string to list of ints.
Expand Down Expand Up @@ -188,8 +189,17 @@ def parse(
with mp.get_context("spawn").Pool(processes=cpu_count) as pool:
jobs = []
for p in self.pages:
p_no = str(p)

page_kwargs = kwargs
page_parser = parser

if p_no in self.multi:
page_kwargs.update(self.multi[p_no])
page_parser = Lattice(**page_kwargs) if flavor == 'lattice' else Stream(**page_kwargs)

j = pool.apply_async(
self._parse_page,(p, tempdir, parser, suppress_stdout, layout_kwargs)
self._parse_page,(p, tempdir, page_parser, suppress_stdout, layout_kwargs)
)
jobs.append(j)

Expand All @@ -198,7 +208,16 @@ def parse(
tables.extend(t)
else:
for p in self.pages:
t = self._parse_page(p, tempdir, parser, suppress_stdout, layout_kwargs)
p_no = str(p)

page_kwargs = kwargs
page_parser = parser

if p_no in self.multi:
page_kwargs.update(self.multi[p_no])
page_parser = Lattice(**page_kwargs) if flavor == 'lattice' else Stream(**page_kwargs)

t = self._parse_page(p, tempdir, page_parser, suppress_stdout, layout_kwargs)
tables.extend(t)

return TableList(sorted(tables))
Expand All @@ -224,7 +243,7 @@ def _parse_page(
-------
tables : camelot.core.TableList
List of tables found in PDF.

"""
self._save_page(self.filepath, page, tempdir)
page_path = os.path.join(tempdir, f"page-{page}.pdf")
Expand Down
6 changes: 5 additions & 1 deletion camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def read_pdf(
suppress_stdout=False,
parallel=False,
layout_kwargs=None,
multi = {},
**kwargs
):
"""Read PDF and return extracted tables.
Expand All @@ -43,6 +44,9 @@ def read_pdf(
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
multi : dict, optional (default: {})
A dict of page-specific parameters, mapping a page number (as a str) to a dict of parameters for that page.
Parameters defined in multi override kwargs for that page.
table_areas : list, optional (default: None)
List of table area strings of the form x1,y1,x2,y2
where (x1, y1) -> left-top and (x2, y2) -> right-bottom
Expand Down Expand Up @@ -120,7 +124,7 @@ def read_pdf(
warnings.simplefilter("ignore")

validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages=pages, password=password)
p = PDFHandler(filepath, pages=pages, password=password, multi=multi)
kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(
flavor=flavor,
Expand Down
17 changes: 17 additions & 0 deletions tests/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -3838,3 +3838,20 @@
"[email protected]",
],
]

# Expected rows for the first table compared in test_multi_params
# (tests/files/multi_params.pdf); the first row is the header.
data_multi_params1 = [
    ["Number of Coils", "Number of Paperclips"],
    ["5", "3, 5, 4"],
    ["10", "7, 8, 6"],
    ["15", "11, 10, 12"],
    ["20", "15, 13, 14"],
]

# Expected rows for the second table compared in test_multi_params
# (tests/files/multi_params.pdf); the first row is the header.
data_multi_params2 = [
    ["Time (drops of water)", "Distance (cm)"],
    ["1", "10,11,9"],
    ["2", "29, 31, 30"],
    ["3", "59, 58, 61"],
    ["4", "102, 100, 98"],
    ["5", "122, 125, 127"],
]
Binary file added tests/files/multi_params.pdf
Binary file not shown.
14 changes: 14 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
Expand Up @@ -214,3 +214,17 @@ def _make_table(page, order):
assert iterator_b is not None
item_c = next(iterator_b)
assert item_c is not None

@skip_on_windows
def test_multi_params(testdir):
    """Check that per-page ``multi`` parameters are honoured by ``read_pdf``.

    All pages of ``multi_params.pdf`` are read with ``split_text=True``,
    while page "2" additionally receives its own ``table_regions`` through
    the ``multi`` argument. The first two extracted tables must match the
    expected fixture data.
    """
    expected_first = pd.DataFrame(data_multi_params1)
    expected_second = pd.DataFrame(data_multi_params2)

    pdf_path = os.path.join(testdir, "multi_params.pdf")
    # Overrides keyed by page number (as a string), applied to page 2 only.
    page_overrides = {"2": {"table_regions": ["120, 210, 400, 90"]}}

    tables = camelot.read_pdf(
        pdf_path,
        pages="all",
        multi=page_overrides,
        split_text=True,
    )

    assert expected_first.equals(tables[0].df)
    assert expected_second.equals(tables[1].df)
Loading