Skip to content

Commit

Permalink
add support for file_bytes argument with managed_file_context()
Browse files Browse the repository at this point in the history
  • Loading branch information
cscanlin-kwh authored and bosd committed Mar 27, 2024
1 parent 567520b commit d487642
Show file tree
Hide file tree
Showing 4 changed files with 153 additions and 78 deletions.
153 changes: 98 additions & 55 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
from contextlib import contextmanager
import io
import os
import sys
from pathlib import Path
Expand All @@ -11,7 +13,8 @@
from .parsers import Lattice
from .parsers import Stream
from .utils import TemporaryDirectory
from .utils import download_url
from .utils import InvalidArguments
from .utils import get_url_bytes
from .utils import get_page_layout
from .utils import get_rotation
from .utils import get_text_objects
Expand All @@ -25,21 +28,36 @@ class PDFHandler:
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
filepath : str | pathlib.Path, optional (default: None)
Filepath or URL of the PDF file. Required if file_bytes is not given
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
Password for decryption.
file_bytes : io.IOBase, optional (default: None)
A file-like stream. Required if filepath is not given
"""

def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None):
FilePathType = Union[str, IO[Any], Path, None]

def __init__(self, filepath: FilePathType = None, pages="1", password=None, file_bytes=None):
if is_url(filepath):
filepath = download_url(filepath)
self.filepath: Union[StrByteType, Path] = filepath
file_bytes = get_url_bytes(filepath)

if not filepath and not file_bytes:
raise InvalidArguments('Either `filepath` or `file_bytes` is required')
if not filepath:
# filepath must either be passed, or taken from the name attribute
filepath = getattr(file_bytes, 'name')
if not filepath:
msg = ('Either pass a `filepath`, or give the '
'`file_bytes` argument a name attribute')
raise InvalidArguments(msg)
self.file_bytes = file_bytes # ok to be None

self.filepath: Union[StrByteType, Path] = filepath
if isinstance(filepath, str) and not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported")

Expand All @@ -51,13 +69,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
self.password = self.password.encode("ascii")
self.pages = self._get_pages(pages)

@contextmanager
def managed_file_context(self):
"""Reads from either the `filepath` or `file_bytes`
attribute of this instance, to return a file-like object.
Closes any open file handles on exit or error.
Returns
-------
file_bytes : io.IOBase
A readable, seekable, file-like object
"""
if self.file_bytes:
# if we can't seek, write to a BytesIO object that can,
# then seek to the beginning before yielding
if not hasattr(self.file_bytes, 'seek'):
self.file_bytes = io.BytesIO(self.file_bytes.read())
self.file_bytes.seek(0)
yield self.file_bytes
else:
with open(self.filepath, "rb") as file_bytes:
yield file_bytes

def _get_pages(self, pages):
"""Converts pages string to list of ints.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
managed_file_context : io.IOBase
A readable, seekable, file-like object
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Expand All @@ -73,74 +113,77 @@ def _get_pages(self, pages):
if pages == "1":
page_numbers.append({"start": 1, "end": 1})
else:
infile = PdfReader(self.filepath, strict=False)

if infile.is_encrypted:
infile.decrypt(self.password)

if pages == "all":
page_numbers.append({"start": 1, "end": len(infile.pages)})
else:
for r in pages.split(","):
if "-" in r:
a, b = r.split("-")
if b == "end":
b = len(infile.pages)
page_numbers.append({"start": int(a), "end": int(b)})
else:
page_numbers.append({"start": int(r), "end": int(r)})
with self.managed_file_context() as f:
infile = PdfReader(f, strict=False)

if infile.is_encrypted:
infile.decrypt(self.password)

if pages == "all":
page_numbers.append({"start": 1, "end": len(infile.pages)})
else:
for r in pages.split(","):
if "-" in r:
a, b = r.split("-")
if b == "end":
b = len(infile.pages)
page_numbers.append({"start": int(a), "end": int(b)})
else:
page_numbers.append({"start": int(r), "end": int(r)})

result = []
for p in page_numbers:
result.extend(range(p["start"], p["end"] + 1))
return sorted(set(result))

def _save_page(self, filepath: Union[StrByteType, Path], page, temp):
def _save_page(self, page, temp):
"""Saves specified page from PDF into a temporary directory.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
managed_file_context : io.IOBase
A readable, seekable, file-like object
page : int
Page number.
temp : str
Tmp directory.
"""
infile = PdfReader(filepath, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
p = infile.pages[page - 1]
outfile = PdfWriter()
outfile.add_page(p)
with open(fpath, "wb") as f:
outfile.write(f)
layout, dim = get_page_layout(fpath)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
instream = open(fpath_new, "rb")
infile = PdfReader(instream, strict=False)

with self.managed_file_context() as fileobj:
infile = PdfReader(fileobj, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
fpath = os.path.join(temp, f"page-{page}.pdf")
froot, fext = os.path.splitext(fpath)
p = infile.pages[page - 1]
outfile = PdfWriter()
p = infile.pages[0]
if rotation == "anticlockwise":
p.rotate(90)
elif rotation == "clockwise":
p.rotate(-90)
outfile.add_page(p)
with open(fpath, "wb") as f:
outfile.write(f)
instream.close()
layout, dim = get_page_layout(fpath)
# fix rotated PDF
chars = get_text_objects(layout, ltype="char")
horizontal_text = get_text_objects(layout, ltype="horizontal_text")
vertical_text = get_text_objects(layout, ltype="vertical_text")
rotation = get_rotation(chars, horizontal_text, vertical_text)
if rotation != "":
fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
os.rename(fpath, fpath_new)
instream = open(fpath_new, "rb")
infile = PdfReader(instream, strict=False)
if infile.is_encrypted:
infile.decrypt(self.password)
outfile = PdfWriter()
p = infile.pages[0]
if rotation == "anticlockwise":
p.rotate(90)
elif rotation == "clockwise":
p.rotate(-90)
outfile.add_page(p)
with open(fpath, "wb") as f:
outfile.write(f)
instream.close()

def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
Expand Down Expand Up @@ -173,7 +216,7 @@ def parse(
tables = []
with TemporaryDirectory() as tempdir:
for p in self.pages:
self._save_page(self.filepath, p, tempdir)
self._save_page(p, tempdir)
pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages]
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
for p in pages:
Expand Down
22 changes: 16 additions & 6 deletions camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,22 @@
from pypdf._utils import StrByteType

from .handlers import PDFHandler
from .utils import remove_extra
from .utils import validate_input

from .utils import (
InvalidArguments,
validate_input,
remove_extra,
)


def read_pdf(
filepath: Union[StrByteType, Path],
filepath=Union[StrByteType, Path],
pages="1",
password=None,
flavor="lattice",
suppress_stdout=False,
layout_kwargs=None,
file_bytes=None,
**kwargs
):
"""Read PDF and return extracted tables.
Expand All @@ -25,8 +30,8 @@ def read_pdf(
Parameters
----------
filepath : str, Path, IO
Filepath or URL of the PDF file.
filepath : str | pathlib.Path, optional (default: None)
Filepath or URL of the PDF file. Required if file_bytes is not given
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Expand All @@ -37,6 +42,8 @@ def read_pdf(
Lattice is used by default.
suppress_stdout : bool, optional (default: True)
Print all logs and warnings.
file_bytes : io.IOBase, optional (default: None)
A file-like stream. Required if filepath is not given
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
Expand Down Expand Up @@ -112,12 +119,15 @@ def read_pdf(
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
)

if not filepath and not file_bytes:
raise InvalidArguments('Either `filepath` or `file_bytes` is required')

with warnings.catch_warnings():
if suppress_stdout:
warnings.simplefilter("ignore")

validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages=pages, password=password)
p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes)
kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(
flavor=flavor,
Expand Down
37 changes: 20 additions & 17 deletions camelot/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os
import io
import random
import re
import shutil
Expand Down Expand Up @@ -34,6 +34,10 @@
_VALID_URLS.discard("")


class InvalidArguments(Exception):
pass


# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
def is_url(url):
"""Check to see if a URL has a valid protocol.
Expand Down Expand Up @@ -64,31 +68,30 @@ def random_string(length):
return ret


def download_url(url):
"""Download file from specified URL.
def get_url_bytes(url):
"""Get a stream of bytes for url
Parameters
----------
url : str or unicode
Returns
-------
filepath : str or unicode
Temporary filepath.
file_bytes : io.BytesIO
a file-like object that cane be read
"""
filename = f"{random_string(6)}.pdf"
with tempfile.NamedTemporaryFile("wb", delete=False) as f:
headers = {"User-Agent": "Mozilla/5.0"}
request = Request(url, None, headers)
obj = urlopen(request)
content_type = obj.info().get_content_type()
if content_type != "application/pdf":
raise NotImplementedError("File format not supported")
f.write(obj.read())
filepath = os.path.join(os.path.dirname(f.name), filename)
shutil.move(f.name, filepath)
return filepath
file_bytes = io.BytesIO()
file_bytes.name = url
headers = {"User-Agent": "Mozilla/5.0"}
request = Request(url, data=None, headers=headers)
obj = urlopen(request)
content_type = obj.info().get_content_type()
if content_type != "application/pdf":
raise NotImplementedError("File format not supported")
file_bytes.write(obj.read())
file_bytes.seek(0)
return file_bytes


stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
Expand Down
19 changes: 19 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import io
import os
from pathlib import Path

Expand Down Expand Up @@ -188,3 +189,21 @@ def test_handler_with_pathlib(testdir):
with open(filename, "rb") as f:
handler = PDFHandler(f)
assert handler._get_pages("1") == [1]

@skip_on_windows
def test_from_open(testdir):
filename = os.path.join(testdir, "foo.pdf")
with open(filename, "rb") as file_bytes:
tables = camelot.read_pdf(file_bytes=file_bytes)
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"

@skip_on_windows
def test_from_bytes(testdir):
filename = os.path.join(testdir, "foo.pdf")
file_bytes = io.BytesIO()
with open(filename, "rb") as f:
file_bytes.write(f.read()) # note that we didn't seek, done by PDFHandler
tables = camelot.read_pdf(file_bytes=file_bytes)
assert repr(tables) == "<TableList n=1>"
assert repr(tables[0]) == "<Table shape=(7, 7)>"

0 comments on commit d487642

Please sign in to comment.