Skip to content

Commit

Permalink
add support for file_bytes argument with managed_file_context()
Browse files Browse the repository at this point in the history
  • Loading branch information
cscanlin-kwh authored and bosd committed Mar 27, 2024
1 parent 567520b commit 81c9ce4
Show file tree
Hide file tree
Showing 4 changed files with 154 additions and 79 deletions.
155 changes: 99 additions & 56 deletions camelot/handlers.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
from contextlib import contextmanager
import io
import os
import sys
from pathlib import Path
from typing import Union
from typing import Union, Any, IO, TypeVar

from pypdf import PdfReader
from pypdf import PdfWriter
Expand All @@ -11,7 +13,8 @@
from .parsers import Lattice
from .parsers import Stream
from .utils import TemporaryDirectory
from .utils import download_url
from .utils import InvalidArguments
from .utils import get_url_bytes
from .utils import get_page_layout
from .utils import get_rotation
from .utils import get_text_objects
Expand All @@ -25,21 +28,36 @@ class PDFHandler:
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
filepath : str | pathlib.Path, optional (default: None)
Filepath or URL of the PDF file. Required if file_bytes is not given
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
password : str, optional (default: None)
Password for decryption.
file_bytes : io.IOBase, optional (default: None)
A file-like stream. Required if filepath is not given
"""

def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None):
# Accepted types for the ``filepath`` argument: a path/URL string, an open
# binary stream, a ``pathlib.Path``, or ``None`` when only ``file_bytes`` is
# supplied.  This is a plain ``Union`` alias rather than a ``TypeVar``:
# ``TypeVar`` expects a string name as its first argument and exists for
# generics, not for enumerating the types a parameter may take.
FilePathType = Union[str, IO[Any], Path, None]

def __init__(self, filepath: FilePathType = None, pages="1", password=None, file_bytes=None):
if is_url(filepath):
filepath = download_url(filepath)
self.filepath: Union[StrByteType, Path] = filepath
file_bytes = get_url_bytes(filepath)

# At least one source of PDF data is mandatory.
if not filepath and not file_bytes:
    raise InvalidArguments('Either `filepath` or `file_bytes` is required')
if not filepath:
    # filepath must either be passed, or taken from the name attribute.
    # The ``None`` default matters: in-memory streams such as ``io.BytesIO``
    # have no ``name``, and without a default ``getattr`` would raise
    # ``AttributeError`` before the explanatory error below could be raised.
    filepath = getattr(file_bytes, 'name', None)
    if not filepath:
        msg = ('Either pass a `filepath`, or give the '
               '`file_bytes` argument a name attribute')
        raise InvalidArguments(msg)
self.file_bytes = file_bytes  # ok to be None

self.filepath: Union[StrByteType, Path] = filepath
if isinstance(filepath, str) and not filepath.lower().endswith(".pdf"):
raise NotImplementedError("File format not supported")

Expand All @@ -51,13 +69,35 @@ def __init__(self, filepath: Union[StrByteType, Path], pages="1", password=None)
self.password = self.password.encode("ascii")
self.pages = self._get_pages(pages)

@contextmanager
def managed_file_context(self):
    """Yield a readable, seekable binary stream for this handler's PDF.

    Reads from the `file_bytes` attribute when it is set, otherwise opens
    `filepath` in binary mode.  A file opened here is closed on exit or
    error; a caller-supplied `file_bytes` stream is left open.

    Yields
    ------
    file_bytes : io.IOBase
        A readable, seekable, file-like object positioned at offset 0
        (for `file_bytes`) or freshly opened (for `filepath`).

    """
    stream = self.file_bytes
    if not stream:
        with open(self.filepath, "rb") as file_bytes:
            yield file_bytes
        return

    # ``hasattr(stream, "seek")`` is not enough: every ``io.IOBase`` object
    # has a ``seek`` method, but pipes/sockets/HTTP responses raise
    # ``io.UnsupportedOperation`` when it is called.  Ask ``seekable()``
    # when the stream provides it and only fall back to the attribute check
    # for duck-typed objects that do not.
    probe = getattr(stream, "seekable", None)
    can_seek = probe() if callable(probe) else hasattr(stream, "seek")
    if not can_seek:
        # Buffer the whole stream in memory so it can be rewound, and keep
        # the buffer so later calls can re-read the same data.
        stream = io.BytesIO(stream.read())
        self.file_bytes = stream
    stream.seek(0)
    yield stream

def _get_pages(self, pages):
"""Converts pages string to list of ints.
Parameters
----------
filepath : str
Filepath or URL of the PDF file.
managed_file_context : io.IOBase
A readable, seekable, file-like object
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Expand All @@ -73,74 +113,77 @@ def _get_pages(self, pages):
if pages == "1":
page_numbers.append({"start": 1, "end": 1})
else:
infile = PdfReader(self.filepath, strict=False)

if infile.is_encrypted:
infile.decrypt(self.password)

if pages == "all":
page_numbers.append({"start": 1, "end": len(infile.pages)})
else:
for r in pages.split(","):
if "-" in r:
a, b = r.split("-")
if b == "end":
b = len(infile.pages)
page_numbers.append({"start": int(a), "end": int(b)})
else:
page_numbers.append({"start": int(r), "end": int(r)})
with self.managed_file_context() as f:
infile = PdfReader(f, strict=False)

if infile.is_encrypted:
infile.decrypt(self.password)

if pages == "all":
page_numbers.append({"start": 1, "end": len(infile.pages)})
else:
for r in pages.split(","):
if "-" in r:
a, b = r.split("-")
if b == "end":
b = len(infile.pages)
page_numbers.append({"start": int(a), "end": int(b)})
else:
page_numbers.append({"start": int(r), "end": int(r)})

result = []
for p in page_numbers:
result.extend(range(p["start"], p["end"] + 1))
return sorted(set(result))

def _save_page(self, page, temp):
    """Saves specified page from PDF into a temporary directory.

    The page is written to ``<temp>/page-<page>.pdf``.  When the extracted
    text objects indicate a rotated page, the file is renamed aside and
    re-written to the same path with the rotation applied.

    Parameters
    ----------
    page : int
        Page number (1-based; page ``n`` is read from ``pages[n - 1]``).
    temp : str
        Tmp directory.

    """
    with self.managed_file_context() as fileobj:
        infile = PdfReader(fileobj, strict=False)
        if infile.is_encrypted:
            infile.decrypt(self.password)
        fpath = os.path.join(temp, f"page-{page}.pdf")
        froot, fext = os.path.splitext(fpath)
        p = infile.pages[page - 1]
        outfile = PdfWriter()
        outfile.add_page(p)
        with open(fpath, "wb") as f:
            outfile.write(f)
        layout, dim = get_page_layout(fpath)
        # fix rotated PDF
        chars = get_text_objects(layout, ltype="char")
        horizontal_text = get_text_objects(layout, ltype="horizontal_text")
        vertical_text = get_text_objects(layout, ltype="vertical_text")
        rotation = get_rotation(chars, horizontal_text, vertical_text)
        if rotation != "":
            fpath_new = "".join([froot.replace("page", "p"), "_rotated", fext])
            os.rename(fpath, fpath_new)
            # Context manager so the handle is released even if reading or
            # re-writing the rotated page raises.  The write stays inside
            # the block because the page is still backed by ``instream``.
            with open(fpath_new, "rb") as instream:
                infile = PdfReader(instream, strict=False)
                if infile.is_encrypted:
                    infile.decrypt(self.password)
                outfile = PdfWriter()
                p = infile.pages[0]
                if rotation == "anticlockwise":
                    p.rotate(90)
                elif rotation == "clockwise":
                    p.rotate(-90)
                outfile.add_page(p)
                with open(fpath, "wb") as f:
                    outfile.write(f)

def parse(
self, flavor="lattice", suppress_stdout=False, layout_kwargs=None, **kwargs
Expand Down Expand Up @@ -173,7 +216,7 @@ def parse(
tables = []
with TemporaryDirectory() as tempdir:
for p in self.pages:
self._save_page(self.filepath, p, tempdir)
self._save_page(p, tempdir)
pages = [os.path.join(tempdir, f"page-{p}.pdf") for p in self.pages]
parser = Lattice(**kwargs) if flavor == "lattice" else Stream(**kwargs)
for p in pages:
Expand Down
22 changes: 16 additions & 6 deletions camelot/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,17 +5,22 @@
from pypdf._utils import StrByteType

from .handlers import PDFHandler
from .utils import remove_extra
from .utils import validate_input

from .utils import (
InvalidArguments,
validate_input,
remove_extra,
)


def read_pdf(
filepath: Union[StrByteType, Path],
filepath=Union[StrByteType, Path],
pages="1",
password=None,
flavor="lattice",
suppress_stdout=False,
layout_kwargs=None,
file_bytes=None,
**kwargs
):
"""Read PDF and return extracted tables.
Expand All @@ -25,8 +30,8 @@ def read_pdf(
Parameters
----------
filepath : str, Path, IO
Filepath or URL of the PDF file.
filepath : str | pathlib.Path, optional (default: None)
Filepath or URL of the PDF file. Required if file_bytes is not given
pages : str, optional (default: '1')
Comma-separated page numbers.
Example: '1,3,4' or '1,4-end' or 'all'.
Expand All @@ -37,6 +42,8 @@ def read_pdf(
Lattice is used by default.
suppress_stdout : bool, optional (default: False)
Suppress all logs and warnings.
file_bytes : io.IOBase, optional (default: None)
A file-like stream. Required if filepath is not given
layout_kwargs : dict, optional (default: {})
A dict of `pdfminer.layout.LAParams
<https://github.com/euske/pdfminer/blob/master/pdfminer/layout.py#L33>`_ kwargs.
Expand Down Expand Up @@ -112,12 +119,15 @@ def read_pdf(
"Unknown flavor specified." " Use either 'lattice' or 'stream'"
)

if not filepath and not file_bytes:
raise InvalidArguments('Either `filepath` or `file_bytes` is required')

with warnings.catch_warnings():
if suppress_stdout:
warnings.simplefilter("ignore")

validate_input(kwargs, flavor=flavor)
p = PDFHandler(filepath, pages=pages, password=password)
p = PDFHandler(filepath, pages=pages, password=password, file_bytes=file_bytes)
kwargs = remove_extra(kwargs, flavor=flavor)
tables = p.parse(
flavor=flavor,
Expand Down
37 changes: 20 additions & 17 deletions camelot/utils.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import os
import io
import random
import re
import shutil
Expand Down Expand Up @@ -34,6 +34,10 @@
_VALID_URLS.discard("")


class InvalidArguments(Exception):
    """Raised when a caller supplies an unusable combination of arguments."""


# https://github.com/pandas-dev/pandas/blob/master/pandas/io/common.py
def is_url(url):
"""Check to see if a URL has a valid protocol.
Expand Down Expand Up @@ -64,31 +68,30 @@ def random_string(length):
return ret


def get_url_bytes(url):
    """Get a stream of bytes for url

    Parameters
    ----------
    url : str or unicode

    Returns
    -------
    file_bytes : io.BytesIO
        a file-like object that can be read, positioned at offset 0, whose
        ``name`` attribute is set to ``url``

    Raises
    ------
    NotImplementedError
        If the response content type is not ``application/pdf``.

    """
    headers = {"User-Agent": "Mozilla/5.0"}
    request = Request(url, data=None, headers=headers)
    # Close the response deterministically, including when the content-type
    # check below rejects it.
    with urlopen(request) as response:
        content_type = response.info().get_content_type()
        if content_type != "application/pdf":
            raise NotImplementedError("File format not supported")
        payload = response.read()
    file_bytes = io.BytesIO(payload)
    file_bytes.name = url
    return file_bytes


stream_kwargs = ["columns", "edge_tol", "row_tol", "column_tol"]
Expand Down
19 changes: 19 additions & 0 deletions tests/test_common.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import io
import os
from pathlib import Path

Expand Down Expand Up @@ -188,3 +189,21 @@ def test_handler_with_pathlib(testdir):
with open(filename, "rb") as f:
handler = PDFHandler(f)
assert handler._get_pages("1") == [1]

@skip_on_windows
def test_from_open(testdir):
    """read_pdf accepts an already-open binary file via ``file_bytes``."""
    pdf_path = os.path.join(testdir, "foo.pdf")
    with open(pdf_path, "rb") as handle:
        tables = camelot.read_pdf(file_bytes=handle)
        table_list_repr = repr(tables)
        assert table_list_repr == "<TableList n=1>"
        first_table_repr = repr(tables[0])
        assert first_table_repr == "<Table shape=(7, 7)>"

@skip_on_windows
def test_from_bytes(testdir):
    """read_pdf accepts an in-memory ``io.BytesIO`` stream via ``file_bytes``."""
    filename = os.path.join(testdir, "foo.pdf")
    file_bytes = io.BytesIO()
    # PDFHandler falls back to the stream's ``name`` attribute when no
    # filepath is passed; ``io.BytesIO`` has none by default, so give it one
    # so the test does not depend on what ``read_pdf`` passes as filepath.
    file_bytes.name = filename
    with open(filename, "rb") as f:
        file_bytes.write(f.read())  # note that we didn't seek, done by PDFHandler
    tables = camelot.read_pdf(file_bytes=file_bytes)
    assert repr(tables) == "<TableList n=1>"
    assert repr(tables[0]) == "<Table shape=(7, 7)>"

0 comments on commit 81c9ce4

Please sign in to comment.