diff --git a/MANIFEST b/MANIFEST deleted file mode 100644 index 6a8ccb9..0000000 --- a/MANIFEST +++ /dev/null @@ -1,5 +0,0 @@ -# file GENERATED by distutils, do NOT edit -setup.cfg -setup.py -tikapp/__init__.py -tikapp/exceptions.py diff --git a/MANIFEST.in b/MANIFEST.in new file mode 100644 index 0000000..f9bd145 --- /dev/null +++ b/MANIFEST.in @@ -0,0 +1 @@ +include requirements.txt diff --git a/README b/README new file mode 100644 index 0000000..47af015 --- /dev/null +++ b/README @@ -0,0 +1,145 @@ +tika-app-python +=============== + +Overview +-------- + +tika-app-python is a wrapper for `Apache Tika App`_. + +Apache 2 Open Source License +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +tika-app-python can be downloaded, used, and modified free of charge. It +is available under the Apache 2 license. + +Authors +------- + +Main Author +~~~~~~~~~~~ + +Fedele Mantuano (**Twitter**: +`@fedelemantuano <https://twitter.com/fedelemantuano>`_) + +Installation +------------ + +Clone repository + +:: + + git clone https://github.com/fedelemantuano/tika-app-python.git + +and install tika-app-python with ``setup.py``: + +:: + + cd tika-app-python + + python setup.py install + +or use ``pip``: + +:: + + pip install tika-app + +Usage +----- + +Import ``TikaApp`` class: + +:: + + from tikapp import TikaApp + + tika_client = TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") + +To get the **content type**: + +:: + + tika_client.detect_content_type("your_file") + +To detect the **language**: + +:: + + tika_client.detect_language("your_file") + +To extract **all metadata and content**: + +:: + + tika_client.extract_all_content("your_file") + +To extract **only content**: + +:: + + tika_client.extract_only_content("your_file") + +If you want to use payload in base64, you can use the same methods with +the ``payload`` argument: + +:: + + tika_client.detect_content_type(payload="base64_payload") + tika_client.detect_language(payload="base64_payload") + tika_client.extract_all_content(payload="base64_payload") + 
tika_client.extract_only_content(payload="base64_payload") + +Usage from command-line +----------------------- + +If you installed tika-app-python with ``pip`` or ``setup.py`` you can +use it with command-line. To use tika-app-python you should submit the +Apache Tika app JAR. You can: - leave the default value: +``/opt/tika/tika-app-1.13.jar`` - set the environment value +``TIKA_APP_JAR`` - use ``--jar`` switch + +The last one overrides all the others. + +These are all switches: + +:: + + usage: tikapp [-h] (-f FILE | -p PAYLOAD) [-j JAR] [-d] [-t] [-l] [-a] + [-v] + + Wrapper for Apache Tika App. + + optional arguments: + -h, --help show this help message and exit + -f FILE, --file FILE File to submit (default: None) + -p PAYLOAD, --payload PAYLOAD + Base64 payload to submit (default: None) + -j JAR, --jar JAR Apache Tika app JAR (default: None) + -d, --detect Detect document type (default: False) + -t, --text Output plain text content (default: False) + -l, --language Output only language (default: False) + -a, --all Output metadata and content from all embedded files + (default: False) + -v, --version show program's version number and exit + +Example: + +.. code:: shell + + $ tikapp -f example_file -a + +Performance tests +----------------- + +These are the results of performance tests in `tests`_ folder: + +:: + + tika_content_type() 0.708108 sec + tika_detect_language() 1.748900 sec + magic_content_type() 0.000215 sec + tika_extract_all_content() 0.849755 sec + tika_extract_only_content() 0.791735 sec + +.. _Apache Tika App: https://tika.apache.org/ +.. 
_tests: https://github.com/fedelemantuano/tika-app-python/tree/develop/tests diff --git a/README.md b/README.md index 9bff11c..c0a7dc1 100644 --- a/README.md +++ b/README.md @@ -36,7 +36,7 @@ or use `pip`: pip install tika-app ``` -## Usage +## Usage in a project Import `TikaApp` class: @@ -79,9 +79,47 @@ tika_client.extract_all_content(payload="base64_payload") tika_client.extract_only_content(payload="base64_payload") ``` +## Usage from command-line + +If you installed tika-app-python with `pip` or `setup.py` you can use it with command-line. +To use tika-app-python you should submit the Apache Tika app JAR. You can: + - leave the default value: `/opt/tika/tika-app-1.13.jar` + - set the environment value `TIKA_APP_JAR` + - use `--jar` switch + +The last one overrides all the others. + +These are all switches: + +``` +usage: tikapp [-h] (-f FILE | -p PAYLOAD) [-j JAR] [-d] [-t] [-l] [-a] + [-v] + +Wrapper for Apache Tika App. + +optional arguments: + -h, --help show this help message and exit + -f FILE, --file FILE File to submit (default: None) + -p PAYLOAD, --payload PAYLOAD + Base64 payload to submit (default: None) + -j JAR, --jar JAR Apache Tika app JAR (default: None) + -d, --detect Detect document type (default: False) + -t, --text Output plain text content (default: False) + -l, --language Output only language (default: False) + -a, --all Output metadata and content from all embedded files + (default: False) + -v, --version show program's version number and exit +``` + +Example: + +```shell +$ tikapp -f example_file -a +``` + ## Performance tests -These are the results of performance tests in [profiling](https://github.com/fedelemantuano/tika-app-python/tree/develop/profiling) folder: +These are the results of performance tests in [tests](https://github.com/fedelemantuano/tika-app-python/tree/develop/tests) folder: ``` tika_content_type() 0.708108 sec diff --git a/requirements.txt b/requirements.txt index c457133..81ce69e 100644 --- a/requirements.txt +++ 
b/requirements.txt @@ -1,2 +1,3 @@ +chainmap==1.0.2 python-magic==0.4.12 -simplejson==3.8.2 +simplejson==3.10.0 diff --git a/setup.py b/setup.py index c9d6f75..dfed53d 100644 --- a/setup.py +++ b/setup.py @@ -1,18 +1,56 @@ #!/usr/bin/env python +# -*- coding: utf-8 -*- +""" +Copyright 2016 Fedele Mantuano (https://twitter.com/fedelemantuano) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +""" + +from os.path import join, dirname from distutils.core import setup +from tikapp import __versionstr__ + + +long_description = open(join(dirname(__file__), 'README')).read().strip() +requires = open(join(dirname(__file__), + 'requirements.txt')).read().splitlines() + setup( name='tika-app', - version='0.4', description='Python client for Apache Tika App', + license="Apache License, Version 2.0", + url='https://github.com/fedelemantuano/tika-app-python', + long_description=long_description, + version=__versionstr__, author='Fedele Mantuano', author_email='mantuano.fedele@gmail.com', maintainer='Fedele Mantuano', maintainer_email='mantuano.fedele@gmail.com', - url='https://github.com/fedelemantuano/tika-app-python', - keywords=['tika', 'apache', 'toolkit'], - requires=['simplejson'], - license="Apache License, Version 2.0", packages=['tikapp'], + platforms=["Linux", ], + keywords=['tika', 'apache', 'toolkit'], + classifiers=[ + "License :: OSI Approved :: Apache Software License", + "Intended Audience :: Developers", + "Operating System :: OS Independent", + "Programming Language :: Python", + 
"Programming Language :: Python :: 2", + "Programming Language :: Python :: 2.6", + "Programming Language :: Python :: 2.7", + ], + install_requires=requires, + entry_points={'console_scripts': [ + 'tikapp = tikapp.__main__:main']}, ) diff --git a/tests/test_tika_app.py b/tests/test_tika_app.py index 89e1eeb..d871928 100644 --- a/tests/test_tika_app.py +++ b/tests/test_tika_app.py @@ -40,38 +40,30 @@ class TestTikaApp(unittest.TestCase): def test_invalid_tika_app_jar(self): self.assertRaises( tika.InvalidTikaAppJar, - tika.TikaApp, - ) + tika.TikaApp) def test_invalid_switches(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") with self.assertRaises(tika.InvalidSwitches): tika_app.generic("--help") def test_generic(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") - self.assertIsInstance( - tika_app.generic(), - str, - ) + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") + self.assertIsInstance(tika_app.generic(), str) def test_invalid_parameters(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") with self.assertRaises(tika.InvalidParameters): - tika_app.extract_all_content( - file_path=None, - payload=None, - ) + tika_app.extract_all_content(file_path=None, payload=None) with self.assertRaises(tika.InvalidParameters): - tika_app.extract_all_content( - file_path=True, - payload=True, - ) + tika_app.extract_all_content(file_path=True, payload=True) def test_extract_content_from_file(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") + + self.assertEqual("/opt/tika/tika-app-1.13.jar", tika_app.file_jar) result = tika_app.extract_all_content(test_zip) self.assertIsInstance(result, str) @@ -82,43 +74,26 @@ def test_extract_content_from_file(self): 
self.assertEqual(result_obj[0]["Content-Type"], "application/zip") self.assertEqual( result_obj[1]["Content-Type"], - "text/plain; charset=ISO-8859-1" - ) - self.assertEqual( - result_obj[0]["resourceName"], - "test.zip" - ) - self.assertEqual( - result_obj[1]["resourceName"], - "test.txt" - ) + "text/plain; charset=ISO-8859-1") + self.assertEqual(result_obj[0]["resourceName"], "test.zip") + self.assertEqual(result_obj[1]["resourceName"], "test.txt") def test_extract_content_obj(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") result_obj = tika_app.extract_all_content( - file_path=test_zip, - convert_to_obj=True, - ) + file_path=test_zip, convert_to_obj=True) self.assertIsInstance(result_obj, list) self.assertEqual(len(result_obj), 2) self.assertEqual(result_obj[0]["Content-Type"], "application/zip") - self.assertEqual( - result_obj[1]["Content-Type"], - "text/plain; charset=ISO-8859-1" - ) - self.assertEqual( - result_obj[0]["resourceName"], - "test.zip" - ) - self.assertEqual( - result_obj[1]["resourceName"], - "test.txt" - ) + self.assertEqual(result_obj[1]["Content-Type"], + "text/plain; charset=ISO-8859-1") + self.assertEqual(result_obj[0]["resourceName"], "test.zip") + self.assertEqual(result_obj[1]["resourceName"], "test.txt") def test_extract_content_from_buffer(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") with open(test_zip, 'rb') as f: payload = f.read().encode("base64") @@ -132,33 +107,23 @@ def test_extract_content_from_buffer(self): result_file_obj = json.loads(result_file) result_payload_obj = json.loads(result_payload) - self.assertEqual( - result_file_obj[0]["Content-Type"], - result_payload_obj[0]["Content-Type"], - ) + self.assertEqual(result_file_obj[0]["Content-Type"], + result_payload_obj[0]["Content-Type"]) - self.assertEqual( - 
result_file_obj[1]["Content-Type"], - result_payload_obj[1]["Content-Type"], - ) + self.assertEqual(result_file_obj[1]["Content-Type"], + result_payload_obj[1]["Content-Type"]) - self.assertEqual( - result_file_obj[1]["resourceName"], - result_payload_obj[1]["resourceName"], - ) + self.assertEqual(result_file_obj[1]["resourceName"], + result_payload_obj[1]["resourceName"]) def test_language(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") - + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") result = tika_app.detect_language(file_path=test_txt) - self.assertEqual(result, "en") def test_extract_only_content(self): - tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.12.jar") - + tika_app = tika.TikaApp(file_jar="/opt/tika/tika-app-1.13.jar") result = tika_app.extract_only_content(file_path=test_txt) - self.assertIsInstance(result, str) self.assertIn("test", result) diff --git a/tikapp/__init__.py b/tikapp/__init__.py index e7cfa16..fa203c2 100644 --- a/tikapp/__init__.py +++ b/tikapp/__init__.py @@ -22,7 +22,7 @@ import os import tempfile from subprocess import Popen, PIPE, STDOUT -from exceptions import \ +from .exceptions import \ InvalidTikaAppJar, \ InvalidSwitches, \ InvalidFilePath, \ @@ -36,20 +36,21 @@ log = logging.getLogger(__name__) +VERSION = (0, 5, 0) +__version__ = VERSION +__versionstr__ = '.'.join(map(str, VERSION)) + class TikaApp(object): def __init__( self, file_jar=None, - memory_allocation=None, + memory_allocation=None ): - if not file_jar or not os.path.exists(file_jar): - log.exception("Invalid Tika app jar") - raise InvalidTikaAppJar("Invalid Tika app jar") - self._file_jar = file_jar - self._memory_allocation = memory_allocation + self.file_jar = file_jar + self.memory_allocation = memory_allocation def _write_payload(self, payload): """Write a base64 payload on temp file @@ -81,11 +82,9 @@ def _file_path(self, file_path=None, payload=None): file_ = file_path else: log.exception( - "Invalid 
parameters: you must pass file_path or payload" - ) + "Invalid parameters: you must pass file_path or payload") raise InvalidParameters( - "Invalid parameters: you must pass file_path or payload" - ) + "Invalid parameters: you must pass file_path or payload") if not os.path.exists(file_): log.exception("File {} does not exist".format(file_)) @@ -131,10 +130,22 @@ def _command_template(self, switches): def file_jar(self): return self._file_jar + @file_jar.setter + def file_jar(self, value): + if not value or not os.path.exists(value): + log.exception("Invalid Tika app jar") + raise InvalidTikaAppJar("Invalid Tika app jar") + + self._file_jar = value + @property def memory_allocation(self): return self._memory_allocation + @memory_allocation.setter + def memory_allocation(self, value): + self._memory_allocation = value + @property def help(self): return self._command_template(["--help"]) diff --git a/tikapp/__main__.py b/tikapp/__main__.py new file mode 100755 index 0000000..bf309bd --- /dev/null +++ b/tikapp/__main__.py @@ -0,0 +1,142 @@ +#!/usr/bin/env python +# -*- coding: utf-8 -*- + +""" +Copyright 2016 Fedele Mantuano (https://twitter.com/fedelemantuano) + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+""" + +import argparse +import os +import sys + +try: + from collections import ChainMap +except ImportError: + from chainmap import ChainMap + +current = os.path.realpath(os.path.dirname(__file__)) +root = os.path.join(current, '..') +sys.path.append(root) + +from tikapp import TikaApp, __versionstr__ + + +def get_args(): + parser = argparse.ArgumentParser( + description="Wrapper for Apache Tika App.", + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + + parsing_group = parser.add_mutually_exclusive_group(required=True) + parsing_group.add_argument( + "-f", + "--file", + dest="file", + help="File to submit") + parsing_group.add_argument( + "-p", + "--payload", + dest="payload", + help="Base64 payload to submit") + + parser.add_argument( + "-j", + "--jar", + dest="jar", + help="Apache Tika app JAR") + + parser.add_argument( + "-d", + "--detect", + dest="detect", + action="store_true", + help="Detect document type") + + parser.add_argument( + "-t", + "--text", + dest="text", + action="store_true", + help="Output plain text content") + + parser.add_argument( + "-l", + "--language", + dest="language", + action="store_true", + help="Output only language") + + parser.add_argument( + "-a", + "--all", + dest="all", + action="store_true", + help="Output metadata and content from all embedded files") + + parser.add_argument( + '-v', + '--version', + action='version', + version='%(prog)s {}'.format(__versionstr__)) + + return parser.parse_args() + + +def main(): + args = get_args() + + command_line = dict() + if args.jar: + command_line = {"TIKA_APP_JAR": args.jar} + + defaults = {"TIKA_APP_JAR": "/opt/tika/tika-app-1.13.jar"} + options = ChainMap(command_line, os.environ, defaults) + + tika = TikaApp(options['TIKA_APP_JAR']) + + if args.file: + f = args.file + + if args.detect: + print(tika.detect_content_type(file_path=f).encode('utf-8')) + + if args.text: + print(tika.extract_only_content(file_path=f).encode('utf-8')) + + if args.language: + 
print(tika.detect_language(file_path=f).encode('utf-8')) + + if args.all: + print(tika.extract_all_content( + file_path=f, pretty_print=True).encode('utf-8')) + + elif args.payload: + p = args.payload + + if args.detect: + print(tika.detect_content_type(payload=p).encode('utf-8')) + + if args.text: + print(tika.extract_only_content(payload=p).encode('utf-8')) + + if args.language: + print(tika.detect_language(payload=p).encode('utf-8')) + + if args.all: + print(tika.extract_all_content( + payload=p, pretty_print=True).encode('utf-8')) + + +if __name__ == '__main__': + main()