diff --git a/data_collection/gazette/pipelines.py b/data_collection/gazette/pipelines.py
index 3def035f2..9b6f0843e 100644
--- a/data_collection/gazette/pipelines.py
+++ b/data_collection/gazette/pipelines.py
@@ -1,6 +1,7 @@
 import datetime as dt
 from pathlib import Path
 
+import filetype
 from itemadapter import ItemAdapter
 from scrapy import spiderloader
 from scrapy.exceptions import DropItem
@@ -164,11 +165,34 @@ def item_completed(self, results, item, info):
 
     def file_path(self, request, response=None, info=None, item=None):
         """
-        Path to save the files, modified to organize the gazettes in directories.
-        The files will be under <territory_id>/<date>/<filename>.
+        Path to save the files, modified to organize the gazettes in directories
+        and with the right file extension added.
+        The files will be under <territory_id>/<date>/<filename>.
         """
-        filepath = super().file_path(request, response=response, info=info, item=item)
+        filepath = Path(
+            super().file_path(request, response=response, info=info, item=item)
+        )
         # The default path from the scrapy class begins with "full/". In this
         # class we replace that with the territory_id and gazette date.
-        filename = Path(filepath).name
+        filename = filepath.name
+
+        if response is not None and not filepath.suffix:
+            filename = self._get_filename_with_extension(filename, response)
+
         return str(Path(item["territory_id"], item["date"], filename))
+
+    def _get_filename_with_extension(self, filename, response):
+        # Most gazettes are PDF files, so checking the document's
+        # Content-Type header first is faster than the more costly
+        # check with the filetype library
+        file_extension = (
+            ".pdf" if response.headers.get("Content-Type") == b"application/pdf" else ""
+        )
+
+        if not file_extension:
+            # Fall back to guessing the extension from the file header
+            max_file_header_size = 261
+            file_kind = filetype.guess(response.body[:max_file_header_size])
+            file_extension = f".{file_kind.extension}" if file_kind is not None else ""
+
+        return f"{filename}{file_extension}"
diff --git a/data_collection/requirements-dev.txt b/data_collection/requirements-dev.txt
index faaf40216..345e90525 100644
--- a/data_collection/requirements-dev.txt
+++ b/data_collection/requirements-dev.txt
@@ -248,6 +248,9 @@ docutils==0.16 \
 filelock==3.12.4 \
     --hash=sha256:08c21d87ded6e2b9da6728c3dff51baf1dcecf973b768ef35bcbc3447edb9ad4 \
     --hash=sha256:2e6f249f1f3654291606e046b09f1fd5eac39b360664c27f5aad072012f8bcbd
+filetype==1.2.0 \
+    --hash=sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb \
+    --hash=sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25
 flake8==6.1.0 \
     --hash=sha256:d5b3857f07c030bdb5bf41c7f53799571d75c4491748a3adcd47de929e34cd23 \
     --hash=sha256:ffdfce58ea94c6580c77888a86506937f9a1a227dfcd15f245d694ae20a6b6e5
diff --git a/data_collection/requirements.in b/data_collection/requirements.in
index 41cd69666..899f4d4ba 100644
--- a/data_collection/requirements.in
+++ b/data_collection/requirements.in
@@ -3,6 +3,7 @@ boto3==1.24.89
 click
 chompjs
 dateparser
+filetype
 itemadapter
 jinja2
 psycopg2-binary
diff --git a/data_collection/requirements.txt b/data_collection/requirements.txt
index 848a635dd..41bb67513 100644
--- a/data_collection/requirements.txt
+++ b/data_collection/requirements.txt
@@ -216,6 +216,9 @@ docutils==0.16 \
 filelock==3.12.4 \
     --hash=sha256:08c21d87ded6e2b9da6728c3dff51baf1dcecf973b768ef35bcbc3447edb9ad4 \
     --hash=sha256:2e6f249f1f3654291606e046b09f1fd5eac39b360664c27f5aad072012f8bcbd
+filetype==1.2.0 \
+    --hash=sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb \
+    --hash=sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25
 fqdn==1.5.1 \
     --hash=sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f \
     --hash=sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014
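
Note on the detection logic in _get_filename_with_extension above: it tries the cheap Content-Type header comparison first and only falls back to filetype's magic-byte sniffing when the header does not claim a PDF. The same order of checks can be reproduced outside Scrapy with the short sketch below; the requests download, URL, and variable names are illustrative assumptions, and only the filetype.guess call mirrors the patch.

    import filetype
    import requests

    # Hypothetical gazette download; in the pipeline this is a Scrapy response.
    resp = requests.get("https://example.com/gazette")

    # Fast path: most gazettes are PDFs, so trust the Content-Type header first.
    if resp.headers.get("Content-Type") == "application/pdf":
        extension = ".pdf"
    else:
        # Fallback: filetype only needs the first 261 bytes of the body
        # to identify a file by its magic numbers.
        kind = filetype.guess(resp.content[:261])
        extension = f".{kind.extension}" if kind is not None else ""

    print(f"detected extension: {extension!r}")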