From b1975e7c8739d303defdba557526b8b04fe58b4a Mon Sep 17 00:00:00 2001
From: Alex Silva
Date: Wed, 29 Nov 2023 21:15:59 -0300
Subject: [PATCH 1/3] Add the matching extension to the file being downloaded
 when Scrapy cannot detect the extension from the download URL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is an implementation (with adjustments) of the code suggested by
@ogecece in
https://github.com/okfn-brasil/querido-diario/pull/946#pullrequestreview-1656259871 .

To solve the problem that "in the cases where we force extension detection,
the file would always be downloaded again unnecessarily", the `stat_file`
method was overridden: when Scrapy cannot detect the extension from the
download URL, it also looks for an already stored file with an extension
before deciding whether the file needs to be downloaded again.

Resolves #819
---
 data_collection/gazette/pipelines.py | 50 ++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/data_collection/gazette/pipelines.py b/data_collection/gazette/pipelines.py
index 3def035f2..98db03e2a 100644
--- a/data_collection/gazette/pipelines.py
+++ b/data_collection/gazette/pipelines.py
@@ -1,12 +1,15 @@
 import datetime as dt
+from os import PathLike
 from pathlib import Path
+from typing import Union
 
+import filetype
 from itemadapter import ItemAdapter
 from scrapy import spiderloader
 from scrapy.exceptions import DropItem
 from scrapy.http import Request
 from scrapy.http.request import NO_CALLBACK
-from scrapy.pipelines.files import FilesPipeline
+from scrapy.pipelines.files import FilesPipeline, FSFilesStore
 from scrapy.settings import Settings
 from scrapy.utils import project
 from sqlalchemy.exc import SQLAlchemyError
@@ -128,6 +131,8 @@ class QueridoDiarioFilesPipeline(FilesPipeline):
     DEFAULT_FILES_REQUESTS_FIELD = "file_requests"
 
     def __init__(self, *args, settings=None, **kwargs):
+        self.STORE_SCHEMES[""] = QueridoDiarioFSFilesStore
+        self.STORE_SCHEMES["file"] = QueridoDiarioFSFilesStore
         super().__init__(*args, settings=settings, **kwargs)
 
         if isinstance(settings, dict) or settings is None:
@@ -167,8 +172,47 @@ def file_path(self, request, response=None, info=None, item=None):
         """
         Path to save the files, modified to organize the gazettes in directories.
         The files will be under <territory_id>/<date>/<filename>.
         """
-        filepath = super().file_path(request, response=response, info=info, item=item)
+        filepath = Path(
+            super().file_path(request, response=response, info=info, item=item)
+        )
         # The default path from the scrapy class begins with "full/". In this
         # class we replace that with the territory_id and gazette date.
-        filename = Path(filepath).name
+        filename = filepath.name
+
+        if not filepath.suffix and response is not None:
+            extension = self._detect_extension(response)
+            if extension:
+                filename += f".{extension}"
+
         return str(Path(item["territory_id"], item["date"], filename))
+
+    def _detect_extension(self, response):
+        """Checks file extension from file header if possible"""
+        max_file_header_size = 261
+        file_kind = filetype.guess(response.body[:max_file_header_size])
+        if file_kind is None:
+            # logger.warning(f"Unable to guess the file type from downloaded file {response}!")
+            return ""
+
+        return file_kind.extension
+
+
+class QueridoDiarioFSFilesStore(FSFilesStore):
+    def __init__(self, basedir: Union[str, PathLike]):
+        super().__init__(basedir)
+
+    def stat_file(self, path: Union[str, PathLike], info):
+        path_obj = Path(path)
+        if path_obj.suffix:
+            return super().stat_file(path, info)
+
+        path_with_ext = self._find_file(path_obj)
+        return super().stat_file(path_with_ext, info)
+
+    def _find_file(self, path):
+        """Finds a file with extension from a file path without extension"""
+        absolute_path = Path(self.basedir, path)
+        files = [p for p in absolute_path.parent.glob(f"{path.name}.*")]
+        if len(files) > 0:
+            return Path(path.parent, files[0].name)
+        return path

From 71f90b7e46489e0366ba8ca0552ed54d590b1f24 Mon Sep 17 00:00:00 2001
From: Alex Silva
Date: Wed, 29 Nov 2023 22:00:57 -0300
Subject: [PATCH 2/3] Add the 'filetype' library to the Querido Diário
 requirements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_collection/requirements-dev.txt | 3 +++
 data_collection/requirements.in      | 1 +
 data_collection/requirements.txt     | 3 +++
 3 files changed, 7 insertions(+)

diff --git a/data_collection/requirements-dev.txt b/data_collection/requirements-dev.txt
index faaf40216..345e90525 100644
--- a/data_collection/requirements-dev.txt
+++ b/data_collection/requirements-dev.txt
@@ -248,6 +248,9 @@ docutils==0.16 \
 filelock==3.12.4 \
     --hash=sha256:08c21d87ded6e2b9da6728c3dff51baf1dcecf973b768ef35bcbc3447edb9ad4 \
     --hash=sha256:2e6f249f1f3654291606e046b09f1fd5eac39b360664c27f5aad072012f8bcbd
+filetype==1.2.0 \
+    --hash=sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb \
+    --hash=sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25
 flake8==6.1.0 \
     --hash=sha256:d5b3857f07c030bdb5bf41c7f53799571d75c4491748a3adcd47de929e34cd23 \
     --hash=sha256:ffdfce58ea94c6580c77888a86506937f9a1a227dfcd15f245d694ae20a6b6e5
diff --git a/data_collection/requirements.in b/data_collection/requirements.in
index 41cd69666..899f4d4ba 100644
--- a/data_collection/requirements.in
+++ b/data_collection/requirements.in
@@ -3,6 +3,7 @@ boto3==1.24.89
 click
 chompjs
 dateparser
+filetype
 itemadapter
 jinja2
 psycopg2-binary
diff --git a/data_collection/requirements.txt b/data_collection/requirements.txt
index 848a635dd..41bb67513 100644
--- a/data_collection/requirements.txt
+++ b/data_collection/requirements.txt
@@ -216,6 +216,9 @@ docutils==0.16 \
 filelock==3.12.4 \
     --hash=sha256:08c21d87ded6e2b9da6728c3dff51baf1dcecf973b768ef35bcbc3447edb9ad4 \
     --hash=sha256:2e6f249f1f3654291606e046b09f1fd5eac39b360664c27f5aad072012f8bcbd
+filetype==1.2.0 \
+    --hash=sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb \
+    --hash=sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25
 fqdn==1.5.1 \
     --hash=sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f \
     --hash=sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014

From 589029d5c7c62a932d43015bb60d2aad64be8696 Mon Sep 17 00:00:00 2001
From: Renne Rocha
Date: Sun, 2 Jun 2024 09:47:44 -0300
Subject: [PATCH 3/3] Simplify how the downloaded file extension is determined
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the file extension is not part of the file name, we try to identify it
from the response headers or from the file content. However, because of the
way Scrapy works, this makes the file be downloaded again on every new run
of the spider.

Solving that would require many changes to Scrapy's internals (including
creating new FileStorage classes), which would make the project more complex
for an irrelevant performance gain.

With this change we still identify the extension, and when it is not present
by default the file is simply downloaded again whenever we re-run the spider
(which never happens in production for a specific date).
---
 data_collection/gazette/pipelines.py | 58 +++++++++-------------------
 1 file changed, 19 insertions(+), 39 deletions(-)

diff --git a/data_collection/gazette/pipelines.py b/data_collection/gazette/pipelines.py
index 98db03e2a..9b6f0843e 100644
--- a/data_collection/gazette/pipelines.py
+++ b/data_collection/gazette/pipelines.py
@@ -1,7 +1,5 @@
 import datetime as dt
-from os import PathLike
 from pathlib import Path
-from typing import Union
 
 import filetype
 from itemadapter import ItemAdapter
@@ -9,7 +7,7 @@
 from scrapy.exceptions import DropItem
 from scrapy.http import Request
 from scrapy.http.request import NO_CALLBACK
-from scrapy.pipelines.files import FilesPipeline, FSFilesStore
+from scrapy.pipelines.files import FilesPipeline
 from scrapy.settings import Settings
 from scrapy.utils import project
 from sqlalchemy.exc import SQLAlchemyError
@@ -131,8 +129,6 @@ class QueridoDiarioFilesPipeline(FilesPipeline):
     DEFAULT_FILES_REQUESTS_FIELD = "file_requests"
 
     def __init__(self, *args, settings=None, **kwargs):
-        self.STORE_SCHEMES[""] = QueridoDiarioFSFilesStore
-        self.STORE_SCHEMES["file"] = QueridoDiarioFSFilesStore
         super().__init__(*args, settings=settings, **kwargs)
 
         if isinstance(settings, dict) or settings is None:
@@ -169,8 +165,9 @@ def item_completed(self, results, item, info):
 
     def file_path(self, request, response=None, info=None, item=None):
         """
-        Path to save the files, modified to organize the gazettes in directories.
-        The files will be under <territory_id>/<date>/<filename>.
+        Path to save the files, modified to organize the gazettes in directories
+        and with the right file extension added.
+        The files will be under <territory_id>/<date>/<filename>.
         """
         filepath = Path(
             super().file_path(request, response=response, info=info, item=item)
@@ -179,40 +176,23 @@ def file_path(self, request, response=None, info=None, item=None):
         )
         # The default path from the scrapy class begins with "full/". In this
         # class we replace that with the territory_id and gazette date.
         filename = filepath.name
 
-        if not filepath.suffix and response is not None:
-            extension = self._detect_extension(response)
-            if extension:
-                filename += f".{extension}"
+        if response is not None and not filepath.suffix:
+            filename = self._get_filename_with_extension(filename, response)
 
         return str(Path(item["territory_id"], item["date"], filename))
 
-    def _detect_extension(self, response):
-        """Checks file extension from file header if possible"""
-        max_file_header_size = 261
-        file_kind = filetype.guess(response.body[:max_file_header_size])
-        if file_kind is None:
-            # logger.warning(f"Unable to guess the file type from downloaded file {response}!")
-            return ""
-
-        return file_kind.extension
-
-
-class QueridoDiarioFSFilesStore(FSFilesStore):
-    def __init__(self, basedir: Union[str, PathLike]):
-        super().__init__(basedir)
-
-    def stat_file(self, path: Union[str, PathLike], info):
-        path_obj = Path(path)
-        if path_obj.suffix:
-            return super().stat_file(path, info)
-
-        path_with_ext = self._find_file(path_obj)
-        return super().stat_file(path_with_ext, info)
-
-    def _find_file(self, path):
-        """Finds a file with extension from a file path without extension"""
-        absolute_path = Path(self.basedir, path)
-        files = [p for p in absolute_path.parent.glob(f"{path.name}.*")]
-        if len(files) > 0:
-            return Path(path.parent, files[0].name)
-        return path
+    def _get_filename_with_extension(self, filename, response):
+        # The majority of the Gazettes are PDF files, so we can check it
+        # faster validating document Content-Type before using a more costly
+        # check with filetype library
+        file_extension = (
+            ".pdf" if response.headers.get("Content-Type") == b"application/pdf" else ""
+        )
+
+        if not file_extension:
+            # Checks file extension from file header if possible
+            max_file_header_size = 261
+            file_kind = filetype.guess(response.body[:max_file_header_size])
+            file_extension = f".{file_kind.extension}" if file_kind is not None else ""
+
+        return f"{filename}{file_extension}"
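
Note for reviewers: the snippet below is a minimal standalone sketch of the
detection logic introduced in PATCH 3/3, not project code (the guess_extension
helper and the sample byte strings are illustrative assumptions). It mirrors
the approach above: trust an explicit application/pdf Content-Type first,
since most gazettes are PDFs, and only then sniff the first 261 bytes of the
body with the filetype library.

# sketch.py -- illustrative only, mirrors _get_filename_with_extension
import filetype


def guess_extension(content_type: bytes, body: bytes) -> str:
    """Return an extension such as '.pdf', or '' when it cannot be guessed."""
    # Fast path: an explicit PDF Content-Type avoids inspecting the body.
    if content_type == b"application/pdf":
        return ".pdf"

    # Fallback: sniff the magic bytes; filetype needs at most the first 261 bytes.
    max_file_header_size = 261
    file_kind = filetype.guess(body[:max_file_header_size])
    return f".{file_kind.extension}" if file_kind is not None else ""


if __name__ == "__main__":
    # "%PDF" is the magic prefix of PDF files, so the sniffing branch finds it.
    print(guess_extension(b"", b"%PDF-1.7 rest of the document"))   # -> .pdf
    print(guess_extension(b"application/pdf", b""))                 # -> .pdf
    print(guess_extension(b"", b"plain text, no known signature"))  # -> (empty)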