Simplifica definição de extensão de arquivo baixado

Quando a extensão do arquivo não é informada no nome dele, tentamos identificá-la através dos headers do response ou através de seu conteúdo. Porém, devido à maneira como o Scrapy funciona, isso vai fazer com que o arquivo seja baixado novamente em novas execuções do spider. Para resolver esse problema seriam necessárias muitas mudanças na estrutura do Scrapy (incluindo a criação de novos FileStorage), o que tornaria o projeto mais complexo, com um ganho de performance irrelevante. Com essa modificação, identificamos a extensão e, se ela não vier por padrão, o arquivo será baixado novamente toda vez que reexecutarmos o spider (o que nunca acontece em produção para a data específica).
okfn-brasil · Jun 5, 2024 · 589029d · 589029d
1 parent 71f90b7
commit 589029d
Showing 1 changed file with 19 additions and 39 deletions.
diff --git a/data_collection/gazette/pipelines.py b/data_collection/gazette/pipelines.py
@@ -1,15 +1,13 @@
 import datetime as dt
-from os import PathLike
 from pathlib import Path
-from typing import Union
 
 import filetype
 from itemadapter import ItemAdapter
 from scrapy import spiderloader
 from scrapy.exceptions import DropItem
 from scrapy.http import Request
 from scrapy.http.request import NO_CALLBACK
-from scrapy.pipelines.files import FilesPipeline, FSFilesStore
+from scrapy.pipelines.files import FilesPipeline
 from scrapy.settings import Settings
 from scrapy.utils import project
 from sqlalchemy.exc import SQLAlchemyError
@@ -131,8 +129,6 @@ class QueridoDiarioFilesPipeline(FilesPipeline):
     DEFAULT_FILES_REQUESTS_FIELD = "file_requests"
 
     def __init__(self, *args, settings=None, **kwargs):
-        self.STORE_SCHEMES[""] = QueridoDiarioFSFilesStore
-        self.STORE_SCHEMES["file"] = QueridoDiarioFSFilesStore
         super().__init__(*args, settings=settings, **kwargs)
 
         if isinstance(settings, dict) or settings is None:
@@ -169,8 +165,9 @@ def item_completed(self, results, item, info):
 
     def file_path(self, request, response=None, info=None, item=None):
         """
-        Path to save the files, modified to organize the gazettes in directories.
-        The files will be under <territory_id>/<gazette date>/.
+        Path to save the files, modified to organize the gazettes in directories
+        and with the right file extension added.
+        The files will be under <territory_id>/<gazette date>/<filename>.
         """
         filepath = Path(
             super().file_path(request, response=response, info=info, item=item)
@@ -179,40 +176,23 @@ def file_path(self, request, response=None, info=None, item=None):
         # class we replace that with the territory_id and gazette date.
         filename = filepath.name
 
-        if not filepath.suffix and response is not None:
-            extension = self._detect_extension(response)
-            if extension:
-                filename += f".{extension}"
+        if response is not None and not filepath.suffix:
+            filename = self._get_filename_with_extension(filename, response)
 
         return str(Path(item["territory_id"], item["date"], filename))
 
-    def _detect_extension(self, response):
-        """Checks file extension from file header if possible"""
-        max_file_header_size = 261
-        file_kind = filetype.guess(response.body[:max_file_header_size])
-        if file_kind is None:
-            # logger.warning(f"Unable to guess the file type from downloaded file {response}!")
-            return ""
-
-        return file_kind.extension
-
-
-class QueridoDiarioFSFilesStore(FSFilesStore):
-    def __init__(self, basedir: Union[str, PathLike]):
-        super().__init__(basedir)
-
-    def stat_file(self, path: Union[str, PathLike], info):
-        path_obj = Path(path)
-        if path_obj.suffix:
-            return super().stat_file(path, info)
+    def _get_filename_with_extension(self, filename, response):
+        # The majority of the Gazettes are PDF files, so we can check it
+        # faster validating document Content-Type before using a more costly
+        # check with filetype library
+        file_extension = (
+            ".pdf" if response.headers.get("Content-Type") == b"application/pdf" else ""
+        )
 
-        path_with_ext = self._find_file(path_obj)
-        return super().stat_file(path_with_ext, info)
+        if not file_extension:
+            # Checks file extension from file header if possible
+            max_file_header_size = 261
+            file_kind = filetype.guess(response.body[:max_file_header_size])
+            file_extension = f".{file_kind.extension}" if file_kind is not None else ""
 
-    def _find_file(self, path):
-        """Finds a file with extension from a file path without extension"""
-        absolute_path = Path(self.basedir, path)
-        files = [p for p in absolute_path.parent.glob(f"{path.name}.*")]
-        if len(files) > 0:
-            return Path(path.parent, files[0].name)
-        return path
+        return f"{filename}{file_extension}"