Simplifica definição de extensão de arquivo baixado (#1153)

Melhorias em #1045. Quando a extensão do arquivo não é informada no nome dele, tentamos identificá-la através dos headers do response ou através de seu conteúdo. Porém, devido à maneira como o Scrapy funciona, isso vai fazer com que o arquivo seja baixado novamente em novas execuções do spider. Para resolver esse problema seriam necessárias muitas mudanças na estrutura do Scrapy (incluindo a criação de novos FileStorage), o que tornaria o projeto mais complexo, com um ganho de performance irrelevante. Com essa modificação, identificamos a extensão e, se ela não vier por padrão, o arquivo será baixado novamente toda vez que reexecutarmos o spider (o que nunca acontece em produção para a data específica).
okfn-brasil · Jun 5, 2024 · 7da36d4 · 7da36d4
2 parents e60833f + 589029d
commit 7da36d4
Show file tree

Hide file tree

Showing 4 changed files with 35 additions and 4 deletions.
diff --git a/data_collection/gazette/pipelines.py b/data_collection/gazette/pipelines.py
@@ -1,6 +1,7 @@
 import datetime as dt
 from pathlib import Path
 
+import filetype
 from itemadapter import ItemAdapter
 from scrapy import spiderloader
 from scrapy.exceptions import DropItem
@@ -164,11 +165,34 @@ def item_completed(self, results, item, info):
 
     def file_path(self, request, response=None, info=None, item=None):
         """
-        Path to save the files, modified to organize the gazettes in directories.
-        The files will be under <territory_id>/<gazette date>/.
+        Path to save the files, modified to organize the gazettes in directories
+        and with the right file extension added.
+        The files will be under <territory_id>/<gazette date>/<filename>.
         """
-        filepath = super().file_path(request, response=response, info=info, item=item)
+        filepath = Path(
+            super().file_path(request, response=response, info=info, item=item)
+        )
         # The default path from the scrapy class begins with "full/". In this
         # class we replace that with the territory_id and gazette date.
-        filename = Path(filepath).name
+        filename = filepath.name
+
+        # The extension can only be detected once a response is available, and
+        # it is only needed when the default name carries no suffix.
+        if response is not None and not filepath.suffix:
+            filename = self._get_filename_with_extension(filename, response)
+
         return str(Path(item["territory_id"], item["date"], filename))
+
+    def _get_filename_with_extension(self, filename, response):
+        # Returns `filename` with a detected extension appended, or unchanged
+        # when no extension can be determined.
+        # The majority of the Gazettes are PDF files, so we first do the cheap
+        # check of the response Content-Type header before falling back to the
+        # more costly check with the filetype library.
+        # The comparison is exact, so a header carrying parameters (e.g. a
+        # charset) falls through to the filetype check below.
+        file_extension = (
+            ".pdf" if response.headers.get("Content-Type") == b"application/pdf" else ""
+        )
+
+        if not file_extension:
+            # Checks file extension from file header if possible
+            # NOTE(review): 261 is presumably the number of leading bytes that
+            # filetype needs to identify a file - confirm against its docs.
+            max_file_header_size = 261
+            file_kind = filetype.guess(response.body[:max_file_header_size])
+            file_extension = f".{file_kind.extension}" if file_kind is not None else ""
+
+        return f"{filename}{file_extension}"
diff --git a/data_collection/requirements-dev.txt b/data_collection/requirements-dev.txt
@@ -248,6 +248,9 @@ docutils==0.16 \
 filelock==3.12.4 \
     --hash=sha256:08c21d87ded6e2b9da6728c3dff51baf1dcecf973b768ef35bcbc3447edb9ad4 \
     --hash=sha256:2e6f249f1f3654291606e046b09f1fd5eac39b360664c27f5aad072012f8bcbd
+filetype==1.2.0 \
+    --hash=sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb \
+    --hash=sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25
 flake8==6.1.0 \
     --hash=sha256:d5b3857f07c030bdb5bf41c7f53799571d75c4491748a3adcd47de929e34cd23 \
     --hash=sha256:ffdfce58ea94c6580c77888a86506937f9a1a227dfcd15f245d694ae20a6b6e5

diff --git a/data_collection/requirements.in b/data_collection/requirements.in
@@ -3,6 +3,7 @@ boto3==1.24.89
 click
 chompjs
 dateparser
+filetype
 itemadapter
 jinja2
 psycopg2-binary

diff --git a/data_collection/requirements.txt b/data_collection/requirements.txt
@@ -216,6 +216,9 @@ docutils==0.16 \
 filelock==3.12.4 \
     --hash=sha256:08c21d87ded6e2b9da6728c3dff51baf1dcecf973b768ef35bcbc3447edb9ad4 \
     --hash=sha256:2e6f249f1f3654291606e046b09f1fd5eac39b360664c27f5aad072012f8bcbd
+filetype==1.2.0 \
+    --hash=sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb \
+    --hash=sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25
 fqdn==1.5.1 \
     --hash=sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f \
     --hash=sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014