From b1975e7c8739d303defdba557526b8b04fe58b4a Mon Sep 17 00:00:00 2001
From: Alex Silva
Date: Wed, 29 Nov 2023 21:15:59 -0300
Subject: [PATCH 1/3] Add the matching extension to the file being downloaded
 when Scrapy cannot detect the extension from the download URL
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

This is an implementation (with adjustments) of the code suggested by
@ogecece in
https://github.com/okfn-brasil/querido-diario/pull/946#pullrequestreview-1656259871 .

To solve the problem that "in the cases where we force extension detection,
the file would always be downloaded again unnecessarily", the `stat_file`
method was overridden: when Scrapy cannot detect the extension from the
download URL, it also looks for an already stored file with an extension
before deciding whether the file needs to be downloaded again.

Resolves #819
---
 data_collection/gazette/pipelines.py | 50 ++++++++++++++++++++++++++--
 1 file changed, 47 insertions(+), 3 deletions(-)

diff --git a/data_collection/gazette/pipelines.py b/data_collection/gazette/pipelines.py
index 3def035f2..98db03e2a 100644
--- a/data_collection/gazette/pipelines.py
+++ b/data_collection/gazette/pipelines.py
@@ -1,12 +1,15 @@
 import datetime as dt
+from os import PathLike
 from pathlib import Path
+from typing import Union
 
+import filetype
 from itemadapter import ItemAdapter
 from scrapy import spiderloader
 from scrapy.exceptions import DropItem
 from scrapy.http import Request
 from scrapy.http.request import NO_CALLBACK
-from scrapy.pipelines.files import FilesPipeline
+from scrapy.pipelines.files import FilesPipeline, FSFilesStore
 from scrapy.settings import Settings
 from scrapy.utils import project
 from sqlalchemy.exc import SQLAlchemyError
@@ -128,6 +131,8 @@ class QueridoDiarioFilesPipeline(FilesPipeline):
     DEFAULT_FILES_REQUESTS_FIELD = "file_requests"
 
     def __init__(self, *args, settings=None, **kwargs):
+        self.STORE_SCHEMES[""] = QueridoDiarioFSFilesStore
+        self.STORE_SCHEMES["file"] = QueridoDiarioFSFilesStore
         super().__init__(*args, settings=settings, **kwargs)
 
         if isinstance(settings, dict) or settings is None:
@@ -167,8 +172,47 @@ def file_path(self, request, response=None, info=None, item=None):
         """
         Path to save the files, modified to organize the gazettes in directories.
         The files will be under <territory_id>/<date>/<filename>.
         """
-        filepath = super().file_path(request, response=response, info=info, item=item)
+        filepath = Path(
+            super().file_path(request, response=response, info=info, item=item)
+        )
         # The default path from the scrapy class begins with "full/". In this
         # class we replace that with the territory_id and gazette date.
-        filename = Path(filepath).name
+        filename = filepath.name
+
+        if not filepath.suffix and response is not None:
+            extension = self._detect_extension(response)
+            if extension:
+                filename += f".{extension}"
+
         return str(Path(item["territory_id"], item["date"], filename))
+
+    def _detect_extension(self, response):
+        """Checks file extension from file header if possible"""
+        max_file_header_size = 261
+        file_kind = filetype.guess(response.body[:max_file_header_size])
+        if file_kind is None:
+            # logger.warning(f"Unable to guess the file type from downloaded file {response}!")
+            return ""
+
+        return file_kind.extension
+
+
+class QueridoDiarioFSFilesStore(FSFilesStore):
+    def __init__(self, basedir: Union[str, PathLike]):
+        super().__init__(basedir)
+
+    def stat_file(self, path: Union[str, PathLike], info):
+        path_obj = Path(path)
+        if path_obj.suffix:
+            return super().stat_file(path, info)
+
+        path_with_ext = self._find_file(path_obj)
+        return super().stat_file(path_with_ext, info)
+
+    def _find_file(self, path):
+        """Finds a file with extension from a file path without extension"""
+        absolute_path = Path(self.basedir, path)
+        files = [p for p in absolute_path.parent.glob(f"{path.name}.*")]
+        if len(files) > 0:
+            return Path(path.parent, files[0].name)
+        return path

From 71f90b7e46489e0366ba8ca0552ed54d590b1f24 Mon Sep 17 00:00:00 2001
From: Alex Silva
Date: Wed, 29 Nov 2023 22:00:57 -0300
Subject: [PATCH 2/3] Add the 'filetype' library to the Querido Diário
 requirements
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_collection/requirements-dev.txt | 3 +++
 data_collection/requirements.in      | 1 +
 data_collection/requirements.txt     | 3 +++
 3 files changed, 7 insertions(+)

diff --git a/data_collection/requirements-dev.txt b/data_collection/requirements-dev.txt
index faaf40216..345e90525 100644
--- a/data_collection/requirements-dev.txt
+++ b/data_collection/requirements-dev.txt
@@ -248,6 +248,9 @@ docutils==0.16 \
 filelock==3.12.4 \
     --hash=sha256:08c21d87ded6e2b9da6728c3dff51baf1dcecf973b768ef35bcbc3447edb9ad4 \
     --hash=sha256:2e6f249f1f3654291606e046b09f1fd5eac39b360664c27f5aad072012f8bcbd
+filetype==1.2.0 \
+    --hash=sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb \
+    --hash=sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25
 flake8==6.1.0 \
     --hash=sha256:d5b3857f07c030bdb5bf41c7f53799571d75c4491748a3adcd47de929e34cd23 \
     --hash=sha256:ffdfce58ea94c6580c77888a86506937f9a1a227dfcd15f245d694ae20a6b6e5
diff --git a/data_collection/requirements.in b/data_collection/requirements.in
index 41cd69666..899f4d4ba 100644
--- a/data_collection/requirements.in
+++ b/data_collection/requirements.in
@@ -3,6 +3,7 @@ boto3==1.24.89
 click
 chompjs
 dateparser
+filetype
 itemadapter
 jinja2
 psycopg2-binary
diff --git a/data_collection/requirements.txt b/data_collection/requirements.txt
index 848a635dd..41bb67513 100644
--- a/data_collection/requirements.txt
+++ b/data_collection/requirements.txt
@@ -216,6 +216,9 @@ docutils==0.16 \
 filelock==3.12.4 \
     --hash=sha256:08c21d87ded6e2b9da6728c3dff51baf1dcecf973b768ef35bcbc3447edb9ad4 \
     --hash=sha256:2e6f249f1f3654291606e046b09f1fd5eac39b360664c27f5aad072012f8bcbd
+filetype==1.2.0 \
+    --hash=sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb \
+    --hash=sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25
 fqdn==1.5.1 \
     --hash=sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f \
     --hash=sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014

From 589029d5c7c62a932d43015bb60d2aad64be8696 Mon Sep 17 00:00:00 2001
From: Renne Rocha
Date: Sun, 2 Jun 2024 09:47:44 -0300
Subject: [PATCH 3/3] Simplify how the downloaded file extension is determined
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the file extension is not part of the file name, we try to identify it
from the response headers or from the file content. However, because of the
way Scrapy works, this makes the file be downloaded again on every new run
of the spider.

Solving that would require many changes to Scrapy's internals (including
creating new FileStorage classes), which would make the project more complex
for an irrelevant performance gain.

With this change we still identify the extension, and when it is not present
by default the file is simply downloaded again whenever we re-run the spider
(which never happens in production for a specific date).
---
 data_collection/gazette/pipelines.py | 58 +++++++++-------------------
 1 file changed, 19 insertions(+), 39 deletions(-)

diff --git a/data_collection/gazette/pipelines.py b/data_collection/gazette/pipelines.py
index 98db03e2a..9b6f0843e 100644
--- a/data_collection/gazette/pipelines.py
+++ b/data_collection/gazette/pipelines.py
@@ -1,7 +1,5 @@
 import datetime as dt
-from os import PathLike
 from pathlib import Path
-from typing import Union
 
 import filetype
 from itemadapter import ItemAdapter
@@ -9,7 +7,7 @@
 from scrapy.exceptions import DropItem
 from scrapy.http import Request
 from scrapy.http.request import NO_CALLBACK
-from scrapy.pipelines.files import FilesPipeline, FSFilesStore
+from scrapy.pipelines.files import FilesPipeline
 from scrapy.settings import Settings
 from scrapy.utils import project
 from sqlalchemy.exc import SQLAlchemyError
@@ -131,8 +129,6 @@ class QueridoDiarioFilesPipeline(FilesPipeline):
     DEFAULT_FILES_REQUESTS_FIELD = "file_requests"
 
     def __init__(self, *args, settings=None, **kwargs):
-        self.STORE_SCHEMES[""] = QueridoDiarioFSFilesStore
-        self.STORE_SCHEMES["file"] = QueridoDiarioFSFilesStore
         super().__init__(*args, settings=settings, **kwargs)
 
         if isinstance(settings, dict) or settings is None:
@@ -169,8 +165,9 @@ def item_completed(self, results, item, info):
 
     def file_path(self, request, response=None, info=None, item=None):
         """
-        Path to save the files, modified to organize the gazettes in directories.
-        The files will be under <territory_id>/<date>/<filename>.
+        Path to save the files, modified to organize the gazettes in directories
+        and with the right file extension added.
+        The files will be under <territory_id>/<date>/<filename>.
         """
         filepath = Path(
             super().file_path(request, response=response, info=info, item=item)
@@ -179,40 +176,23 @@ def file_path(self, request, response=None, info=None, item=None):
         )
         # The default path from the scrapy class begins with "full/". In this
         # class we replace that with the territory_id and gazette date.
         filename = filepath.name
 
-        if not filepath.suffix and response is not None:
-            extension = self._detect_extension(response)
-            if extension:
-                filename += f".{extension}"
+        if response is not None and not filepath.suffix:
+            filename = self._get_filename_with_extension(filename, response)
 
         return str(Path(item["territory_id"], item["date"], filename))
 
-    def _detect_extension(self, response):
-        """Checks file extension from file header if possible"""
-        max_file_header_size = 261
-        file_kind = filetype.guess(response.body[:max_file_header_size])
-        if file_kind is None:
-            # logger.warning(f"Unable to guess the file type from downloaded file {response}!")
-            return ""
-
-        return file_kind.extension
-
-
-class QueridoDiarioFSFilesStore(FSFilesStore):
-    def __init__(self, basedir: Union[str, PathLike]):
-        super().__init__(basedir)
-
-    def stat_file(self, path: Union[str, PathLike], info):
-        path_obj = Path(path)
-        if path_obj.suffix:
-            return super().stat_file(path, info)
-
-        path_with_ext = self._find_file(path_obj)
-        return super().stat_file(path_with_ext, info)
-
-    def _find_file(self, path):
-        """Finds a file with extension from a file path without extension"""
-        absolute_path = Path(self.basedir, path)
-        files = [p for p in absolute_path.parent.glob(f"{path.name}.*")]
-        if len(files) > 0:
-            return Path(path.parent, files[0].name)
-        return path
+    def _get_filename_with_extension(self, filename, response):
+        # The majority of the Gazettes are PDF files, so we can check it
+        # faster validating document Content-Type before using a more costly
+        # check with filetype library
+        file_extension = (
+            ".pdf" if response.headers.get("Content-Type") == b"application/pdf" else ""
+        )
+
+        if not file_extension:
+            # Checks file extension from file header if possible
+            max_file_header_size = 261
+            file_kind = filetype.guess(response.body[:max_file_header_size])
+            file_extension = f".{file_kind.extension}" if file_kind is not None else ""
+
+        return f"{filename}{file_extension}"
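
Note for reviewers: the snippet below is a minimal standalone sketch of the
detection logic introduced in PATCH 3/3, not project code (the guess_extension
helper and the sample byte strings are illustrative assumptions). It mirrors
the approach above: trust an explicit application/pdf Content-Type first,
since most gazettes are PDFs, and only then sniff the first 261 bytes of the
body with the filetype library.

# sketch.py -- illustrative only, mirrors _get_filename_with_extension
import filetype


def guess_extension(content_type: bytes, body: bytes) -> str:
    """Return an extension such as '.pdf', or '' when it cannot be guessed."""
    # Fast path: an explicit PDF Content-Type avoids inspecting the body.
    if content_type == b"application/pdf":
        return ".pdf"

    # Fallback: sniff the magic bytes; filetype needs at most the first 261 bytes.
    max_file_header_size = 261
    file_kind = filetype.guess(body[:max_file_header_size])
    return f".{file_kind.extension}" if file_kind is not None else ""


if __name__ == "__main__":
    # "%PDF" is the magic prefix of PDF files, so the sniffing branch finds it.
    print(guess_extension(b"", b"%PDF-1.7 rest of the document"))   # -> .pdf
    print(guess_extension(b"application/pdf", b""))                 # -> .pdf
    print(guess_extension(b"", b"plain text, no known signature"))  # -> (empty)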