Skip to content

Commit

Permalink
Simplifica definição de extensão de arquivo baixado
Browse files Browse the repository at this point in the history
Quando a extensão do arquivo não é informada no nome dele, tentamos
identificá-la através dos headers do response ou através de seu
conteúdo. Porém, devido à maneira como o Scrapy funciona, isso vai fazer
com que o arquivo seja baixado novamente em novas execuções do spider.
Para resolver esse problema seriam necessárias muitas mudanças na
estrutura do Scrapy (incluindo a criação de novos FileStorage), o que
tornaria o projeto mais complexo, com um ganho de performance
irrelevante.

Com essa modificação, identificamos a extensão e, se ela não vier por
padrão, o arquivo será baixado novamente toda vez que reexecutarmos o
spider (o que nunca acontece em produção para a data específica).
  • Loading branch information
rennerocha authored and ogecece committed Jun 5, 2024
1 parent 71f90b7 commit 589029d
Showing 1 changed file with 19 additions and 39 deletions.
58 changes: 19 additions & 39 deletions data_collection/gazette/pipelines.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,13 @@
import datetime as dt
from os import PathLike
from pathlib import Path
from typing import Union

import filetype
from itemadapter import ItemAdapter
from scrapy import spiderloader
from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
from scrapy.pipelines.files import FilesPipeline, FSFilesStore
from scrapy.pipelines.files import FilesPipeline
from scrapy.settings import Settings
from scrapy.utils import project
from sqlalchemy.exc import SQLAlchemyError
Expand Down Expand Up @@ -131,8 +129,6 @@ class QueridoDiarioFilesPipeline(FilesPipeline):
DEFAULT_FILES_REQUESTS_FIELD = "file_requests"

def __init__(self, *args, settings=None, **kwargs):
self.STORE_SCHEMES[""] = QueridoDiarioFSFilesStore
self.STORE_SCHEMES["file"] = QueridoDiarioFSFilesStore
super().__init__(*args, settings=settings, **kwargs)

if isinstance(settings, dict) or settings is None:
Expand Down Expand Up @@ -169,8 +165,9 @@ def item_completed(self, results, item, info):

def file_path(self, request, response=None, info=None, item=None):
"""
Path to save the files, modified to organize the gazettes in directories.
The files will be under <territory_id>/<gazette date>/.
Path to save the files, modified to organize the gazettes in directories
and with the right file extension added.
The files will be under <territory_id>/<gazette date>/<filename>.
"""
filepath = Path(
super().file_path(request, response=response, info=info, item=item)
Expand All @@ -179,40 +176,23 @@ def file_path(self, request, response=None, info=None, item=None):
# class we replace that with the territory_id and gazette date.
filename = filepath.name

if not filepath.suffix and response is not None:
extension = self._detect_extension(response)
if extension:
filename += f".{extension}"
if response is not None and not filepath.suffix:
filename = self._get_filename_with_extension(filename, response)

return str(Path(item["territory_id"], item["date"], filename))

def _detect_extension(self, response):
"""Checks file extension from file header if possible"""
max_file_header_size = 261
file_kind = filetype.guess(response.body[:max_file_header_size])
if file_kind is None:
# logger.warning(f"Unable to guess the file type from downloaded file {response}!")
return ""

return file_kind.extension


class QueridoDiarioFSFilesStore(FSFilesStore):
def __init__(self, basedir: Union[str, PathLike]):
super().__init__(basedir)

def stat_file(self, path: Union[str, PathLike], info):
path_obj = Path(path)
if path_obj.suffix:
return super().stat_file(path, info)
def _get_filename_with_extension(self, filename, response):
# Most gazettes are PDF files, so we first do a cheap check of the
# response's Content-Type header before falling back to the more costly
# content-based detection done by the filetype library
file_extension = (
".pdf" if response.headers.get("Content-Type") == b"application/pdf" else ""
)

path_with_ext = self._find_file(path_obj)
return super().stat_file(path_with_ext, info)
if not file_extension:
# Checks file extension from file header if possible
max_file_header_size = 261
file_kind = filetype.guess(response.body[:max_file_header_size])
file_extension = f".{file_kind.extension}" if file_kind is not None else ""

def _find_file(self, path):
"""Finds a file with extension from a file path without extension"""
absolute_path = Path(self.basedir, path)
files = [p for p in absolute_path.parent.glob(f"{path.name}.*")]
if len(files) > 0:
return Path(path.parent, files[0].name)
return path
return f"{filename}{file_extension}"

0 comments on commit 589029d

Please sign in to comment.