Skip to content

Commit

Permalink
Simplifica definição de extensão de arquivo baixado (#1153)
Browse files Browse the repository at this point in the history
Melhorias em #1045
    
    Quando a extensão do arquivo não é informada no nome dele, tentamos
    identificar ela através dos headers do response ou através de seu
conteúdo. Porém, devido à maneira como o Scrapy funciona, isso vai fazer
com que o arquivo seja baixado novamente em novas execuções do spider.
    Para resolver esse problema seriam necessárias muitas mudanças na
    estrutura do Scrapy (incluindo a criação de novos FileStorage), o que
    tornaria o projeto mais complexo, com um ganho de performance
    irrelevante.
    
Com essa modificação, identificamos a extensão, e se ela não vier por
padrão, o arquivo será baixado novamente toda vez que reexecutarmos o
    spider (o que nunca acontece em produção para a data específica)
  • Loading branch information
ogecece authored Jun 5, 2024
2 parents e60833f + 589029d commit 7da36d4
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 4 deletions.
32 changes: 28 additions & 4 deletions data_collection/gazette/pipelines.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import datetime as dt
from pathlib import Path

import filetype
from itemadapter import ItemAdapter
from scrapy import spiderloader
from scrapy.exceptions import DropItem
Expand Down Expand Up @@ -164,11 +165,34 @@ def item_completed(self, results, item, info):

def file_path(self, request, response=None, info=None, item=None):
    """
    Path to save the files, modified to organize the gazettes in directories
    and with the right file extension added.
    The files will be under <territory_id>/<gazette date>/<filename>.

    When the name produced by the parent class has no suffix and a response
    is available, an extension is appended through
    ``_get_filename_with_extension``. Without a response the name is kept
    as the parent class produced it.
    """
    # Call the parent implementation once and keep the result as a Path so
    # both the name and the suffix can be inspected.
    filepath = Path(
        super().file_path(request, response=response, info=info, item=item)
    )
    # The default path from the scrapy class begins with "full/". In this
    # class we replace that with the territory_id and gazette date.
    filename = filepath.name

    if response is not None and not filepath.suffix:
        filename = self._get_filename_with_extension(filename, response)

    return str(Path(item["territory_id"], item["date"], filename))

def _get_filename_with_extension(self, filename, response):
# The majority of the Gazettes are PDF files, so we can check it
# faster validating document Content-Type before using a more costly
# check with filetype library
file_extension = (
".pdf" if response.headers.get("Content-Type") == b"application/pdf" else ""
)

if not file_extension:
# Checks file extension from file header if possible
max_file_header_size = 261
file_kind = filetype.guess(response.body[:max_file_header_size])
file_extension = f".{file_kind.extension}" if file_kind is not None else ""

return f"{filename}{file_extension}"
3 changes: 3 additions & 0 deletions data_collection/requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -248,6 +248,9 @@ docutils==0.16 \
filelock==3.12.4 \
--hash=sha256:08c21d87ded6e2b9da6728c3dff51baf1dcecf973b768ef35bcbc3447edb9ad4 \
--hash=sha256:2e6f249f1f3654291606e046b09f1fd5eac39b360664c27f5aad072012f8bcbd
filetype==1.2.0 \
--hash=sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb \
--hash=sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25
flake8==6.1.0 \
--hash=sha256:d5b3857f07c030bdb5bf41c7f53799571d75c4491748a3adcd47de929e34cd23 \
--hash=sha256:ffdfce58ea94c6580c77888a86506937f9a1a227dfcd15f245d694ae20a6b6e5
Expand Down
1 change: 1 addition & 0 deletions data_collection/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@ boto3==1.24.89
click
chompjs
dateparser
filetype
itemadapter
jinja2
psycopg2-binary
Expand Down
3 changes: 3 additions & 0 deletions data_collection/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -216,6 +216,9 @@ docutils==0.16 \
filelock==3.12.4 \
--hash=sha256:08c21d87ded6e2b9da6728c3dff51baf1dcecf973b768ef35bcbc3447edb9ad4 \
--hash=sha256:2e6f249f1f3654291606e046b09f1fd5eac39b360664c27f5aad072012f8bcbd
filetype==1.2.0 \
--hash=sha256:66b56cd6474bf41d8c54660347d37afcc3f7d1970648de365c102ef77548aadb \
--hash=sha256:7ce71b6880181241cf7ac8697a2f1eb6a8bd9b429f7ad6d27b8db9ba5f1c2d25
fqdn==1.5.1 \
--hash=sha256:105ed3677e767fb5ca086a0c1f4bb66ebc3c100be518f0e0d755d9eae164d89f \
--hash=sha256:3a179af3761e4df6eb2e026ff9e1a3033d3587bf980a0b1b2e1e5d08d7358014
Expand Down

0 comments on commit 7da36d4

Please sign in to comment.