Campos dos Goytacazes-RJ spider refactor
The spider's previous implementation assumed that there could be only a single file_url per day per is_extra_edition value, which was not always true. This refactoring gathers all the files for each day and is_extra_edition value. The existing code also did not recognize the text format that marks Saturday gazettes as extra editions. We additionally included start_date and end_date handling. Resolves okfn-brasil#637
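The grouping this refactoring introduces can be sketched in isolation as follows (a minimal illustration of the data structure, not the spider itself; the dates and URLs here are hypothetical):

from datetime import date

collected_data_by_date = {}
rows = [
    # two files published for the same date, neither one an extra edition
    (date(2022, 5, 31), False, "https://example.com/edicao-1100.pdf"),
    (date(2022, 5, 31), False, "https://example.com/edicao-1100-caderno-2.pdf"),
]
for gazette_date, is_extra_edition, file_url in rows:
    collected_data_by_date.setdefault(gazette_date, {}).setdefault(
        is_extra_edition, []
    ).append(file_url)

# collected_data_by_date[date(2022, 5, 31)][False] now lists both URLs,
# so a single Gazette item can carry all of a day's files in file_urls.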
Alex Harano committed Oct 5, 2022
1 parent d04b95c, commit d6cc9e0
Showing 2 changed files with 239 additions and 62 deletions.
data_collection/gazette/spiders/rj_campos_dos_goytacazes.py (239 additions, 0 deletions)
@@ -0,0 +1,239 @@
import re
from datetime import date, datetime, timedelta

import dateparser
from scrapy import Request

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class RjCamposDosGoytacazesSpider(BaseGazetteSpider):
    TERRITORY_ID = "3301009"

    allowed_domains = ["www.campos.rj.gov.br"]
    name = "rj_campos_dos_goytacazes"
    BASE_URL = (
        "https://www.campos.rj.gov.br/diario-oficial.php"
        "?PGpagina={PAGE_NUMBER}&PGporPagina=15"
    )
    # the current gazette system only allows at most this number of rows per page,
    # even when explicitly requesting more than that
    MAX_ROWS_PER_PAGE = 15

    start_date = date(2010, 6, 10)

    def __init__(self, *args, **kwargs):
        super(RjCamposDosGoytacazesSpider, self).__init__(*args, **kwargs)
        self.collected_data_by_date = {}

    def _calculate_tentative_page_number_associated_with_end_date(self) -> int:
        """Determine the page number at which the end_date gazette might be.

        Facts behind the design of this method:

        - The first page of the pagination contains the most recent gazette.
        - We consider that most Saturdays and Sundays have no gazette.
          Exception example:
              Diário Oficial Eletrônico de 14 de Agosto de 2021 - Edição Extra
        - Even though the number of rows for the other days may vary from
          zero to more than one, we consider that each non-Saturday,
          non-Sunday day has exactly one gazette.
        - Given how these conditions may vary, such as a day having no
          gazette or having multiple rows, we only assume tentatively that
          the target end_date gazette is available on the calculated page
          number or on one before it.

        The heuristic: count the non-Saturday, non-Sunday days from the day
        this method runs back to the target end_date, then integer-divide
        that count by the maximum number of rows per page; the result is the
        chosen page number. The calculated number is replaced only when it
        is less than one: in that case it becomes 1, as the page numbering
        begins at 1.

        It returns a non-zero positive int.
        """
        today: date = datetime.today().date()

        if today <= self.end_date:
            return 1

        non_saturday_nor_sunday_day_count: int = 0
        current_day: date = self.end_date
        one_day_timedelta: timedelta = timedelta(days=1)
        saturday_and_sunday_set = {
            6,  # Saturday
            7,  # Sunday
        }
        while current_day <= today:
            if current_day.isoweekday() not in saturday_and_sunday_set:
                non_saturday_nor_sunday_day_count += 1
            current_day = current_day + one_day_timedelta

        self.logger.info(
            f"Number of non-Saturday and non-Sunday days from {self.end_date} to"
            f" {today}, inclusive: {non_saturday_nor_sunday_day_count}"
        )

        tentative_page_number_associated_with_end_date: int = (
            non_saturday_nor_sunday_day_count // self.MAX_ROWS_PER_PAGE
        )

        if tentative_page_number_associated_with_end_date < 1:
            tentative_page_number_associated_with_end_date = 1

        return tentative_page_number_associated_with_end_date

    def start_requests(self):
        tentative_page_number_associated_with_end_date: int = (
            self._calculate_tentative_page_number_associated_with_end_date()
        )
        self.logger.info(
            f"Tentative page number for {self.end_date} calculated"
            f" as {tentative_page_number_associated_with_end_date}"
        )

        initial_url = self.BASE_URL.format(
            PAGE_NUMBER=tentative_page_number_associated_with_end_date
        )
        yield Request(
            url=initial_url,
            callback=self.per_page,
            cb_kwargs={
                "searching_for_end_date": True,
                "current_page_number": tentative_page_number_associated_with_end_date,
            },
        )

    def _extract_date_from_gazette_text(self, gazette_text):
        date_re = re.search(r"(\d{2} de (.*) de \d{4})", gazette_text)
        if not date_re:
            self.logger.warning(f"No date could be extracted from '{gazette_text}'")
            return

        textual_date = date_re.group(0)
        # The extra edition for August 28th, 2018 has a typo in the month name.
        textual_date = textual_date.replace("Agosoto", "Agosto")
        # The edition for December 17th, 2012 has a typo in the month name.
        textual_date = textual_date.replace("Dezembrbo", "Dezembro")
        gazette_date = dateparser.parse(textual_date, languages=["pt"]).date()
        return gazette_date

    def per_row_and_store_if_valid(self, response, row_element):
        """Extract gazette data to collect from a page row.

        It returns the extracted gazette date.
        """
        gazette_text = row_element.css("h4::text").get("")

        gazette_date = self._extract_date_from_gazette_text(gazette_text)
        if not gazette_date:
            return gazette_date

        if gazette_date < self.start_date or self.end_date < gazette_date:
            # the gazette listing goes from the newest to the oldest,
            # except for extra editions or additional files
            return gazette_date

        path_to_gazette = row_element.css("a::attr(href)").get().strip()
        # From November 17th, 2017 and backwards, the path to the gazette PDF
        # is relative.
        if path_to_gazette.startswith("up/diario_oficial.php"):
            path_to_gazette = response.urljoin(path_to_gazette)

        # As the rows are analyzed top-down and multiple entries for
        # the same day keep that order, we rely on it to compose the
        # list of file_urls when a gazette has multiple files.
        #
        # Example of a regular gazette with multiple files:
        #   Diário Oficial Eletrônico de 31 de Maio de 2022 - Edição - 1100
        #   Diário Oficial Eletrônico de 31 de Maio de 2022 - Edição - 1100 - CADERNO 2
        #
        # Example of an extra edition with multiple files:
        #   Suplemento I do Diário Oficial Eletrônico de 05 de Abril de 2022 - Edição - 1064
        #   Suplemento II do Diário Oficial Eletrônico de 05 de Abril de 2022 - Edição - 1064 - ERRATA
        #
        # Example of a gazette on a Saturday:
        #   Diário Oficial Eletrônico de 14 de Agosto de 2021 - Edição Extra

        is_extra_edition = (
            gazette_text.startswith("Suplemento") or "Extra" in gazette_text
        )

        file_urls_by_is_extra_edition_for_the_date = (
            self.collected_data_by_date.setdefault(gazette_date, {})
        )

        file_urls_by_is_extra_edition_for_the_date.setdefault(
            is_extra_edition, []
        ).append(path_to_gazette)
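        # For instance, after the two "Suplemento ... 05 de Abril de 2022"
        # rows from the example above, the collected data would look like
        # (illustrative; real URLs omitted):
        #   self.collected_data_by_date[date(2022, 4, 5)] == {
        #       True: ["<url of Suplemento I>", "<url of Suplemento II - ERRATA>"],
        #   }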
        return gazette_date

    def per_page(self, response, current_page_number, searching_for_end_date=False):
        """Extract gazette data to collect from a page."""
        is_gazette_date_before_start_date = False
        for row_element in response.css("ul.ul-licitacoes li"):
            gazette_date = self.per_row_and_store_if_valid(response, row_element)

            if not gazette_date:
                continue

            if searching_for_end_date:
                if self.end_date < gazette_date:
                    if current_page_number <= 1:
                        # We already reached the most recent page
                        searching_for_end_date = False
                    else:
                        # The tentative_page_number_associated_with_end_date strategy
                        # failed, and we need to go to a more recent gazette page
                        previous_page_number = current_page_number - 1
                        previous_page_url = self.BASE_URL.format(
                            PAGE_NUMBER=previous_page_number,
                        )
                        yield Request(
                            url=previous_page_url,
                            callback=self.per_page,
                            cb_kwargs={
                                "searching_for_end_date": True,
                                "current_page_number": previous_page_number,
                            },
                        )
                else:
                    searching_for_end_date = False

            if gazette_date < self.start_date:
                is_gazette_date_before_start_date = True
                break

        if is_gazette_date_before_start_date:
            # Process all the collected data per date and then yield the gazettes
            for (
                gazette_date,
                file_urls_by_is_extra_edition,
            ) in self.collected_data_by_date.items():
                for (
                    is_extra_edition,
                    file_urls,
                ) in file_urls_by_is_extra_edition.items():
                    yield Gazette(
                        date=gazette_date,
                        file_urls=file_urls,
                        is_extra_edition=is_extra_edition,
                        power="executive",
                    )
        else:
            # We still need to collect data from the next page
            next_url = (
                response.css(".pagination")
                .xpath("//a[contains(text(), 'Proxima')]/@href")
                .get()
            )
            if next_url:
                yield Request(
                    response.urljoin(next_url),
                    callback=self.per_page,
                    cb_kwargs={"current_page_number": current_page_number + 1},
                )
The second changed file, the spider's previous implementation, was deleted (accounting for the 62 deletions).