Campos dos Goytacazes-RJ spider refactor
The previous implementation of the spider assumed there could be only a single file_url per day for each is_extra_edition value, which was not always true.

This refactoring gathers all the files for each day and is_extra_edition value before yielding the Gazette items (see the sketch below).

The existing code also did not recognize the text format used by Saturday gazettes, so they were not flagged as is_extra_edition.

We also added start_date and end_date handling.

resolve okfn-brasil#637
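
A minimal sketch of the grouping this refactoring performs, mirroring the setdefault chain in the spider; the row data here is hypothetical:

import datetime

# Hypothetical scraped rows: (gazette_date, is_extra_edition, file_url).
rows = [
    (datetime.date(2022, 5, 31), False, "up/edicao-1100.pdf"),
    (datetime.date(2022, 5, 31), False, "up/edicao-1100-caderno-2.pdf"),
    (datetime.date(2022, 4, 5), True, "up/suplemento-i-edicao-1064.pdf"),
]

collected_data_by_date = {}
for gazette_date, is_extra_edition, file_url in rows:
    collected_data_by_date.setdefault(gazette_date, {}).setdefault(
        is_extra_edition, []
    ).append(file_url)

# Both files of edition 1100 now belong to a single gazette entry:
# collected_data_by_date[datetime.date(2022, 5, 31)] == {False: [..., ...]}
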
Alex Harano committed Oct 5, 2022
1 parent d04b95c commit d6cc9e0
Showing 2 changed files with 239 additions and 62 deletions.
239 changes: 239 additions & 0 deletions data_collection/gazette/spiders/rj_campos_dos_goytacazes.py
@@ -0,0 +1,239 @@
import re
from datetime import date, datetime, timedelta

import dateparser
from scrapy import Request

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class RjCamposDosGoytacazesSpider(BaseGazetteSpider):
TERRITORY_ID = "3301009"

allowed_domains = ["www.campos.rj.gov.br"]
name = "rj_campos_dos_goytacazes"
BASE_URL = (
"https://www.campos.rj.gov.br/diario-oficial.php"
"?PGpagina={PAGE_NUMBER}&PGporPagina=15"
)
    # The current gazette system returns at most this number of rows per
    # page, even when more rows are explicitly requested
MAX_ROWS_PER_PAGE = 15

start_date = date(2010, 6, 10)

def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
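        # Maps each gazette date to a dict keyed by is_extra_edition, whose
        # values are the lists of file URLs collected for that date:
        # {date: {is_extra_edition: [file_url, ...]}}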
self.collected_data_by_date = {}

def _calculate_tentative_page_number_associated_with_end_date(self) -> int:
"""Determine the page number that the end_date gazette might be at.
Facts for the design of this method:
- The first page of the pagination contains the most recent gazette.
- We consider most Saturday and Sunday days have no gazette.
Exception example:
Diário Oficial Eletrônico de 14 de Agosto de 2021 - Edição Extra
- Even if the number of rows for the other days may vary from zero to
more than one, we consider that non-Saturday and non-Sunday days
will have one gazette
- Considering the potential variation of established conditions,
such as not having a gazette or having multiple rows for the same day,
we tentatively set that the target end_date gazette might be available on
the calculated page number or one before that.
This method adopts the following heuristic: we calculate the number of
non-Saturday and non-Sunday from the day this method runs until
the target end_date and perform an integer division of the estimated number of
days by the maximum number of rows on a page, and the result is
the chosen page number.
We only replace the calculated number when it is less than one:
for that case, we replace it with 1, as the page numbering begins at 1.
It returns a non-zero positive int.
"""

today: date = datetime.today().date()

if today <= self.end_date:
return 1

non_saturday_nor_sunday_day_count: int = 0
current_day: date = self.end_date
one_day_timedelta: timedelta = timedelta(days=1)
saturday_and_sunday_set = {
6, # Saturday
7, # Sunday
}
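        # Walk day by day from end_date up to today, inclusive, counting the
        # days that are neither Saturday nor Sunday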
while current_day <= today:
if current_day.isoweekday() not in saturday_and_sunday_set:
non_saturday_nor_sunday_day_count += 1
current_day = current_day + one_day_timedelta

self.logger.info(
f"Number of non-Saturday and non-Sunday days from {self.end_date} to"
f" {today}, inclusive: {non_saturday_nor_sunday_day_count}"
)

tentative_page_number_associated_with_end_date: int = (
non_saturday_nor_sunday_day_count // self.MAX_ROWS_PER_PAGE
)

if tentative_page_number_associated_with_end_date < 1:
tentative_page_number_associated_with_end_date = 1

return tentative_page_number_associated_with_end_date

def start_requests(self):
tentative_page_number_associated_with_end_date: int = (
self._calculate_tentative_page_number_associated_with_end_date()
)
self.logger.info(
f"Tentative page number for {self.end_date} calculated"
f" as {tentative_page_number_associated_with_end_date}"
)

initial_url = self.BASE_URL.format(
PAGE_NUMBER=tentative_page_number_associated_with_end_date
)
yield Request(
url=initial_url,
callback=self.per_page,
cb_kwargs={
"searching_for_end_date": True,
"current_page_number": tentative_page_number_associated_with_end_date,
},
)

def _extract_date_from_gazette_text(self, gazette_text):
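        """Parse the gazette date from the row title text.

        For example, from
        "Diário Oficial Eletrônico de 14 de Agosto de 2021 - Edição Extra"
        it extracts "14 de Agosto de 2021" and parses it as date(2021, 8, 14).
        """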
date_re = re.search(r"(\d{2} de (.*) de \d{4})", gazette_text)
if not date_re:
self.logger.warning(f"No date could be extracted from '{gazette_text}'")
return

textual_date = date_re.group(0)
# The extra edition for August 28th, 2018 has a typo in the month name.
textual_date = textual_date.replace("Agosoto", "Agosto")
# The edition for December 17th, 2012 has a typo in the month name.
textual_date = textual_date.replace("Dezembrbo", "Dezembro")
        parsed_datetime = dateparser.parse(textual_date, languages=["pt"])
        if not parsed_datetime:
            self.logger.warning(f"Could not parse a date from '{textual_date}'")
            return
        return parsed_datetime.date()

def per_row_and_store_if_valid(self, response, row_element):
"""Extract gazette data to collect from a page row.
It returns the extracted gazette date.
"""
gazette_text = row_element.css("h4::text").get("")

gazette_date = self._extract_date_from_gazette_text(gazette_text)
if not gazette_date:
return gazette_date

        if gazette_date < self.start_date or self.end_date < gazette_date:
            # Out of the requested range: skip storing, but still return the
            # date so the caller can steer pagination, as the listing goes
            # from the newest to the oldest gazette
            return gazette_date

path_to_gazette = row_element.css("a::attr(href)").get().strip()
        # From November 17th, 2017 backwards, the path to the gazette PDF
        # is relative.
if path_to_gazette.startswith("up/diario_oficial.php"):
path_to_gazette = response.urljoin(path_to_gazette)

        # Rows are analyzed top-down, and multiple entries for the same day
        # keep that order, so appending preserves the original file order
        # when a gazette has multiple files.
#
# Example of a regular gazette with multiple files:
# Diário Oficial Eletrônico de 31 de Maio de 2022 - Edição - 1100
# Diário Oficial Eletrônico de 31 de Maio de 2022 - Edição - 1100 - CADERNO 2
#
# Example of an extra edition with multiple files:
# Suplemento I do Diário Oficial Eletrônico de 05 de Abril de 2022 - Edição - 1064
# Suplemento II do Diário Oficial Eletrônico de 05 de Abril de 2022 - Edição - 1064 - ERRATA
#
# Example of a gazette on Saturday:
# Diário Oficial Eletrônico de 14 de Agosto de 2021 - Edição Extra

is_extra_edition = (
gazette_text.startswith("Suplemento") or "Extra" in gazette_text
)

file_urls_by_is_extra_edition_for_the_date = (
self.collected_data_by_date.setdefault(gazette_date, {})
)

file_urls_by_is_extra_edition_for_the_date.setdefault(
is_extra_edition, []
).append(path_to_gazette)

return gazette_date

def per_page(self, response, current_page_number, searching_for_end_date=False):
"""Extract gazette data to collect from a page."""
is_gazette_date_before_start_date = False
for row_element in response.css("ul.ul-licitacoes li"):
gazette_date = self.per_row_and_store_if_valid(response, row_element)

if not gazette_date:
continue

            if searching_for_end_date:
                if gazette_date < self.end_date:
                    if current_page_number <= 1:
                        # We already reached the most recent page
                        searching_for_end_date = False
                    else:
                        # The tentative_page_number_associated_with_end_date
                        # strategy overshot into pages holding only gazettes
                        # older than end_date, so we need to go to a more
                        # recent gazette page
                        previous_page_number = current_page_number - 1
                        previous_page_url = self.BASE_URL.format(
                            PAGE_NUMBER=previous_page_number,
                        )
                        yield Request(
                            url=previous_page_url,
                            callback=self.per_page,
                            cb_kwargs={
                                "searching_for_end_date": True,
                                "current_page_number": previous_page_number,
                            },
                        )
                else:
                    searching_for_end_date = False

if gazette_date < self.start_date:
is_gazette_date_before_start_date = True
break

if is_gazette_date_before_start_date:
# Process all the collected data per date and then yield the gazettes
for (
gazette_date,
file_urls_by_is_extra_edition,
) in self.collected_data_by_date.items():
for (
is_extra_edition,
file_urls,
) in file_urls_by_is_extra_edition.items():
yield Gazette(
date=gazette_date,
file_urls=file_urls,
is_extra_edition=is_extra_edition,
power="executive",
)
else:
# We still need to collect data from the next page
            next_url = (
                response.css(".pagination")
                # the relative .// keeps the search within the pagination bar
                .xpath(".//a[contains(text(), 'Proxima')]/@href")
                .get()
            )
if next_url:
yield Request(
response.urljoin(next_url),
callback=self.per_page,
cb_kwargs={"current_page_number": current_page_number + 1},
)
62 changes: 0 additions & 62 deletions data_collection/gazette/spiders/rj_campos_goytacazes.py

This file was deleted.
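
For local testing, a minimal sketch using Scrapy's CrawlerProcess, assuming BaseGazetteSpider parses start_date and end_date from ISO-formatted strings (an assumption, since the base class is not part of this diff):

from scrapy.crawler import CrawlerProcess

from gazette.spiders.rj_campos_dos_goytacazes import RjCamposDosGoytacazesSpider

process = CrawlerProcess(settings={"LOG_LEVEL": "INFO"})
# start_date/end_date as ISO strings is assumed to be handled by the base class.
process.crawl(
    RjCamposDosGoytacazesSpider,
    start_date="2022-04-01",
    end_date="2022-05-31",
)
process.start()  # blocks until the crawl finishes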
