Skip to content

Commit

Permalink
start_date and end_date handling for Campos dos Goytacazes-RJ spider
Browse files Browse the repository at this point in the history
  • Loading branch information
Alex Harano committed Oct 5, 2022
1 parent d04b95c commit 830b411
Showing 1 changed file with 108 additions and 17 deletions.
125 changes: 108 additions & 17 deletions data_collection/gazette/spiders/rj_campos_goytacazes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import re
from datetime import date, datetime, timedelta

import dateparser
from scrapy import Request
Expand All @@ -7,14 +8,93 @@
from gazette.spiders.base import BaseGazetteSpider


class RjCampoGoytacazesSpider(BaseGazetteSpider):
class RjCamposDosGoytacazesSpider(BaseGazetteSpider):
TERRITORY_ID = "3301009"

allowed_domains = ["www.campos.rj.gov.br"]
name = "rj_campos_goytacazes"
start_urls = [
"https://www.campos.rj.gov.br/diario-oficial.php?PGpagina=1&PGporPagina=15"
]
name = "rj_campos_dos_goytacazes"
BASE_URL = (
"https://www.campos.rj.gov.br/diario-oficial.php"
"?PGpagina={PAGE_NUMBER}&PGporPagina=15"
)
# the current gazette system only allows at most this number of rows per page,
# even when explicitly requesting more than that
MAX_ROWS_PER_PAGE = 15

start_date = date(2010, 6, 10)

def _calculate_tentative_page_number_associated_with_end_date(self) -> int:
    """Estimate the listing page where the ``end_date`` gazette may be found.

    Assumptions behind the estimate:

    - Page 1 of the pagination holds the most recent gazette.
    - Saturdays and Sundays are treated as having no gazette, although
      exceptions exist, for example:
      Diário Oficial Eletrônico de 14 de Agosto de 2021 - Edição Extra
    - Every other day is treated as having exactly one gazette, even though
      the real number of rows for a day can be zero or more than one.
    - Because of those variations the result is only tentative: the
      ``end_date`` gazette is expected on the calculated page or close to it.

    Heuristic: count the non-Saturday, non-Sunday days from ``end_date`` up to
    the day this method runs (both ends included) and integer-divide that
    count by ``MAX_ROWS_PER_PAGE``. A quotient below 1 is replaced by 1,
    because page numbering starts at 1.

    Returns:
        A positive, non-zero page number.
    """
    run_day: date = datetime.today().date()
    target_day: date = self.end_date

    # end_date is today or in the future: the newest page is the place to start.
    if target_day >= run_day:
        return 1

    # Walk every calendar day in [end_date, today] and keep Monday..Friday
    # (ISO weekday numbers 1..5; 6 is Saturday and 7 is Sunday).
    elapsed_days: int = (run_day - target_day).days
    weekday_total: int = sum(
        1
        for offset in range(elapsed_days + 1)
        if (target_day + timedelta(days=offset)).isoweekday() < 6
    )

    self.logger.info(
        f"Number of non-Saturday and non-Sunday days from {self.end_date} to"
        f" {run_day}, inclusive: {weekday_total}"
    )

    # Floor division can produce 0 for short ranges; pages are 1-based.
    return max(1, weekday_total // self.MAX_ROWS_PER_PAGE)

def start_requests(self):
    """Yield the single initial request for the gazette listing.

    The crawl does not begin at page 1: it begins at the page number
    estimated by
    ``_calculate_tentative_page_number_associated_with_end_date``, which is
    substituted into ``BASE_URL`` as ``PAGE_NUMBER``.
    """
    first_page: int = (
        self._calculate_tentative_page_number_associated_with_end_date()
    )
    self.logger.info(
        f"Tentative page number for {self.end_date} calculated"
        f" as {first_page}"
    )

    yield Request(url=self.BASE_URL.format(PAGE_NUMBER=first_page))

def parse(self, response):
"""
Expand All @@ -24,19 +104,29 @@ def parse(self, response):
@scrapes date file_urls is_extra_edition power
"""

is_gazette_date_before_start_date = False
for element in response.css("ul.ul-licitacoes li"):
gazette_text = element.css("h4::text").get("")

date_re = re.search(r"(\d{2} de (.*) de \d{4})", gazette_text)
if not date_re:
continue

date = date_re.group(0)
textual_date = date_re.group(0)
# The extra edition for August 28th, 2018 has a typo in the month name.
date = date.replace("Agosoto", "Agosto")
textual_date = textual_date.replace("Agosoto", "Agosto")
# The edition for December 17th, 2012 has a typo in the month name.
date = date.replace("Dezembrbo", "Dezembro")
date = dateparser.parse(date, languages=["pt"]).date()
textual_date = textual_date.replace("Dezembrbo", "Dezembro")
gazette_date = dateparser.parse(textual_date, languages=["pt"]).date()

if self.end_date < gazette_date:
# the gazette listing goes from the newest to the oldest,
# except for extra edition or additional files
continue

if gazette_date < self.start_date:
is_gazette_date_before_start_date = True
break

path_to_gazette = element.css("a::attr(href)").get().strip()
# From November 17th, 2017 and backwards the path to the gazette PDF
Expand All @@ -47,16 +137,17 @@ def parse(self, response):
is_extra_edition = gazette_text.startswith("Suplemento")

yield Gazette(
date=date,
date=gazette_date,
file_urls=[path_to_gazette],
is_extra_edition=is_extra_edition,
power="executive",
)

next_url = (
response.css(".pagination")
.xpath("//a[contains(text(), 'Proxima')]/@href")
.get()
)
if next_url:
yield Request(response.urljoin(next_url))
if not is_gazette_date_before_start_date:
next_url = (
response.css(".pagination")
.xpath("//a[contains(text(), 'Proxima')]/@href")
.get()
)
if next_url:
yield Request(response.urljoin(next_url))

0 comments on commit 830b411

Please sign in to comment.