diff --git a/data_collection/gazette/spiders/rj_campos_goytacazes.py b/data_collection/gazette/spiders/rj_campos_goytacazes.py
index 8501874ee8..9ad98471ee 100644
--- a/data_collection/gazette/spiders/rj_campos_goytacazes.py
+++ b/data_collection/gazette/spiders/rj_campos_goytacazes.py
@@ -1,4 +1,5 @@
 import re
+from datetime import date, datetime, timedelta
 
 import dateparser
 from scrapy import Request
@@ -7,14 +8,93 @@
 from gazette.spiders.base import BaseGazetteSpider
 
 
-class RjCampoGoytacazesSpider(BaseGazetteSpider):
+class RjCamposDosGoytacazesSpider(BaseGazetteSpider):
     TERRITORY_ID = "3301009"
     allowed_domains = ["www.campos.rj.gov.br"]
-    name = "rj_campos_goytacazes"
-    start_urls = [
-        "https://www.campos.rj.gov.br/diario-oficial.php?PGpagina=1&PGporPagina=15"
-    ]
+    name = "rj_campos_dos_goytacazes"
+    BASE_URL = (
+        "https://www.campos.rj.gov.br/diario-oficial.php"
+        "?PGpagina={PAGE_NUMBER}&PGporPagina=15"
+    )
+    # The current gazette system returns at most this number of rows per page,
+    # even when more than that is explicitly requested.
+    MAX_ROWS_PER_PAGE = 15
+
+    start_date = date(2010, 6, 10)
+
+    def _calculate_tentative_page_number_associated_with_end_date(self) -> int:
+        """Determine the page number where the end_date gazette might be.
+
+        Facts behind the design of this method:
+        - The first page of the pagination contains the most recent gazette.
+        - We assume that most Saturdays and Sundays have no gazette.
+          Exception example:
+          Diário Oficial Eletrônico de 14 de Agosto de 2021 - Edição Extra
+        - Even though the number of rows for the other days may vary from
+          zero to more than one, we assume that each non-Saturday and
+          non-Sunday day has exactly one gazette.
+        - Given that these conditions may vary, such as a day having no
+          gazette or having multiple rows, we only assume that the target
+          end_date gazette might be available on the calculated page
+          number or on the one before it.
+
+        This method adopts the following heuristic: count the number of
+        non-Saturday and non-Sunday days from the target end_date until
+        the day this method runs, then perform an integer division of the
+        estimated number of days by the maximum number of rows on a page;
+        the result is the chosen page number.
+
+        When the calculated number is less than one, we replace it with 1,
+        as the page numbering begins at 1.
+
+        Returns a positive int (at least 1).
+ """ + + today: date = datetime.today().date() + + if today <= self.end_date: + return 1 + + non_saturday_nor_sunday_day_count: int = 0 + current_day: date = self.end_date + one_day_timedelta: timedelta = timedelta(days=1) + saturday_and_sunday_set = { + 6, # Saturday + 7, # Sunday + } + while current_day <= today: + if current_day.isoweekday() not in saturday_and_sunday_set: + non_saturday_nor_sunday_day_count += 1 + current_day = current_day + one_day_timedelta + + self.logger.info( + f"Number of non-Saturday and non-Sunday days from {self.end_date} to" + f" {today}, inclusive: {non_saturday_nor_sunday_day_count}" + ) + + tentative_page_number_associated_with_end_date: int = ( + non_saturday_nor_sunday_day_count // self.MAX_ROWS_PER_PAGE + ) + + if tentative_page_number_associated_with_end_date < 1: + tentative_page_number_associated_with_end_date = 1 + + return tentative_page_number_associated_with_end_date + + def start_requests(self): + tentative_page_number_associated_with_end_date: int = ( + self._calculate_tentative_page_number_associated_with_end_date() + ) + self.logger.info( + f"Tentative page number for {self.end_date} calculated" + f" as {tentative_page_number_associated_with_end_date}" + ) + + initial_url = self.BASE_URL.format( + PAGE_NUMBER=tentative_page_number_associated_with_end_date + ) + yield Request(url=initial_url) def parse(self, response): """ @@ -24,6 +104,7 @@ def parse(self, response): @scrapes date file_urls is_extra_edition power """ + is_gazette_date_before_start_date = False for element in response.css("ul.ul-licitacoes li"): gazette_text = element.css("h4::text").get("") @@ -31,12 +112,21 @@ def parse(self, response): if not date_re: continue - date = date_re.group(0) + textual_date = date_re.group(0) # The extra edition for August 28th, 2018 has a typo in the month name. - date = date.replace("Agosoto", "Agosto") + textual_date = textual_date.replace("Agosoto", "Agosto") # The edition for December 17th, 2012 has a typo in the month name. - date = date.replace("Dezembrbo", "Dezembro") - date = dateparser.parse(date, languages=["pt"]).date() + textual_date = textual_date.replace("Dezembrbo", "Dezembro") + gazette_date = dateparser.parse(textual_date, languages=["pt"]).date() + + if self.end_date < gazette_date: + # the gazette listing goes from the newest to the oldest, + # except for extra edition or additional files + continue + + if gazette_date < self.start_date: + is_gazette_date_before_start_date = True + break path_to_gazette = element.css("a::attr(href)").get().strip() # From November 17th, 2017 and backwards the path to the gazette PDF @@ -47,16 +137,17 @@ def parse(self, response): is_extra_edition = gazette_text.startswith("Suplemento") yield Gazette( - date=date, + date=gazette_date, file_urls=[path_to_gazette], is_extra_edition=is_extra_edition, power="executive", ) - next_url = ( - response.css(".pagination") - .xpath("//a[contains(text(), 'Proxima')]/@href") - .get() - ) - if next_url: - yield Request(response.urljoin(next_url)) + if not is_gazette_date_before_start_date: + next_url = ( + response.css(".pagination") + .xpath("//a[contains(text(), 'Proxima')]/@href") + .get() + ) + if next_url: + yield Request(response.urljoin(next_url))