From 8d5af4f95b80df64a1e488eeba57c563b5d23ba4 Mon Sep 17 00:00:00 2001
From: Alexandre Harano <--unset>
Date: Sun, 9 Oct 2022 11:36:12 -0300
Subject: [PATCH] Campos dos Goytacazes-RJ spider refactor

The way the spider was previously implemented assumed that there could
only be a single file_url per day and per is_extra_edition value, which
was not always true. This refactoring gathers all the files published
for each day and is_extra_edition combination.

It also handles the text format used by Saturday gazettes so that they
are flagged as is_extra_edition, adds start_date and end_date handling,
and captures the edition_number when applicable.

Resolves okfn-brasil/querido-diario#637
---
 .../spiders/rj_campos_dos_goytacazes.py       | 412 ++++++++++++++++++
 .../gazette/spiders/rj_campos_goytacazes.py   |  62 ---
 2 files changed, 412 insertions(+), 62 deletions(-)
 create mode 100644 data_collection/gazette/spiders/rj_campos_dos_goytacazes.py
 delete mode 100644 data_collection/gazette/spiders/rj_campos_goytacazes.py

diff --git a/data_collection/gazette/spiders/rj_campos_dos_goytacazes.py b/data_collection/gazette/spiders/rj_campos_dos_goytacazes.py
new file mode 100644
index 0000000000..c696de48e9
--- /dev/null
+++ b/data_collection/gazette/spiders/rj_campos_dos_goytacazes.py
@@ -0,0 +1,412 @@
+import calendar
+import re
+from datetime import date, datetime, timedelta
+from typing import Callable, Generator
+
+import dateparser
+from scrapy.http import Request, Response
+
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
+
+EDITION_NUMBER_RE = re.compile(r"Edição\s+(?:Extra\s+)?-?\s*(\d+)")
+REGULAR_FULL_NOMINAL_DATE_RE = re.compile(
+    r"\s+"
+    r"(\d{1,2})(?# day)"
+    r"\s+d?e?\s*"
+    r"(\w+)(?# nominal month in pt)"
+    r"\s+d?e?\s*"
+    r"(\d{4})(?# year)",
+    flags=re.IGNORECASE,
+)
+MONTH_YEAR_NOMINAL_DATE_RE = re.compile(
+    r"Oficial\s+de\s*(\w+)(?# nominal month in pt)\s+d?e?\s*(\d{4})(?# year)",
+    flags=re.IGNORECASE,
+)
+
+
+class DetermineEndDatePageMixin:
+    """Collection of attributes and methods to determine the end_date page."""
+
+    BASE_URL: str = (
+        "https://www.campos.rj.gov.br/diario-oficial.php"
+        "?PGpagina={PAGE_NUMBER}&PGporPagina=15"
+    )
+    # The current gazette system returns at most this number of rows per page,
+    # even when explicitly requesting more than that.
+    MAX_ROWS_PER_PAGE: int = 15
+    MINIMUM_ALLOWED_PAGE_NUMBER: int = 1
+
+    def calculate_tentative_page_number_associated_with_end_date(self) -> int:
+        """Determine the page number that the end_date gazette might be at.
+
+        Facts behind the design of this method:
+        - The first page of the pagination contains the most recent gazette.
+        - We consider that most Saturdays and Sundays have no gazette.
+          Exception example:
+          Diário Oficial Eletrônico de 14 de Agosto de 2021 - Edição Extra
+        - Even though the number of rows for the other days may vary from
+          zero to more than one, we consider that each non-Saturday,
+          non-Sunday day has exactly one gazette.
+        - Given the potential variation in these conditions, such as a day
+          having no gazette or having multiple rows, we tentatively assume
+          that the target end_date gazette is available on the calculated
+          page number or on a neighboring one.
+
+        This method adopts the following heuristic: we count the
+        non-Saturday, non-Sunday days between the target end_date and the
+        day this method runs, inclusive, perform an integer division of that
+        count by the maximum number of rows on a page, and take the result
+        as the chosen page number.
+
+        If the calculated number is less than one, we replace it with 1,
+        as the page numbering begins at 1.
+
+        Returns a positive, non-zero int.
+        """
+
+        today: date = datetime.today().date()
+
+        if today <= self.end_date:
+            return self.MINIMUM_ALLOWED_PAGE_NUMBER
+
+        non_saturday_nor_sunday_day_count: int = 0
+        current_day: date = self.end_date
+        one_day_timedelta: timedelta = timedelta(days=1)
+        saturday_and_sunday_set: set[int] = {
+            6,  # Saturday
+            7,  # Sunday
+        }
+        while current_day <= today:
+            if current_day.isoweekday() not in saturday_and_sunday_set:
+                non_saturday_nor_sunday_day_count += 1
+            current_day = current_day + one_day_timedelta
+
+        self.logger.info(
+            f"Number of non-Saturday and non-Sunday days from {self.end_date} to"
+            f" {today}, inclusive: {non_saturday_nor_sunday_day_count}"
+        )
+
+        tentative_page_number: int = (
+            non_saturday_nor_sunday_day_count // self.MAX_ROWS_PER_PAGE
+        )
+        if tentative_page_number < self.MINIMUM_ALLOWED_PAGE_NUMBER:
+            tentative_page_number = self.MINIMUM_ALLOWED_PAGE_NUMBER
+
+        self.logger.info(
+            f"Tentative page number for {self.end_date} calculated"
+            f" as {tentative_page_number}"
+        )
+
+        return tentative_page_number
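To make the heuristic concrete, here is a minimal standalone sketch of the same
calculation (the dates below are arbitrary examples, not taken from the site):

    from datetime import date, timedelta

    def tentative_page_number(end_date: date, today: date, rows_per_page: int = 15) -> int:
        count = 0
        current = end_date
        while current <= today:
            if current.isoweekday() not in {6, 7}:  # skip Saturday and Sunday
                count += 1
            current += timedelta(days=1)
        return max(count // rows_per_page, 1)

    # 2022-09-01 .. 2022-10-05 spans 25 non-weekend days, and 25 // 15 == 1
    print(tentative_page_number(date(2022, 9, 1), date(2022, 10, 5)))  # 1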
+
+    def extract_date_from_gazette_text(self, gazette_text: str) -> date | None:
+        if not gazette_text:
+            return None
+
+        text: str = (
+            gazette_text
+            # The extra edition for August 28th, 2018 has a typo in the month name.
+            .replace("Agosoto", "Agosto")
+            # The edition for December 17th, 2012 has a typo in the month name.
+            .replace("Dezembrbo", "Dezembro")
+        )
+
+        gazette_date: date | None = None
+
+        if match_ := REGULAR_FULL_NOMINAL_DATE_RE.search(text):
+            textual_date: str = (
+                f"{match_.group(1)} de {match_.group(2)} de {match_.group(3)}"
+            )
+            gazette_date = dateparser.parse(textual_date, languages=["pt"]).date()
+            return gazette_date
+
+        # From October 2012 to October 2013, the site lists a single row per
+        # month. The linked file is a .rar archive, and some of them are missing.
+        if match_ := MONTH_YEAR_NOMINAL_DATE_RE.search(text):
+            # To keep the date conversion safe, we initially parse the text as
+            # the first day of the month.
+            textual_date = f"01 de {match_.group(1)} de {match_.group(2)}"
+            gazette_date = dateparser.parse(textual_date, languages=["pt"]).date()
+
+            # As this case is a collection of gazettes for the full month,
+            # we consider the gazette date to be the last day of that month.
+            last_day_of_the_month: int = calendar.monthrange(
+                year=gazette_date.year, month=gazette_date.month
+            )[1]
+            gazette_date = gazette_date.replace(day=last_day_of_the_month)
+
+            return gazette_date
+
+        self.logger.warning(f"No date could be extracted from '{text}'")
+        return gazette_date
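The two date shapes handled above can be exercised directly; a quick sketch
(the sample strings are invented but follow the site's visible heading format,
and the outputs assume dateparser resolves Portuguese month names as documented):

    import calendar

    import dateparser

    full_date = dateparser.parse("14 de Agosto de 2021", languages=["pt"]).date()
    # full_date == date(2021, 8, 14)

    month_only = dateparser.parse("01 de Outubro de 2012", languages=["pt"]).date()
    last_day = calendar.monthrange(month_only.year, month_only.month)[1]  # 31
    month_only = month_only.replace(day=last_day)
    # month_only == date(2012, 10, 31)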
+
+    def validate_date_condition_over_rows(
+        self, remaining_rows_for_current_page, condition: Callable[[date], bool]
+    ) -> date | None:
+        """Returns the gazette date of the first row satisfying the condition.
+
+        If no row satisfies the condition, it returns the last date that could
+        be extracted, or None when no row yields a valid date at all.
+        """
+        gazette_date: date | None = None
+
+        for row_element in remaining_rows_for_current_page:
+            gazette_text = row_element.css("h4::text").get("").strip()
+            if not gazette_text:
+                continue
+
+            gazette_date = self.extract_date_from_gazette_text(gazette_text)
+            if not gazette_date:
+                continue
+
+            if condition(gazette_date):
+                return gazette_date
+
+        return gazette_date
+
+    def find_first_valid_date_in_rows(
+        self, remaining_rows_for_current_page
+    ) -> date | None:
+        return self.validate_date_condition_over_rows(
+            remaining_rows_for_current_page, condition=lambda date_: date_ is not None
+        )
+
+    def find_end_date_or_earlier_in_rows(
+        self, remaining_rows_for_current_page
+    ) -> date | None:
+        return self.validate_date_condition_over_rows(
+            remaining_rows_for_current_page,
+            condition=lambda date_: date_ <= self.end_date,
+        )
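The scan-plus-condition contract can be modeled in isolation; a small sketch
with made-up dates, showing both the match case and the fallback return value:

    from datetime import date
    from typing import Callable

    def first_matching(dates: list[date], condition: Callable[[date], bool]) -> date | None:
        found = None
        for d in dates:
            found = d  # remember the last valid date seen
            if condition(d):
                return d
        return found  # no match: last date seen, or None for an empty input

    rows = [date(2022, 10, 5), date(2022, 10, 4), date(2022, 10, 3)]
    print(first_matching(rows, lambda d: d <= date(2022, 10, 4)))  # 2022-10-04
    print(first_matching(rows, lambda d: d <= date(2022, 9, 1)))   # 2022-10-03 (fallback)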
+
+    def find_end_date_page(
+        self,
+        response: Response,
+        current_page_number: int,
+        search_towards_the_past: bool | None,
+    ) -> Generator:
+        """Determine the page for end_date.
+
+        In most cases, the tentative_page_number_associated_with_end_date
+        strategy is a conservative approach: as of 2022-10-05, there are
+        consistently one or more gazettes per non-Saturday, non-Sunday
+        weekday. This means that, to find the actual page associated with
+        end_date, we usually have to browse toward greater page numbers.
+
+        However, we might face a period when, for whatever reason, the
+        gazettes were not published as expected. The fallback strategy
+        consists in browsing pages with lower page numbers than the
+        tentative_page_number_associated_with_end_date value.
+
+        The `search_towards_the_past` parameter controls the direction of
+        the search:
+        - None: we still need to determine which way to search for the
+          end_date page
+        - False: the tentative page number overshot, because more days than
+          expected had no published gazette, so we browse toward lower page
+          numbers (newer gazettes)
+        - True: the tentative page number worked, and we need to find
+          end_date in pages associated with earlier gazettes (higher page
+          numbers)
+        """
+
+        was_end_date_page_found = False
+        remaining_rows_for_current_page = iter(response.css("ul.ul-licitacoes li"))
+
+        gazette_date = self.find_first_valid_date_in_rows(
+            remaining_rows_for_current_page
+        )
+        if not gazette_date:
+            raise ValueError(f"No valid dates were found for this page: {response.url}")
+
+        if not search_towards_the_past:
+            # This branch covers search_towards_the_past being None or False
+            if gazette_date < self.end_date:
+                # The first valid date is earlier than end_date
+
+                if current_page_number <= self.MINIMUM_ALLOWED_PAGE_NUMBER:
+                    # We already reached the most recent page,
+                    # so we will start triaging the data
+                    was_end_date_page_found = True
+                else:
+                    # We need to retrieve pages associated with newer gazettes
+                    search_towards_the_past = False
+            else:
+                search_towards_the_past = True
+
+        if (
+            search_towards_the_past
+        ):  # This condition should not be joined with the one above
+            # find_end_date_or_earlier_in_rows may return None when the
+            # remaining rows yield no valid date, so guard the comparison
+            found_date = self.find_end_date_or_earlier_in_rows(
+                remaining_rows_for_current_page
+            )
+            was_end_date_page_found = gazette_date <= self.end_date or (
+                found_date is not None and found_date <= self.end_date
+            )
+
+        if was_end_date_page_found:
+            # As the page was found, we finally start triaging the data
+            yield Request(
+                response.url,
+                callback=self.triage_data_per_page,
+                dont_filter=True,  # as we are requesting the same URL, we don't want to filter it
+            )
+        else:
+            assert search_towards_the_past is not None
+            if search_towards_the_past:
+                next_call_page_number = current_page_number + 1
+            else:
+                next_call_page_number = current_page_number - 1
+            yield Request(
+                url=self.BASE_URL.format(PAGE_NUMBER=next_call_page_number),
+                callback=self.find_end_date_page,
+                cb_kwargs={
+                    "current_page_number": next_call_page_number,
+                    "search_towards_the_past": search_towards_the_past,
+                },
+            )
+
+
+class RjCamposDosGoytacazesSpider(DetermineEndDatePageMixin, BaseGazetteSpider):
+    TERRITORY_ID = "3301009"
+
+    allowed_domains = ["www.campos.rj.gov.br"]
+    name = "rj_campos_dos_goytacazes"
+
+    start_date = date(2010, 6, 10)
+    # November 17th, 2017 was the date of the last Diário Oficial gazette and
+    # also the date of the first Diário Oficial Eletrônico gazette
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.current_date = None
+        self.current_edition_number = ""
+        self.collected_data_for_current_date = {}
+        self.triaged_data_by_date = {}
+
+    def start_requests(self) -> Generator:
+        tentative_page_number_associated_with_end_date: int = (
+            self.calculate_tentative_page_number_associated_with_end_date()
+        )
+
+        yield Request(
+            url=self.BASE_URL.format(
+                PAGE_NUMBER=tentative_page_number_associated_with_end_date
+            ),
+            callback=self.find_end_date_page,
+            cb_kwargs={
+                "current_page_number": tentative_page_number_associated_with_end_date,
+                "search_towards_the_past": None,
+            },
+            dont_filter=True,  # the page may have already been requested when determining the end_date page
+        )
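The core of the direction decision in find_end_date_page above can be read as
a small pure function; a sketch of the same branching (ignoring the
already-found cases) with invented dates:

    from datetime import date

    def choose_direction(first_date_on_page: date, end_date: date) -> bool:
        # The first row of a page holds that page's most recent gazette.
        if first_date_on_page < end_date:
            return False  # page is already too old: lower page numbers (newer)
        return True  # page is still too new: higher page numbers (older)

    print(choose_direction(date(2022, 8, 24), end_date=date(2022, 9, 20)))  # False
    print(choose_direction(date(2022, 10, 5), end_date=date(2022, 9, 20)))  # True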
+
+    def triage_data_per_row(self, gazette_text: str) -> tuple[date | None, bool, str]:
+        """Triage the gazette data found in a single page row.
+
+        It returns
+            the extracted gazette date,
+            whether it is an extra edition, and
+            the edition number when applicable.
+        """
+        gazette_date: date | None = None
+        is_extra_edition: bool = (
+            gazette_text.startswith("Suplemento") or "Extra" in gazette_text
+        )
+        edition_number: str = ""
+
+        if not gazette_text:
+            return gazette_date, is_extra_edition, edition_number
+
+        gazette_date = self.extract_date_from_gazette_text(gazette_text)
+        if (
+            not gazette_date
+            or gazette_date < self.start_date
+            or self.end_date < gazette_date
+        ):
+            return gazette_date, is_extra_edition, edition_number
+
+        edition_number_match = EDITION_NUMBER_RE.search(gazette_text)
+        if edition_number_match:
+            edition_number = edition_number_match.group(1).strip()
+
+        return gazette_date, is_extra_edition, edition_number
+
+    def instantiate_gazettes_and_reset_stored_data(self) -> list[Gazette]:
+        if not self.current_date:
+            return []
+
+        gazettes: list[Gazette] = [
+            Gazette(
+                date=self.current_date,
+                edition_number=self.current_edition_number,
+                file_urls=file_urls,
+                is_extra_edition=is_extra_edition,
+                power="executive",
+            )
+            for (
+                is_extra_edition,
+                file_urls,
+            ) in self.collected_data_for_current_date.items()
+        ]
+
+        self.current_date = None
+        self.current_edition_number = ""
+        self.collected_data_for_current_date = {}
+        return gazettes
+
+    def triage_data_per_page(self, response) -> Generator:
+        """Triage the gazette data from a page.
+
+        Once we determine that all the data in the date range was triaged,
+        we collect the gazettes.
+
+        Otherwise, we keep triaging from the next page.
+        """
+
+        is_gazette_date_before_start_date: bool = False
+        for row_element in response.css("ul.ul-licitacoes li"):
+            gazette_text = row_element.css("h4::text").get("").strip()
+            file_url = row_element.css("a::attr(href)").get("").strip()
+
+            gazette_date, is_extra_edition, edition_number = self.triage_data_per_row(
+                gazette_text
+            )
+
+            if not gazette_date:
+                continue
+
+            if gazette_date < self.start_date:
+                is_gazette_date_before_start_date = True
+                break
+
+            if (
+                self.current_edition_number != edition_number
+                or self.current_date != gazette_date
+            ):
+                for gazette in self.instantiate_gazettes_and_reset_stored_data():
+                    yield gazette
+
+            self.current_edition_number = edition_number
+            self.current_date = gazette_date
+            self.collected_data_for_current_date.setdefault(
+                is_extra_edition, []
+            ).append(file_url)
+
+        next_url = (
+            response.css(".pagination")
+            .xpath("//a[contains(text(), 'Proxima')]/@href")
+            .get()
+        )
+
+        if is_gazette_date_before_start_date:
+            # Collect the gazettes using the triaged data
+            for gazette in self.instantiate_gazettes_and_reset_stored_data():
+                yield gazette
+        elif next_url:
+            # Keep triaging data
+            yield Request(
+                response.urljoin(next_url),
+                callback=self.triage_data_per_page,
+                dont_filter=True,  # the page may have already been requested when determining the end_date page
+            )
+        else:
+            # Corner case: when the spider runs without an explicit start_date,
+            # we can reach the last page without ever seeing a gazette older
+            # than start_date, so is_gazette_date_before_start_date stays False
+            # and the remaining stored data still needs to be collected
+            for gazette in self.instantiate_gazettes_and_reset_stored_data():
+                yield gazette
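The grouping that motivates this refactor, namely several file_urls per day
split by is_extra_edition, boils down to the setdefault call above; a tiny
sketch with invented URLs:

    collected: dict[bool, list[str]] = {}
    for is_extra, url in [(False, "a.pdf"), (True, "b.pdf"), (False, "c.pdf")]:
        collected.setdefault(is_extra, []).append(url)
    # collected == {False: ["a.pdf", "c.pdf"], True: ["b.pdf"]}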
diff --git a/data_collection/gazette/spiders/rj_campos_goytacazes.py b/data_collection/gazette/spiders/rj_campos_goytacazes.py
deleted file mode 100644
index 8501874ee8..0000000000
--- a/data_collection/gazette/spiders/rj_campos_goytacazes.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import re
-
-import dateparser
-from scrapy import Request
-
-from gazette.items import Gazette
-from gazette.spiders.base import BaseGazetteSpider
-
-
-class RjCampoGoytacazesSpider(BaseGazetteSpider):
-    TERRITORY_ID = "3301009"
-
-    allowed_domains = ["www.campos.rj.gov.br"]
-    name = "rj_campos_goytacazes"
-    start_urls = [
-        "https://www.campos.rj.gov.br/diario-oficial.php?PGpagina=1&PGporPagina=15"
-    ]
-
-    def parse(self, response):
-        """
-        @url https://www.campos.rj.gov.br/diario-oficial.php?PGpagina=1&PGporPagina=15
-        @returns requests 1
-        @returns items 15 15
-        @scrapes date file_urls is_extra_edition power
-        """
-
-        for element in response.css("ul.ul-licitacoes li"):
-            gazette_text = element.css("h4::text").get("")
-
-            date_re = re.search(r"(\d{2} de (.*) de \d{4})", gazette_text)
-            if not date_re:
-                continue
-
-            date = date_re.group(0)
-            # The extra edition for August 28th, 2018 has a typo in the month name.
-            date = date.replace("Agosoto", "Agosto")
-            # The edition for December 17th, 2012 has a typo in the month name.
-            date = date.replace("Dezembrbo", "Dezembro")
-            date = dateparser.parse(date, languages=["pt"]).date()
-
-            path_to_gazette = element.css("a::attr(href)").get().strip()
-            # From November 17th, 2017 and backwards the path to the gazette PDF
-            # is relative.
-            if path_to_gazette.startswith("up/diario_oficial.php"):
-                path_to_gazette = response.urljoin(path_to_gazette)
-
-            is_extra_edition = gazette_text.startswith("Suplemento")
-
-            yield Gazette(
-                date=date,
-                file_urls=[path_to_gazette],
-                is_extra_edition=is_extra_edition,
-                power="executive",
-            )
-
-        next_url = (
-            response.css(".pagination")
-            .xpath("//a[contains(text(), 'Proxima')]/@href")
-            .get()
-        )
-        if next_url:
-            yield Request(response.urljoin(next_url))
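For reference, the @url/@returns/@scrapes annotations in the deleted parse
docstring are Scrapy contracts; before this refactor, they could be exercised
with the stock contracts runner:

    scrapy check rj_campos_goytacazes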