From 8d5af4f95b80df64a1e488eeba57c563b5d23ba4 Mon Sep 17 00:00:00 2001
From: Alexandre Harano <--unset>
Date: Sun, 9 Oct 2022 11:36:12 -0300
Subject: [PATCH] Campos dos Goytacazes-RJ spider refactor

The way the spider was previously implemented assumed that there could
only be a single file_url per day and per is_extra_edition value, which
was not always true. This refactoring gathers all the files published
for each day and is_extra_edition combination.

It also handles the text format used by Saturday gazettes so that they
are flagged as is_extra_edition, adds start_date and end_date handling,
and captures the edition_number when applicable.

Resolves okfn-brasil/querido-diario#637
---
 .../spiders/rj_campos_dos_goytacazes.py       | 412 ++++++++++++++++++
 .../gazette/spiders/rj_campos_goytacazes.py   |  62 ---
 2 files changed, 412 insertions(+), 62 deletions(-)
 create mode 100644 data_collection/gazette/spiders/rj_campos_dos_goytacazes.py
 delete mode 100644 data_collection/gazette/spiders/rj_campos_goytacazes.py

diff --git a/data_collection/gazette/spiders/rj_campos_dos_goytacazes.py b/data_collection/gazette/spiders/rj_campos_dos_goytacazes.py
new file mode 100644
index 0000000000..c696de48e9
--- /dev/null
+++ b/data_collection/gazette/spiders/rj_campos_dos_goytacazes.py
@@ -0,0 +1,412 @@
+import calendar
+import re
+from datetime import date, datetime, timedelta
+from typing import Callable, Generator
+
+import dateparser
+from scrapy.http import Request, Response
+
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
+
+EDITION_NUMBER_RE = re.compile(r"Edição\s+(?:Extra\s+)?-?\s*(\d+)")
+REGULAR_FULL_NOMINAL_DATE_RE = re.compile(
+    r"\s+"
+    r"(\d{1,2})(?# day)"
+    r"\s+d?e?\s*"
+    r"(\w+)(?# nominal month in pt)"
+    r"\s+d?e?\s*"
+    r"(\d{4})(?# year)",
+    flags=re.IGNORECASE,
+)
+MONTH_YEAR_NOMINAL_DATE_RE = re.compile(
+    r"Oficial\s+de\s*(\w+)(?# nominal month in pt)\s+d?e?\s*(\d{4})(?# year)",
+    flags=re.IGNORECASE,
+)
+
+
+class DetermineEndDatePageMixin:
+    """Collection of attributes and methods to determine the end_date page."""
+
+    BASE_URL: str = (
+        "https://www.campos.rj.gov.br/diario-oficial.php"
+        "?PGpagina={PAGE_NUMBER}&PGporPagina=15"
+    )
+    # The current gazette system returns at most this number of rows per page,
+    # even when explicitly requesting more than that.
+    MAX_ROWS_PER_PAGE: int = 15
+    MINIMUM_ALLOWED_PAGE_NUMBER: int = 1
+
+    def calculate_tentative_page_number_associated_with_end_date(self) -> int:
+        """Determine the page number that the end_date gazette might be at.
+
+        Facts behind the design of this method:
+        - The first page of the pagination contains the most recent gazette.
+        - We consider that most Saturdays and Sundays have no gazette.
+          Exception example:
+          Diário Oficial Eletrônico de 14 de Agosto de 2021 - Edição Extra
+        - Even though the number of rows for the other days may vary from
+          zero to more than one, we consider that each non-Saturday,
+          non-Sunday day has exactly one gazette.
+        - Given the potential variation in these conditions, such as a day
+          having no gazette or having multiple rows, we tentatively assume
+          that the target end_date gazette is available on the calculated
+          page number or on a neighboring one.
+
+        This method adopts the following heuristic: we count the
+        non-Saturday, non-Sunday days between the target end_date and the
+        day this method runs, inclusive, perform an integer division of that
+        count by the maximum number of rows on a page, and take the result
+        as the chosen page number.
+
+        If the calculated number is less than one, we replace it with 1,
+        as the page numbering begins at 1.
+
+        Returns a positive, non-zero int.
+        """
+
+        today: date = datetime.today().date()
+
+        if today <= self.end_date:
+            return self.MINIMUM_ALLOWED_PAGE_NUMBER
+
+        non_saturday_nor_sunday_day_count: int = 0
+        current_day: date = self.end_date
+        one_day_timedelta: timedelta = timedelta(days=1)
+        saturday_and_sunday_set: set[int] = {
+            6,  # Saturday
+            7,  # Sunday
+        }
+        while current_day <= today:
+            if current_day.isoweekday() not in saturday_and_sunday_set:
+                non_saturday_nor_sunday_day_count += 1
+            current_day = current_day + one_day_timedelta
+
+        self.logger.info(
+            f"Number of non-Saturday and non-Sunday days from {self.end_date} to"
+            f" {today}, inclusive: {non_saturday_nor_sunday_day_count}"
+        )
+
+        tentative_page_number: int = (
+            non_saturday_nor_sunday_day_count // self.MAX_ROWS_PER_PAGE
+        )
+        if tentative_page_number < self.MINIMUM_ALLOWED_PAGE_NUMBER:
+            tentative_page_number = self.MINIMUM_ALLOWED_PAGE_NUMBER
+
+        self.logger.info(
+            f"Tentative page number for {self.end_date} calculated"
+            f" as {tentative_page_number}"
+        )
+
+        return tentative_page_number
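To make the heuristic concrete, here is a minimal standalone sketch of the same
calculation (the dates below are arbitrary examples, not taken from the site):

    from datetime import date, timedelta

    def tentative_page_number(end_date: date, today: date, rows_per_page: int = 15) -> int:
        count = 0
        current = end_date
        while current <= today:
            if current.isoweekday() not in {6, 7}:  # skip Saturday and Sunday
                count += 1
            current += timedelta(days=1)
        return max(count // rows_per_page, 1)

    # 2022-09-01 .. 2022-10-05 spans 25 non-weekend days, and 25 // 15 == 1
    print(tentative_page_number(date(2022, 9, 1), date(2022, 10, 5)))  # 1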
+
+    def extract_date_from_gazette_text(self, gazette_text: str) -> date | None:
+        if not gazette_text:
+            return None
+
+        text: str = (
+            gazette_text
+            # The extra edition for August 28th, 2018 has a typo in the month name.
+            .replace("Agosoto", "Agosto")
+            # The edition for December 17th, 2012 has a typo in the month name.
+            .replace("Dezembrbo", "Dezembro")
+        )
+
+        gazette_date: date | None = None
+
+        if match_ := REGULAR_FULL_NOMINAL_DATE_RE.search(text):
+            textual_date: str = (
+                f"{match_.group(1)} de {match_.group(2)} de {match_.group(3)}"
+            )
+            gazette_date = dateparser.parse(textual_date, languages=["pt"]).date()
+            return gazette_date
+
+        # From October 2012 to October 2013, the site lists a single row per
+        # month. The linked file is a .rar archive, and some of them are missing.
+        if match_ := MONTH_YEAR_NOMINAL_DATE_RE.search(text):
+            # To keep the date conversion safe, we initially parse the text as
+            # the first day of the month.
+            textual_date = f"01 de {match_.group(1)} de {match_.group(2)}"
+            gazette_date = dateparser.parse(textual_date, languages=["pt"]).date()
+
+            # As this case is a collection of gazettes for the full month,
+            # we consider the gazette date to be the last day of that month.
+            last_day_of_the_month: int = calendar.monthrange(
+                year=gazette_date.year, month=gazette_date.month
+            )[1]
+            gazette_date = gazette_date.replace(day=last_day_of_the_month)
+
+            return gazette_date
+
+        self.logger.warning(f"No date could be extracted from '{text}'")
+        return gazette_date
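The two date shapes handled above can be exercised directly; a quick sketch
(the sample strings are invented but follow the site's visible heading format,
and the outputs assume dateparser resolves Portuguese month names as documented):

    import calendar

    import dateparser

    full_date = dateparser.parse("14 de Agosto de 2021", languages=["pt"]).date()
    # full_date == date(2021, 8, 14)

    month_only = dateparser.parse("01 de Outubro de 2012", languages=["pt"]).date()
    last_day = calendar.monthrange(month_only.year, month_only.month)[1]  # 31
    month_only = month_only.replace(day=last_day)
    # month_only == date(2012, 10, 31)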
+
+    def validate_date_condition_over_rows(
+        self, remaining_rows_for_current_page, condition: Callable[[date], bool]
+    ) -> date | None:
+        """Returns the gazette date of the first row satisfying the condition.
+
+        If no row satisfies the condition, it returns the last date that could
+        be extracted, or None when no row yields a valid date at all.
+        """
+        gazette_date: date | None = None
+
+        for row_element in remaining_rows_for_current_page:
+            gazette_text = row_element.css("h4::text").get("").strip()
+            if not gazette_text:
+                continue
+
+            gazette_date = self.extract_date_from_gazette_text(gazette_text)
+            if not gazette_date:
+                continue
+
+            if condition(gazette_date):
+                return gazette_date
+
+        return gazette_date
+
+    def find_first_valid_date_in_rows(
+        self, remaining_rows_for_current_page
+    ) -> date | None:
+        return self.validate_date_condition_over_rows(
+            remaining_rows_for_current_page, condition=lambda date_: date_ is not None
+        )
+
+    def find_end_date_or_earlier_in_rows(
+        self, remaining_rows_for_current_page
+    ) -> date | None:
+        return self.validate_date_condition_over_rows(
+            remaining_rows_for_current_page,
+            condition=lambda date_: date_ <= self.end_date,
+        )
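The scan-plus-condition contract can be modeled in isolation; a small sketch
with made-up dates, showing both the match case and the fallback return value:

    from datetime import date
    from typing import Callable

    def first_matching(dates: list[date], condition: Callable[[date], bool]) -> date | None:
        found = None
        for d in dates:
            found = d  # remember the last valid date seen
            if condition(d):
                return d
        return found  # no match: last date seen, or None for an empty input

    rows = [date(2022, 10, 5), date(2022, 10, 4), date(2022, 10, 3)]
    print(first_matching(rows, lambda d: d <= date(2022, 10, 4)))  # 2022-10-04
    print(first_matching(rows, lambda d: d <= date(2022, 9, 1)))   # 2022-10-03 (fallback)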
+
+    def find_end_date_page(
+        self,
+        response: Response,
+        current_page_number: int,
+        search_towards_the_past: bool | None,
+    ) -> Generator:
+        """Determine the page for end_date.
+
+        In most cases, the tentative_page_number_associated_with_end_date
+        strategy is a conservative approach: as of 2022-10-05, there are
+        consistently one or more gazettes per non-Saturday, non-Sunday
+        weekday. This means that, to find the actual page associated with
+        end_date, we usually have to browse toward greater page numbers.
+
+        However, we might face a period when, for whatever reason, the
+        gazettes were not published as expected. The fallback strategy
+        consists in browsing pages with lower page numbers than the
+        tentative_page_number_associated_with_end_date value.
+
+        The `search_towards_the_past` parameter controls the direction of
+        the search:
+        - None: we still need to determine which way to search for the
+          end_date page
+        - False: the tentative page number overshot, because more days than
+          expected had no published gazette, so we browse toward lower page
+          numbers (newer gazettes)
+        - True: the tentative page number worked, and we need to find
+          end_date in pages associated with earlier gazettes (higher page
+          numbers)
+        """
+
+        was_end_date_page_found = False
+        remaining_rows_for_current_page = iter(response.css("ul.ul-licitacoes li"))
+
+        gazette_date = self.find_first_valid_date_in_rows(
+            remaining_rows_for_current_page
+        )
+        if not gazette_date:
+            raise ValueError(f"No valid dates were found for this page: {response.url}")
+
+        if not search_towards_the_past:
+            # This branch covers search_towards_the_past being None or False
+            if gazette_date < self.end_date:
+                # The first valid date is earlier than end_date
+
+                if current_page_number <= self.MINIMUM_ALLOWED_PAGE_NUMBER:
+                    # We already reached the most recent page,
+                    # so we will start triaging the data
+                    was_end_date_page_found = True
+                else:
+                    # We need to retrieve pages associated with newer gazettes
+                    search_towards_the_past = False
+            else:
+                search_towards_the_past = True
+
+        if (
+            search_towards_the_past
+        ):  # This condition should not be joined with the one above
+            # find_end_date_or_earlier_in_rows may return None when the
+            # remaining rows yield no valid date, so guard the comparison
+            found_date = self.find_end_date_or_earlier_in_rows(
+                remaining_rows_for_current_page
+            )
+            was_end_date_page_found = gazette_date <= self.end_date or (
+                found_date is not None and found_date <= self.end_date
+            )
+
+        if was_end_date_page_found:
+            # As the page was found, we finally start triaging the data
+            yield Request(
+                response.url,
+                callback=self.triage_data_per_page,
+                dont_filter=True,  # as we are requesting the same URL, we don't want to filter it
+            )
+        else:
+            assert search_towards_the_past is not None
+            if search_towards_the_past:
+                next_call_page_number = current_page_number + 1
+            else:
+                next_call_page_number = current_page_number - 1
+            yield Request(
+                url=self.BASE_URL.format(PAGE_NUMBER=next_call_page_number),
+                callback=self.find_end_date_page,
+                cb_kwargs={
+                    "current_page_number": next_call_page_number,
+                    "search_towards_the_past": search_towards_the_past,
+                },
+            )
+
+
+class RjCamposDosGoytacazesSpider(DetermineEndDatePageMixin, BaseGazetteSpider):
+    TERRITORY_ID = "3301009"
+
+    allowed_domains = ["www.campos.rj.gov.br"]
+    name = "rj_campos_dos_goytacazes"
+
+    start_date = date(2010, 6, 10)
+    # November 17th, 2017 was the date of the last Diário Oficial gazette and
+    # also the date of the first Diário Oficial Eletrônico gazette
+
+    def __init__(self, *args, **kwargs) -> None:
+        super().__init__(*args, **kwargs)
+        self.current_date = None
+        self.current_edition_number = ""
+        self.collected_data_for_current_date = {}
+        self.triaged_data_by_date = {}
+
+    def start_requests(self) -> Generator:
+        tentative_page_number_associated_with_end_date: int = (
+            self.calculate_tentative_page_number_associated_with_end_date()
+        )
+
+        yield Request(
+            url=self.BASE_URL.format(
+                PAGE_NUMBER=tentative_page_number_associated_with_end_date
+            ),
+            callback=self.find_end_date_page,
+            cb_kwargs={
+                "current_page_number": tentative_page_number_associated_with_end_date,
+                "search_towards_the_past": None,
+            },
+            dont_filter=True,  # the page may have already been requested when determining the end_date page
+        )
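The core of the direction decision in find_end_date_page above can be read as
a small pure function; a sketch of the same branching (ignoring the
already-found cases) with invented dates:

    from datetime import date

    def choose_direction(first_date_on_page: date, end_date: date) -> bool:
        # The first row of a page holds that page's most recent gazette.
        if first_date_on_page < end_date:
            return False  # page is already too old: lower page numbers (newer)
        return True  # page is still too new: higher page numbers (older)

    print(choose_direction(date(2022, 8, 24), end_date=date(2022, 9, 20)))  # False
    print(choose_direction(date(2022, 10, 5), end_date=date(2022, 9, 20)))  # True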
+
+    def triage_data_per_row(self, gazette_text: str) -> tuple[date | None, bool, str]:
+        """Triage the gazette data found in a single page row.
+
+        It returns
+            the extracted gazette date,
+            whether it is an extra edition, and
+            the edition number when applicable.
+        """
+        gazette_date: date | None = None
+        is_extra_edition: bool = (
+            gazette_text.startswith("Suplemento") or "Extra" in gazette_text
+        )
+        edition_number: str = ""
+
+        if not gazette_text:
+            return gazette_date, is_extra_edition, edition_number
+
+        gazette_date = self.extract_date_from_gazette_text(gazette_text)
+        if (
+            not gazette_date
+            or gazette_date < self.start_date
+            or self.end_date < gazette_date
+        ):
+            return gazette_date, is_extra_edition, edition_number
+
+        edition_number_match = EDITION_NUMBER_RE.search(gazette_text)
+        if edition_number_match:
+            edition_number = edition_number_match.group(1).strip()
+
+        return gazette_date, is_extra_edition, edition_number
+
+    def instantiate_gazettes_and_reset_stored_data(self) -> list[Gazette]:
+        if not self.current_date:
+            return []
+
+        gazettes: list[Gazette] = [
+            Gazette(
+                date=self.current_date,
+                edition_number=self.current_edition_number,
+                file_urls=file_urls,
+                is_extra_edition=is_extra_edition,
+                power="executive",
+            )
+            for (
+                is_extra_edition,
+                file_urls,
+            ) in self.collected_data_for_current_date.items()
+        ]
+
+        self.current_date = None
+        self.current_edition_number = ""
+        self.collected_data_for_current_date = {}
+        return gazettes
+
+    def triage_data_per_page(self, response) -> Generator:
+        """Triage the gazette data from a page.
+
+        Once we determine that all the data in the date range was triaged,
+        we collect the gazettes.
+
+        Otherwise, we keep triaging from the next page.
+        """
+
+        is_gazette_date_before_start_date: bool = False
+        for row_element in response.css("ul.ul-licitacoes li"):
+            gazette_text = row_element.css("h4::text").get("").strip()
+            file_url = row_element.css("a::attr(href)").get("").strip()
+
+            gazette_date, is_extra_edition, edition_number = self.triage_data_per_row(
+                gazette_text
+            )
+
+            if not gazette_date:
+                continue
+
+            if gazette_date < self.start_date:
+                is_gazette_date_before_start_date = True
+                break
+
+            if (
+                self.current_edition_number != edition_number
+                or self.current_date != gazette_date
+            ):
+                for gazette in self.instantiate_gazettes_and_reset_stored_data():
+                    yield gazette
+
+            self.current_edition_number = edition_number
+            self.current_date = gazette_date
+            self.collected_data_for_current_date.setdefault(
+                is_extra_edition, []
+            ).append(file_url)
+
+        next_url = (
+            response.css(".pagination")
+            .xpath("//a[contains(text(), 'Proxima')]/@href")
+            .get()
+        )
+
+        if is_gazette_date_before_start_date:
+            # Collect the gazettes using the triaged data
+            for gazette in self.instantiate_gazettes_and_reset_stored_data():
+                yield gazette
+        elif next_url:
+            # Keep triaging data
+            yield Request(
+                response.urljoin(next_url),
+                callback=self.triage_data_per_page,
+                dont_filter=True,  # the page may have already been requested when determining the end_date page
+            )
+        else:
+            # Corner case: when the spider runs without an explicit start_date,
+            # we can reach the last page without ever seeing a gazette older
+            # than start_date, so is_gazette_date_before_start_date stays False
+            # and the remaining stored data still needs to be collected
+            for gazette in self.instantiate_gazettes_and_reset_stored_data():
+                yield gazette
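The grouping that motivates this refactor, namely several file_urls per day
split by is_extra_edition, boils down to the setdefault call above; a tiny
sketch with invented URLs:

    collected: dict[bool, list[str]] = {}
    for is_extra, url in [(False, "a.pdf"), (True, "b.pdf"), (False, "c.pdf")]:
        collected.setdefault(is_extra, []).append(url)
    # collected == {False: ["a.pdf", "c.pdf"], True: ["b.pdf"]}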
diff --git a/data_collection/gazette/spiders/rj_campos_goytacazes.py b/data_collection/gazette/spiders/rj_campos_goytacazes.py
deleted file mode 100644
index 8501874ee8..0000000000
--- a/data_collection/gazette/spiders/rj_campos_goytacazes.py
+++ /dev/null
@@ -1,62 +0,0 @@
-import re
-
-import dateparser
-from scrapy import Request
-
-from gazette.items import Gazette
-from gazette.spiders.base import BaseGazetteSpider
-
-
-class RjCampoGoytacazesSpider(BaseGazetteSpider):
-    TERRITORY_ID = "3301009"
-
-    allowed_domains = ["www.campos.rj.gov.br"]
-    name = "rj_campos_goytacazes"
-    start_urls = [
-        "https://www.campos.rj.gov.br/diario-oficial.php?PGpagina=1&PGporPagina=15"
-    ]
-
-    def parse(self, response):
-        """
-        @url https://www.campos.rj.gov.br/diario-oficial.php?PGpagina=1&PGporPagina=15
-        @returns requests 1
-        @returns items 15 15
-        @scrapes date file_urls is_extra_edition power
-        """
-
-        for element in response.css("ul.ul-licitacoes li"):
-            gazette_text = element.css("h4::text").get("")
-
-            date_re = re.search(r"(\d{2} de (.*) de \d{4})", gazette_text)
-            if not date_re:
-                continue
-
-            date = date_re.group(0)
-            # The extra edition for August 28th, 2018 has a typo in the month name.
-            date = date.replace("Agosoto", "Agosto")
-            # The edition for December 17th, 2012 has a typo in the month name.
-            date = date.replace("Dezembrbo", "Dezembro")
-            date = dateparser.parse(date, languages=["pt"]).date()
-
-            path_to_gazette = element.css("a::attr(href)").get().strip()
-            # From November 17th, 2017 and backwards the path to the gazette PDF
-            # is relative.
-            if path_to_gazette.startswith("up/diario_oficial.php"):
-                path_to_gazette = response.urljoin(path_to_gazette)
-
-            is_extra_edition = gazette_text.startswith("Suplemento")
-
-            yield Gazette(
-                date=date,
-                file_urls=[path_to_gazette],
-                is_extra_edition=is_extra_edition,
-                power="executive",
-            )
-
-        next_url = (
-            response.css(".pagination")
-            .xpath("//a[contains(text(), 'Proxima')]/@href")
-            .get()
-        )
-        if next_url:
-            yield Request(response.urljoin(next_url))
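For reference, the @url/@returns/@scrapes annotations in the deleted parse
docstring are Scrapy contracts; before this refactor, they could be exercised
with the stock contracts runner:

    scrapy check rj_campos_goytacazes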