Campos dos Goytacazes-RJ spider refactor
The spider's previous implementation assumed that there could be only a single file_url per day per is_extra_edition value, which was not always true. This refactoring gathers all the files for each day and is_extra_edition value. The existing code also did not recognize the text format that marks Saturday gazettes as extra editions. We additionally included start_date and end_date handling. Resolves okfn-brasil#637
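The grouping this refactoring introduces can be sketched in isolation as follows (a minimal illustration of the data structure, not the spider itself; the dates and URLs here are hypothetical):

from datetime import date

collected_data_by_date = {}
rows = [
    # two files published for the same date, neither one an extra edition
    (date(2022, 5, 31), False, "https://example.com/edicao-1100.pdf"),
    (date(2022, 5, 31), False, "https://example.com/edicao-1100-caderno-2.pdf"),
]
for gazette_date, is_extra_edition, file_url in rows:
    collected_data_by_date.setdefault(gazette_date, {}).setdefault(
        is_extra_edition, []
    ).append(file_url)

# collected_data_by_date[date(2022, 5, 31)][False] now lists both URLs,
# so a single Gazette item can carry all of a day's files in file_urls.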
Alex Harano committed Oct 5, 2022
1 parent d04b95c, commit d6cc9e0
Showing 2 changed files with 239 additions and 62 deletions.
data_collection/gazette/spiders/rj_campos_dos_goytacazes.py (239 additions, 0 deletions)
@@ -0,0 +1,239 @@
import re
from datetime import date, datetime, timedelta

import dateparser
from scrapy import Request

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class RjCamposDosGoytacazesSpider(BaseGazetteSpider):
    TERRITORY_ID = "3301009"

    allowed_domains = ["www.campos.rj.gov.br"]
    name = "rj_campos_dos_goytacazes"
    BASE_URL = (
        "https://www.campos.rj.gov.br/diario-oficial.php"
        "?PGpagina={PAGE_NUMBER}&PGporPagina=15"
    )
    # the current gazette system only allows at most this number of rows per page,
    # even when explicitly requesting more than that
    MAX_ROWS_PER_PAGE = 15

    start_date = date(2010, 6, 10)

    def __init__(self, *args, **kwargs):
        super(RjCamposDosGoytacazesSpider, self).__init__(*args, **kwargs)
        self.collected_data_by_date = {}

    def _calculate_tentative_page_number_associated_with_end_date(self) -> int:
        """Determine the page number at which the end_date gazette might be.

        Facts behind the design of this method:

        - The first page of the pagination contains the most recent gazette.
        - We consider that most Saturdays and Sundays have no gazette.
          Exception example:
              Diário Oficial Eletrônico de 14 de Agosto de 2021 - Edição Extra
        - Even though the number of rows for the other days may vary from
          zero to more than one, we consider that each non-Saturday,
          non-Sunday day has exactly one gazette.
        - Given how these conditions may vary, such as a day having no
          gazette or having multiple rows, we only assume tentatively that
          the target end_date gazette is available on the calculated page
          number or on one before it.

        The heuristic: count the non-Saturday, non-Sunday days from the day
        this method runs back to the target end_date, then integer-divide
        that count by the maximum number of rows per page; the result is the
        chosen page number. The calculated number is replaced only when it
        is less than one: in that case it becomes 1, as the page numbering
        begins at 1.

        It returns a non-zero positive int.
        """
        today: date = datetime.today().date()

        if today <= self.end_date:
            return 1

        non_saturday_nor_sunday_day_count: int = 0
        current_day: date = self.end_date
        one_day_timedelta: timedelta = timedelta(days=1)
        saturday_and_sunday_set = {
            6,  # Saturday
            7,  # Sunday
        }
        while current_day <= today:
            if current_day.isoweekday() not in saturday_and_sunday_set:
                non_saturday_nor_sunday_day_count += 1
            current_day = current_day + one_day_timedelta

        self.logger.info(
            f"Number of non-Saturday and non-Sunday days from {self.end_date} to"
            f" {today}, inclusive: {non_saturday_nor_sunday_day_count}"
        )

        tentative_page_number_associated_with_end_date: int = (
            non_saturday_nor_sunday_day_count // self.MAX_ROWS_PER_PAGE
        )

        if tentative_page_number_associated_with_end_date < 1:
            tentative_page_number_associated_with_end_date = 1

        return tentative_page_number_associated_with_end_date

    def start_requests(self):
        tentative_page_number_associated_with_end_date: int = (
            self._calculate_tentative_page_number_associated_with_end_date()
        )
        self.logger.info(
            f"Tentative page number for {self.end_date} calculated"
            f" as {tentative_page_number_associated_with_end_date}"
        )

        initial_url = self.BASE_URL.format(
            PAGE_NUMBER=tentative_page_number_associated_with_end_date
        )
        yield Request(
            url=initial_url,
            callback=self.per_page,
            cb_kwargs={
                "searching_for_end_date": True,
                "current_page_number": tentative_page_number_associated_with_end_date,
            },
        )

    def _extract_date_from_gazette_text(self, gazette_text):
        date_re = re.search(r"(\d{2} de (.*) de \d{4})", gazette_text)
        if not date_re:
            self.logger.warning(f"No date could be extracted from '{gazette_text}'")
            return

        textual_date = date_re.group(0)
        # The extra edition for August 28th, 2018 has a typo in the month name.
        textual_date = textual_date.replace("Agosoto", "Agosto")
        # The edition for December 17th, 2012 has a typo in the month name.
        textual_date = textual_date.replace("Dezembrbo", "Dezembro")
        gazette_date = dateparser.parse(textual_date, languages=["pt"]).date()
        return gazette_date

    def per_row_and_store_if_valid(self, response, row_element):
        """Extract gazette data to collect from a page row.

        It returns the extracted gazette date.
        """
        gazette_text = row_element.css("h4::text").get("")

        gazette_date = self._extract_date_from_gazette_text(gazette_text)
        if not gazette_date:
            return gazette_date

        if gazette_date < self.start_date or self.end_date < gazette_date:
            # the gazette listing goes from the newest to the oldest,
            # except for extra editions or additional files
            return gazette_date

        path_to_gazette = row_element.css("a::attr(href)").get().strip()
        # From November 17th, 2017 and backwards, the path to the gazette PDF
        # is relative.
        if path_to_gazette.startswith("up/diario_oficial.php"):
            path_to_gazette = response.urljoin(path_to_gazette)

        # As the rows are analyzed top-down and multiple entries for
        # the same day keep that order, we rely on it to compose the
        # list of file_urls when a gazette has multiple files.
        #
        # Example of a regular gazette with multiple files:
        #   Diário Oficial Eletrônico de 31 de Maio de 2022 - Edição - 1100
        #   Diário Oficial Eletrônico de 31 de Maio de 2022 - Edição - 1100 - CADERNO 2
        #
        # Example of an extra edition with multiple files:
        #   Suplemento I do Diário Oficial Eletrônico de 05 de Abril de 2022 - Edição - 1064
        #   Suplemento II do Diário Oficial Eletrônico de 05 de Abril de 2022 - Edição - 1064 - ERRATA
        #
        # Example of a gazette on a Saturday:
        #   Diário Oficial Eletrônico de 14 de Agosto de 2021 - Edição Extra

        is_extra_edition = (
            gazette_text.startswith("Suplemento") or "Extra" in gazette_text
        )

        file_urls_by_is_extra_edition_for_the_date = (
            self.collected_data_by_date.setdefault(gazette_date, {})
        )

        file_urls_by_is_extra_edition_for_the_date.setdefault(
            is_extra_edition, []
        ).append(path_to_gazette)
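        # For instance, after the two "Suplemento ... 05 de Abril de 2022"
        # rows from the example above, the collected data would look like
        # (illustrative; real URLs omitted):
        #   self.collected_data_by_date[date(2022, 4, 5)] == {
        #       True: ["<url of Suplemento I>", "<url of Suplemento II - ERRATA>"],
        #   }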
        return gazette_date

    def per_page(self, response, current_page_number, searching_for_end_date=False):
        """Extract gazette data to collect from a page."""
        is_gazette_date_before_start_date = False
        for row_element in response.css("ul.ul-licitacoes li"):
            gazette_date = self.per_row_and_store_if_valid(response, row_element)

            if not gazette_date:
                continue

            if searching_for_end_date:
                if self.end_date < gazette_date:
                    if current_page_number <= 1:
                        # We already reached the most recent page
                        searching_for_end_date = False
                    else:
                        # The tentative_page_number_associated_with_end_date strategy
                        # failed, and we need to go to a more recent gazette page
                        previous_page_number = current_page_number - 1
                        previous_page_url = self.BASE_URL.format(
                            PAGE_NUMBER=previous_page_number,
                        )
                        yield Request(
                            url=previous_page_url,
                            callback=self.per_page,
                            cb_kwargs={
                                "searching_for_end_date": True,
                                "current_page_number": previous_page_number,
                            },
                        )
                else:
                    searching_for_end_date = False

            if gazette_date < self.start_date:
                is_gazette_date_before_start_date = True
                break

        if is_gazette_date_before_start_date:
            # Process all the collected data per date and then yield the gazettes
            for (
                gazette_date,
                file_urls_by_is_extra_edition,
            ) in self.collected_data_by_date.items():
                for (
                    is_extra_edition,
                    file_urls,
                ) in file_urls_by_is_extra_edition.items():
                    yield Gazette(
                        date=gazette_date,
                        file_urls=file_urls,
                        is_extra_edition=is_extra_edition,
                        power="executive",
                    )
        else:
            # We still need to collect data from the next page
            next_url = (
                response.css(".pagination")
                .xpath("//a[contains(text(), 'Proxima')]/@href")
                .get()
            )
            if next_url:
                yield Request(
                    response.urljoin(next_url),
                    callback=self.per_page,
                    cb_kwargs={"current_page_number": current_page_number + 1},
                )
The second changed file, the spider's previous implementation, was deleted (accounting for the 62 deletions).