start_date and end_date handling for Campos dos Goytacazes-RJ spider

resolve okfn-brasil#637
ayharano · Oct 5, 2022 · 830b411
1 parent d04b95c
commit 830b411
Showing 1 changed file with 108 additions and 17 deletions.
diff --git a/data_collection/gazette/spiders/rj_campos_goytacazes.py b/data_collection/gazette/spiders/rj_campos_goytacazes.py
@@ -1,4 +1,5 @@
 import re
+from datetime import date, datetime, timedelta
 
 import dateparser
 from scrapy import Request
@@ -7,14 +8,93 @@
 from gazette.spiders.base import BaseGazetteSpider
 
 
-class RjCampoGoytacazesSpider(BaseGazetteSpider):
+class RjCamposDosGoytacazesSpider(BaseGazetteSpider):
     TERRITORY_ID = "3301009"
 
     allowed_domains = ["www.campos.rj.gov.br"]
-    name = "rj_campos_goytacazes"
-    start_urls = [
-        "https://www.campos.rj.gov.br/diario-oficial.php?PGpagina=1&PGporPagina=15"
-    ]
+    name = "rj_campos_dos_goytacazes"
+    BASE_URL = (
+        "https://www.campos.rj.gov.br/diario-oficial.php"
+        "?PGpagina={PAGE_NUMBER}&PGporPagina=15"
+    )
+    # the current gazette system only allows at most this number of rows per page,
+    # even when explicitly requesting more than that
+    MAX_ROWS_PER_PAGE = 15
+
+    start_date = date(2010, 6, 10)
+
+    def _calculate_tentative_page_number_associated_with_end_date(self) -> int:
+        """Determine the page number that the end_date gazette might be at.
+
+        Facts for the design of this method:
+            - The first page of the pagination contains the most recent gazette.
+            - We assume that most Saturdays and Sundays have no gazette.
+                Exception example:
+                    Diário Oficial Eletrônico de 14 de Agosto de 2021 - Edição Extra
+            - Even though the number of rows for the other days may vary from
+              zero to more than one, we assume that each non-Saturday and
+              non-Sunday day has exactly one gazette.
+            - Considering the potential variation of established conditions,
+              such as not having a gazette or having multiple rows for the same day,
+              we tentatively set that the target end_date gazette might be available on
+              the calculated page number or one before that.
+
+        This method adopts the following heuristic: we count the non-Saturday,
+        non-Sunday days between the target end_date and the day this method
+        runs (both inclusive), perform an integer division of that count by
+        the maximum number of rows on a page, and use the result as
+        the chosen page number.
+
+        We only replace the calculated number when it is less than one:
+        for that case, we replace it with 1, as the page numbering begins at 1.
+
+        It returns a non-zero positive int.
+        """
+
+        today: date = datetime.today().date()
+
+        if today <= self.end_date:
+            return 1
+
+        non_saturday_nor_sunday_day_count: int = 0
+        current_day: date = self.end_date
+        one_day_timedelta: timedelta = timedelta(days=1)
+        saturday_and_sunday_set = {
+            6,  # Saturday
+            7,  # Sunday
+        }
+        while current_day <= today:
+            if current_day.isoweekday() not in saturday_and_sunday_set:
+                non_saturday_nor_sunday_day_count += 1
+            current_day = current_day + one_day_timedelta
+
+        self.logger.info(
+            f"Number of non-Saturday and non-Sunday days from {self.end_date} to"
+            f" {today}, inclusive: {non_saturday_nor_sunday_day_count}"
+        )
+
+        tentative_page_number_associated_with_end_date: int = (
+            non_saturday_nor_sunday_day_count // self.MAX_ROWS_PER_PAGE
+        )
+
+        if tentative_page_number_associated_with_end_date < 1:
+            tentative_page_number_associated_with_end_date = 1
+
+        return tentative_page_number_associated_with_end_date
+
+    def start_requests(self):
+        tentative_page_number_associated_with_end_date: int = (
+            self._calculate_tentative_page_number_associated_with_end_date()
+        )
+        self.logger.info(
+            f"Tentative page number for {self.end_date} calculated"
+            f" as {tentative_page_number_associated_with_end_date}"
+        )
+
+        initial_url = self.BASE_URL.format(
+            PAGE_NUMBER=tentative_page_number_associated_with_end_date
+        )
+        yield Request(url=initial_url)
 
     def parse(self, response):
         """
@@ -24,19 +104,29 @@ def parse(self, response):
         @scrapes date file_urls is_extra_edition power
         """
 
+        is_gazette_date_before_start_date = False
         for element in response.css("ul.ul-licitacoes li"):
             gazette_text = element.css("h4::text").get("")
 
             date_re = re.search(r"(\d{2} de (.*) de \d{4})", gazette_text)
             if not date_re:
                 continue
 
-            date = date_re.group(0)
+            textual_date = date_re.group(0)
             # The extra edition for August 28th, 2018 has a typo in the month name.
-            date = date.replace("Agosoto", "Agosto")
+            textual_date = textual_date.replace("Agosoto", "Agosto")
             # The edition for December 17th, 2012 has a typo in the month name.
-            date = date.replace("Dezembrbo", "Dezembro")
-            date = dateparser.parse(date, languages=["pt"]).date()
+            textual_date = textual_date.replace("Dezembrbo", "Dezembro")
+            gazette_date = dateparser.parse(textual_date, languages=["pt"]).date()
+
+            if self.end_date < gazette_date:
+                # the gazette listing goes from the newest to the oldest,
+                # except for extra edition or additional files
+                continue
+
+            if gazette_date < self.start_date:
+                is_gazette_date_before_start_date = True
+                break
 
             path_to_gazette = element.css("a::attr(href)").get().strip()
             # From November 17th, 2017 and backwards the path to the gazette PDF
@@ -47,16 +137,17 @@ def parse(self, response):
             is_extra_edition = gazette_text.startswith("Suplemento")
 
             yield Gazette(
-                date=date,
+                date=gazette_date,
                 file_urls=[path_to_gazette],
                 is_extra_edition=is_extra_edition,
                 power="executive",
             )
 
-        next_url = (
-            response.css(".pagination")
-            .xpath("//a[contains(text(), 'Proxima')]/@href")
-            .get()
-        )
-        if next_url:
-            yield Request(response.urljoin(next_url))
+        if not is_gazette_date_before_start_date:
+            next_url = (
+                response.css(".pagination")
+                .xpath("//a[contains(text(), 'Proxima')]/@href")
+                .get()
+            )
+            if next_url:
+                yield Request(response.urljoin(next_url))