Update Santana-AP spider, removing the bare `except` clause

ayharano · Oct 28, 2022 · 970982d · 970982d
1 parent 12410d3
commit 970982d
Showing 1 changed file with 15 additions and 13 deletions.
diff --git a/data_collection/gazette/spiders/ap_santana.py b/data_collection/gazette/spiders/ap_santana.py
@@ -1,37 +1,39 @@
 import re
-
 from datetime import date, datetime
 
 from gazette.items import Gazette
 from gazette.spiders.base import BaseGazetteSpider
 
+
 class ApSantanaSpider(BaseGazetteSpider):
     TERRITORY_ID = "1600600"
     name = "ap_santana"
     allowed_domains = ["santana.ap.gov.br"]
-    start_urls = ["https://www.santana.ap.gov.br/wp-admin/admin-ajax.php?action=datatables_endpoint"]
+    start_urls = [
+        "https://www.santana.ap.gov.br/wp-admin/admin-ajax.php?action=datatables_endpoint"
+    ]
 
     start_date = date(2019, 1, 8)
 
     def parse(self, response):
         for gazette in response.json()["data"]:
-            gazette_date = datetime.strptime(
-                gazette["data"], "%d/%m/%Y"
-            ).date()
-
+            gazette_date = datetime.strptime(gazette["data"], "%d/%m/%Y").date()
+
             if gazette_date < self.start_date or self.end_date < gazette_date:
                 continue
-
-            try:
-                file_html = gazette["arquivo"].split('|')[0]
-                file_url = re.search(r'href=[\'"]?([^\'" >]+)', file_html).group(1)
-            except:
+
+            file_html = gazette["arquivo"]
+            file_url_match = re.search(r'href=[\'"]?([^\'" >]+)[\'"]>Baixar', file_html)
+            if not file_url_match:
                 continue
 
+            file_url = file_url_match.group(1)
+
             yield Gazette(
                 date=gazette_date,
-                file_urls=[file_url],
+                file_urls=[
+                    file_url,
+                ],
                 is_extra_edition=False,
                 power="executive",
             )
-