Skip to content

Commit

Permalink
Update Santana-AP spider removing bare exception
Browse files Browse the repository at this point in the history
  • Loading branch information
rennerocha committed Oct 28, 2022
1 parent 12410d3 commit 970982d
Showing 1 changed file with 15 additions and 13 deletions.
28 changes: 15 additions & 13 deletions data_collection/gazette/spiders/ap_santana.py
Original file line number Diff line number Diff line change
@@ -1,37 +1,39 @@
import re

from datetime import date, datetime

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class ApSantanaSpider(BaseGazetteSpider):
    """Spider that collects official gazettes published by Santana (AP).

    The spider requests the site's ``datatables_endpoint`` AJAX action, which
    is read as JSON, and yields one ``Gazette`` item for every listed entry
    that falls inside the crawl window and exposes a download link.
    """

    TERRITORY_ID = "1600600"
    name = "ap_santana"
    allowed_domains = ["santana.ap.gov.br"]
    start_urls = [
        "https://www.santana.ap.gov.br/wp-admin/admin-ajax.php?action=datatables_endpoint"
    ]
    # Entries dated before this day are skipped by ``parse``.
    start_date = date(2019, 1, 8)

    def parse(self, response):
        """Yield a ``Gazette`` for each usable entry in the JSON listing.

        Args:
            response: Response for the endpoint in ``start_urls``. Its JSON
                body must contain a ``"data"`` list whose items carry a
                ``"data"`` field (date as ``dd/mm/YYYY``) and an ``"arquivo"``
                field (HTML snippet holding the file link).

        Yields:
            Gazette: one item per entry whose date lies between
            ``self.start_date`` and ``self.end_date`` (inclusive) and whose
            ``"arquivo"`` HTML contains a link labelled "Baixar".
        """
        for gazette in response.json()["data"]:
            gazette_date = datetime.strptime(gazette["data"], "%d/%m/%Y").date()

            # NOTE(review): ``end_date`` is not defined in this class;
            # presumably it is provided by BaseGazetteSpider -- confirm.
            if gazette_date < self.start_date or self.end_date < gazette_date:
                continue

            file_html = gazette["arquivo"]
            # Only accept the anchor whose text starts with "Baixar"
            # (Portuguese for "download"). An explicit no-match check replaces
            # the former bare ``except:``, so unrelated errors are no longer
            # silently swallowed.
            file_url_match = re.search(
                r'href=[\'"]?([^\'" >]+)[\'"]>Baixar', file_html
            )
            if not file_url_match:
                continue

            file_url = file_url_match.group(1)

            yield Gazette(
                date=gazette_date,
                file_urls=[
                    file_url,
                ],
                is_extra_edition=False,
                power="executive",
            )

0 comments on commit 970982d

Please sign in to comment.