Skip to content

Commit

Permalink
new spider sp_cacapava
Browse files Browse the repository at this point in the history
  • Loading branch information
almeidadm committed Feb 3, 2024
1 parent 6cff16c commit ba305c7
Showing 1 changed file with 44 additions and 0 deletions.
44 changes: 44 additions & 0 deletions data_collection/gazette/spiders/sp/sp_cacapava.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
import re
from datetime import date, datetime

from scrapy.http import Request

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class SpCacapavaSpider(BaseGazetteSpider):
    """Spider for the official gazette of Caçapava - SP (IBGE 3508504).

    Crawls the municipal gazette listing at cacapava.sp.gov.br, walks every
    result page, and yields one ``Gazette`` item per published edition.
    """

    TERRITORY_ID = "3508504"
    name = "sp_cacapava"
    allowed_domains = ["cacapava.sp.gov.br", "ecrie.com.br"]
    start_date = date(2021, 4, 27)
    # Throttle politely: the municipal server is small and rate-sensitive.
    custom_settings = {"DOWNLOAD_DELAY": 0.5, "RANDOMIZE_DOWNLOAD_DELAY": True}

    def start_requests(self):
        """Request the gazette listing filtered to the crawl's date window."""
        url = (
            "https://cacapava.sp.gov.br/diario-oficial?"
            f'&dataDe={self.start_date.strftime("%d/%m/%Y")}'
            f'&dataAte={self.end_date.strftime("%d/%m/%Y")}'
        )
        yield Request(url, callback=self.parse_info)

    def parse_info(self, response):
        """Read the pagination widget and schedule a request per result page."""
        # The last <option> of the pagination <select> carries the total
        # number of result pages for the filtered listing.
        num_pages = int(
            response.css(".pagination__select option::text")[-1].get()
        )
        for page in range(1, num_pages + 1):
            # Each page is fetched with the default `parse` callback.
            yield Request(f"{response.url}&pagina={page}")

    def parse(self, response):
        """Extract a Gazette item from each entry of a listing page."""
        for gazette in response.css(".list-item__info"):
            # Raw strings for the patterns: "\d" in a plain string literal is
            # an invalid escape sequence (DeprecationWarning, future error).
            # re.search replaces the re.findall(...)[0] idiom: one pass,
            # stops at the first match.
            gazette_number = re.search(
                r"Edição nº (\d+)", gazette.css("h3::text").get()
            ).group(1)
            raw_date = re.search(
                r"\d{2}/\d{2}/\d{4}", gazette.css("p::text").get()
            ).group()
            gazette_date = datetime.strptime(raw_date, "%d/%m/%Y").date()
            gazette_url = gazette.css("a").attrib["href"]

            yield Gazette(
                date=gazette_date,
                edition_number=gazette_number,
                is_extra_edition=False,
                power="executive_legislative",
                file_urls=[gazette_url],
            )

0 comments on commit ba305c7

Please sign in to comment.