Skip to content

Commit

Permalink
Algumas melhorias para simplificar o raspador
Browse files (browse the repository at this point in the history)
Signed-off-by: Renne Rocha <[email protected]>
  • Loading branch information
rennerocha authored Apr 19, 2024
1 parent ba305c7 commit 6f8539e
Showing 1 changed file with 16 additions and 17 deletions.
33 changes: 16 additions & 17 deletions data_collection/gazette/spiders/sp/sp_cacapava.py
Original file line number | Diff line number | Diff line change
Expand Up @@ -15,29 +15,28 @@ class SpCacapavaSpider(BaseGazetteSpider):
custom_settings = {"DOWNLOAD_DELAY": 0.5, "RANDOMIZE_DOWNLOAD_DELAY": True}

def start_requests(self):
url = "https://cacapava.sp.gov.br/diario-oficial?"
url += f'&dataDe={self.start_date.strftime("%d/%m/%Y")}'
url += f'&dataAte={self.end_date.strftime("%d/%m/%Y")}'
yield Request(url, callback=self.parse_info)

def parse_info(self, response):
base_url = response.url
num_pages = response.css(".pagination__select option::text")[-1].get()
for i in range(1, int(num_pages) + 1):
yield Request(f"{base_url}&pagina={i}")
data_de = self.start_date.strftime("%d/%m/%Y")
data_ate = self.end_date.strftime("%d/%m/%Y")
url = f"https://cacapava.sp.gov.br/diario-oficial?dataDe={data_de}&dataAte={data_ate}"
yield Request(url)

def parse(self, response):
num_pages = int(
response.css(".pagination__label::text").re_first(r"\/ (\d+)") or "1"
)
if num_pages > 1:
for page in range(1, num_pages + 1):
yield Request(f"{response.url}&pagina={page}")

for gazette in response.css(".list-item__info"):
gazette_number = re.findall(
"Edição nº (\d+)", gazette.css("h3::text").get()
)[0]
raw_date = re.findall("\d{2}/\d{2}/\d{4}", gazette.css("p::text").get())[0]
gazette_date = datetime.strptime(raw_date, "%d/%m/%Y").date()
gazette_url = gazette.css("a").attrib["href"]
edition_number = gazette.css("h3::text").re_first(r"Edição nº (\d+)")
gazette_raw_date = gazette.css("p::text").re_first(r"\d{2}/\d{2}/\d{4}")
gazette_date = datetime.strptime(gazette_raw_date, "%d/%m/%Y").date()
gazette_url = gazette.css("a::attr(href)")

yield Gazette(
date=gazette_date,
edition_number=gazette_number,
edition_number=edition_number,
is_extra_edition=False,
power="executive_legislative",
file_urls=[gazette_url],
Expand Down

0 comments on commit 6f8539e

Please sign in to comment.