From 6f8539e1c433c48285a89c9b80cc79dd0624b8b6 Mon Sep 17 00:00:00 2001
From: Renne Rocha
Date: Thu, 18 Apr 2024 21:54:41 -0300
Subject: [PATCH] Algumas melhorias para simplificar o raspador

Signed-off-by: Renne Rocha
---
 .../gazette/spiders/sp/sp_cacapava.py | 33 +++++++++----------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/data_collection/gazette/spiders/sp/sp_cacapava.py b/data_collection/gazette/spiders/sp/sp_cacapava.py
index 796b6feec..9844b109d 100644
--- a/data_collection/gazette/spiders/sp/sp_cacapava.py
+++ b/data_collection/gazette/spiders/sp/sp_cacapava.py
@@ -15,29 +15,28 @@ class SpCacapavaSpider(BaseGazetteSpider):
     custom_settings = {"DOWNLOAD_DELAY": 0.5, "RANDOMIZE_DOWNLOAD_DELAY": True}
 
     def start_requests(self):
-        url = "https://cacapava.sp.gov.br/diario-oficial?"
-        url += f'&dataDe={self.start_date.strftime("%d/%m/%Y")}'
-        url += f'&dataAte={self.end_date.strftime("%d/%m/%Y")}'
-        yield Request(url, callback=self.parse_info)
-
-    def parse_info(self, response):
-        base_url = response.url
-        num_pages = response.css(".pagination__select option::text")[-1].get()
-        for i in range(1, int(num_pages) + 1):
-            yield Request(f"{base_url}&pagina={i}")
+        data_de = self.start_date.strftime("%d/%m/%Y")
+        data_ate = self.end_date.strftime("%d/%m/%Y")
+        url = f"https://cacapava.sp.gov.br/diario-oficial?dataDe={data_de}&dataAte={data_ate}"
+        yield Request(url)
 
     def parse(self, response):
+        num_pages = int(
+            response.css(".pagination__label::text").re_first(r"\/ (\d+)") or "1"
+        )
+        if num_pages > 1:
+            for page in range(1, num_pages + 1):
+                yield Request(f"{response.url}&pagina={page}")
+
         for gazette in response.css(".list-item__info"):
-            gazette_number = re.findall(
-                "Edição nº (\d+)", gazette.css("h3::text").get()
-            )[0]
-            raw_date = re.findall("\d{2}/\d{2}/\d{4}", gazette.css("p::text").get())[0]
-            gazette_date = datetime.strptime(raw_date, "%d/%m/%Y").date()
-            gazette_url = gazette.css("a").attrib["href"]
+            edition_number = gazette.css("h3::text").re_first(r"Edição nº (\d+)")
+            gazette_raw_date = gazette.css("p::text").re_first(r"\d{2}/\d{2}/\d{4}")
+            gazette_date = datetime.strptime(gazette_raw_date, "%d/%m/%Y").date()
+            gazette_url = gazette.css("a::attr(href)")
 
             yield Gazette(
                 date=gazette_date,
-                edition_number=gazette_number,
+                edition_number=edition_number,
                 is_extra_edition=False,
                 power="executive_legislative",
                 file_urls=[gazette_url],