From 906292444700fdaf4565d60f6696872d434b6e70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?F=C3=A1bio=20Lima?= Date: Tue, 30 Jul 2024 21:22:13 -0300 Subject: [PATCH] =?UTF-8?q?Adiciona=20sistema=20moderniza=C3=A7=C3=A3o=20e?= =?UTF-8?q?=20munic=C3=ADpios=20que=20adotam=20a=20solu=C3=A7=C3=A3o?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Aplicando sugestão da revisão Co-authored-by: Juliana Trevine <44185775+trevineju@users.noreply.github.com> Signed-off-by: Fábio Lima <67182132+slfabio@users.noreply.github.com> Adiciona ajustes em spider base modernizacao --- .../gazette/spiders/base/modernizacao.py | 78 +++++++++++++++++++ .../gazette/spiders/rj/rj_belford_roxo.py | 44 +---------- .../gazette/spiders/rj/rj_mesquita.py | 12 +++ .../gazette/spiders/rj/rj_miguel_pereira.py | 10 +++ .../gazette/spiders/rj/rj_quatis.py | 10 +++ .../gazette/spiders/rj/rj_queimados.py | 10 +++ .../spiders/rj/rj_sao_joao_de_meriti.py | 30 +------ .../spiders/rj/rj_sao_pedro_da_aldeia.py | 11 +++ 8 files changed, 137 insertions(+), 68 deletions(-) create mode 100644 data_collection/gazette/spiders/base/modernizacao.py create mode 100644 data_collection/gazette/spiders/rj/rj_mesquita.py create mode 100644 data_collection/gazette/spiders/rj/rj_miguel_pereira.py create mode 100644 data_collection/gazette/spiders/rj/rj_quatis.py create mode 100644 data_collection/gazette/spiders/rj/rj_queimados.py create mode 100644 data_collection/gazette/spiders/rj/rj_sao_pedro_da_aldeia.py diff --git a/data_collection/gazette/spiders/base/modernizacao.py b/data_collection/gazette/spiders/base/modernizacao.py new file mode 100644 index 000000000..00dcef415 --- /dev/null +++ b/data_collection/gazette/spiders/base/modernizacao.py @@ -0,0 +1,78 @@ +import re +from datetime import date, datetime + +import scrapy +from dateutil.rrule import MONTHLY, rrule + +from gazette.items import Gazette +from gazette.spiders.base import BaseGazetteSpider + + +class BaseModernizacaoSpider(BaseGazetteSpider): + power = "executive_legislative" + ver_subpath = "ver20230623" + + custom_settings = { + "CONCURRENT_REQUESTS": 4, + "DOWNLOAD_DELAY": 0.75, + } + + def start_requests(self): + domain = self.allowed_domains[0] + base_url = f"https://{domain}/diario_oficial_get.php" + initial_date = date(self.start_date.year, self.start_date.month, 1) + + for monthly_date in rrule( + freq=MONTHLY, dtstart=initial_date, until=self.end_date + ): + month_year = monthly_date.strftime("%m/%Y").lstrip("0") + yield scrapy.FormRequest( + method="GET", + url=base_url, + formdata={"mesano": month_year}, + ) + + def parse(self, response): + for gazette_data in response.json(): + raw_gazette_date = gazette_data["Data_Formatada"] + raw_gazette_date + gazette_date = datetime.strptime(raw_gazette_date, "%d/%m/%Y").date() + if not self.start_date <= gazette_date <= self.end_date: + continue + + gazette_code = gazette_data["Codigo_ANEXO"] + gazette_url = response.urljoin( + f"{self.ver_subpath}/WEB-ObterAnexo.rule?sys=LAI&codigo={gazette_code}" + ) + + raw_edition_number = gazette_data["ANEXO"] + gazette_edition_number = re.search(r"\d+", raw_edition_number) + + if gazette_edition_number is None: + gazette_edition_number = "" + else: + gazette_edition_number = gazette_edition_number.group(0) + + is_extra_edition = bool( + re.search(r"extra|supl|ee|esp", raw_edition_number, re.IGNORECASE) + ) + + yield scrapy.Request( + gazette_url, + method="GET", + callback=self.parse_valid_gazette_file, + cb_kwargs={ + "gazette": Gazette( + date=gazette_date, + edition_number=gazette_edition_number, + file_urls=[gazette_url], + is_extra_edition=is_extra_edition, + power=self.power, + ) + }, + ) + + def parse_valid_gazette_file(self, response, gazette): + # o header so possui Content-Length quando o PDF esta indisponivel + if not response.headers.getlist("Content-Length"): + yield gazette diff --git a/data_collection/gazette/spiders/rj/rj_belford_roxo.py b/data_collection/gazette/spiders/rj/rj_belford_roxo.py index f477834f6..acd3c5e2c 100644 --- a/data_collection/gazette/spiders/rj/rj_belford_roxo.py +++ b/data_collection/gazette/spiders/rj/rj_belford_roxo.py @@ -1,47 +1,11 @@ -from datetime import date, datetime +from datetime import date -import scrapy -from dateutil.rrule import MONTHLY, rrule +from gazette.spiders.base.modernizacao import BaseModernizacaoSpider -from gazette.items import Gazette -from gazette.spiders.base import BaseGazetteSpider - -class RjBelfordRoxoSpider(BaseGazetteSpider): +class RjBelfordRoxoSpider(BaseModernizacaoSpider): TERRITORY_ID = "3300456" name = "rj_belford_roxo" allowed_domains = ["transparencia.prefeituradebelfordroxo.rj.gov.br"] - BASE_URL = "https://transparencia.prefeituradebelfordroxo.rj.gov.br/webrun/WEB-ObterAnexo.rule?sys=LAI&codigo={ATTACHMENT_CODE}" - start_date = date(2019, 1, 2) - - def start_requests(self): - url = "https://transparencia.prefeituradebelfordroxo.rj.gov.br/diario_oficial_get.php" - initial_date = date(self.start_date.year, self.start_date.month, 1) - - for monthly_date in rrule( - freq=MONTHLY, dtstart=initial_date, until=self.end_date - ): - month_year = monthly_date.strftime("%m/%Y").lstrip("0") # like 9/2022 - yield scrapy.FormRequest( - url=url, - formdata={"mesano": month_year}, - ) - - def parse(self, response): - for gazette_data in response.json(): - raw_gazette_date = gazette_data["Data_Formatada"] - gazette_date = datetime.strptime(raw_gazette_date, "%d/%m/%Y").date() - if gazette_date < self.start_date or self.end_date < gazette_date: - continue - gazette_code = gazette_data["Codigo_ANEXO"] - gazette_edition_number = gazette_data["ANEXO"] - gazette_url = self.BASE_URL.format(ATTACHMENT_CODE=gazette_code) - - yield Gazette( - date=gazette_date, - edition_number=gazette_edition_number, - file_urls=[gazette_url], - is_extra_edition=False, - power="executive", - ) + power = "executive" diff --git a/data_collection/gazette/spiders/rj/rj_mesquita.py b/data_collection/gazette/spiders/rj/rj_mesquita.py new file mode 100644 index 000000000..4526529e8 --- /dev/null +++ b/data_collection/gazette/spiders/rj/rj_mesquita.py @@ -0,0 +1,12 @@ +import datetime as dt + +from gazette.spiders.base.modernizacao import BaseModernizacaoSpider + + +class RjMesquitaSpider(BaseModernizacaoSpider): + TERRITORY_ID = "3302858" + name = "rj_mesquita" + allowed_domains = ["transparencia.mesquita.rj.gov.br"] + ver_subpath = "ver20240713" + start_date = dt.date(2015, 7, 15) + power = "executive" diff --git a/data_collection/gazette/spiders/rj/rj_miguel_pereira.py b/data_collection/gazette/spiders/rj/rj_miguel_pereira.py new file mode 100644 index 000000000..7eb3c4ad5 --- /dev/null +++ b/data_collection/gazette/spiders/rj/rj_miguel_pereira.py @@ -0,0 +1,10 @@ +import datetime as dt + +from gazette.spiders.base.modernizacao import BaseModernizacaoSpider + + +class RjMiguelPereiraSpider(BaseModernizacaoSpider): + TERRITORY_ID = "3302908" + name = "rj_miguel_pereira" + allowed_domains = ["transparencia.miguelpereira.rj.gov.br"] + start_date = dt.date(2021, 9, 3) diff --git a/data_collection/gazette/spiders/rj/rj_quatis.py b/data_collection/gazette/spiders/rj/rj_quatis.py new file mode 100644 index 000000000..d28270504 --- /dev/null +++ b/data_collection/gazette/spiders/rj/rj_quatis.py @@ -0,0 +1,10 @@ +import datetime as dt + +from gazette.spiders.base.modernizacao import BaseModernizacaoSpider + + +class RjQuatisSpider(BaseModernizacaoSpider): + TERRITORY_ID = "3304128" + name = "rj_quatis" + allowed_domains = ["transparencia.quatis.rj.gov.br"] + start_date = dt.date(2021, 1, 11) diff --git a/data_collection/gazette/spiders/rj/rj_queimados.py b/data_collection/gazette/spiders/rj/rj_queimados.py new file mode 100644 index 000000000..9a150aa9e --- /dev/null +++ b/data_collection/gazette/spiders/rj/rj_queimados.py @@ -0,0 +1,10 @@ +import datetime as dt + +from gazette.spiders.base.modernizacao import BaseModernizacaoSpider + + +class RjQueimadosSpider(BaseModernizacaoSpider): + TERRITORY_ID = "3304144" + name = "rj_queimados" + allowed_domains = ["transparencia.queimados.rj.gov.br"] + start_date = dt.date(2018, 1, 3) diff --git a/data_collection/gazette/spiders/rj/rj_sao_joao_de_meriti.py b/data_collection/gazette/spiders/rj/rj_sao_joao_de_meriti.py index d901fda34..e46a7f22a 100644 --- a/data_collection/gazette/spiders/rj/rj_sao_joao_de_meriti.py +++ b/data_collection/gazette/spiders/rj/rj_sao_joao_de_meriti.py @@ -1,36 +1,10 @@ import datetime as dt -from gazette.items import Gazette -from gazette.spiders.base import BaseGazetteSpider +from gazette.spiders.base.modernizacao import BaseModernizacaoSpider -class RjSaoJoaoDeMeritiSpider(BaseGazetteSpider): +class RjSaoJoaoDeMeritiSpider(BaseModernizacaoSpider): TERRITORY_ID = "3305109" name = "rj_sao_joao_de_meriti" allowed_domains = ["transparencia.meriti.rj.gov.br"] - start_urls = ["https://transparencia.meriti.rj.gov.br/diario_oficial_get.php"] - BASE_URL = "https://transparencia.meriti.rj.gov.br/ver20230623/WEB-ObterAnexo.rule?sys=LAI&codigo=" start_date = dt.date(2017, 1, 1) - custom_settings = {"DOWNLOAD_DELAY": 0.5, "RANDOMIZE_DOWNLOAD_DELAY": True} - - def parse(self, response): - for gazette_data in response.json(): - raw_gazette_date = gazette_data["Data_Formatada"] - gazette_date = dt.datetime.strptime(raw_gazette_date, "%d/%m/%Y").date() - - if not self.start_date <= gazette_date <= self.end_date: - continue - gazette_code = gazette_data["Codigo_ANEXO"] - # links quebrados no portal de transparência - if gazette_code == 1: - continue - gazette_edition_number = gazette_data["ANEXO"] - gazette_url = f"{self.BASE_URL}{gazette_code}" - - yield Gazette( - date=gazette_date, - edition_number=gazette_edition_number, - file_urls=[gazette_url], - is_extra_edition=False, - power="executive_legislative", - ) diff --git a/data_collection/gazette/spiders/rj/rj_sao_pedro_da_aldeia.py b/data_collection/gazette/spiders/rj/rj_sao_pedro_da_aldeia.py new file mode 100644 index 000000000..e6b8bdd62 --- /dev/null +++ b/data_collection/gazette/spiders/rj/rj_sao_pedro_da_aldeia.py @@ -0,0 +1,11 @@ +import datetime as dt + +from gazette.spiders.base.modernizacao import BaseModernizacaoSpider + + +class RjSaoPedroDaAldeiaSpider(BaseModernizacaoSpider): + TERRITORY_ID = "3305208" + name = "rj_sao_pedro_da_aldeia" + allowed_domains = ["transparencia.pmspa.rj.gov.br"] + start_date = dt.date(2018, 1, 15) + ver_subpath = "ver20240713"