From 218e838b99c846b81b0af6d358235cacfd38f720 Mon Sep 17 00:00:00 2001
From: trevineju
Date: Fri, 31 May 2024 19:07:19 -0300
Subject: [PATCH 1/4] =?UTF-8?q?Modifica=20classe=20DOEM=20para=20fazer=20r?=
 =?UTF-8?q?equisi=C3=A7=C3=B5es=20por=20m=C3=AAs?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_collection/gazette/spiders/base/doem.py | 78 +++++++-------------
 1 file changed, 28 insertions(+), 50 deletions(-)

diff --git a/data_collection/gazette/spiders/base/doem.py b/data_collection/gazette/spiders/base/doem.py
index 2f920dde8..7c9b84f50 100644
--- a/data_collection/gazette/spiders/base/doem.py
+++ b/data_collection/gazette/spiders/base/doem.py
@@ -1,7 +1,6 @@
-import datetime as dt
-
 import dateparser
 import scrapy
+from dateutil.rrule import MONTHLY, rrule
 
 from gazette.items import Gazette
 from gazette.spiders.base import BaseGazetteSpider
@@ -12,27 +11,30 @@ class DoemGazetteSpider(BaseGazetteSpider):
     Base spider for all cities listed on https://doem.org.br
     """
 
+    allowed_domains = ["doem.org.br"]
+
+    # Must be defined in child class
+    state_city_url_part = None
+    start_date = None
+
     custom_settings = {
         "DOWNLOAD_FAIL_ON_DATALOSS": False,
     }
 
-    allowed_domains = ["doem.org.br"]
-    start_date = dt.date(2009, 1, 1)
-
     def start_requests(self):
-        yield scrapy.Request(self.get_url())
-
-    def parse_pagination(self, response):
-        """
-        This parse function is used to get all the pages available and
-        return request object for each one
-        """
-        return [
-            scrapy.Request(self.get_url(page), callback=self.parse)
-            for page in range(1, 1 + self.get_last_page(response))
-        ]
+        """
+        Yield one listing request per month from start_date to end_date (inclusive).
+        """
+        # rrule drops occurrences on invalid dates, so a start_date on day 29-31
+        # would skip shorter months; day 1 also always covers end_date's month.
+        first_month = self.start_date.replace(day=1)
+        for month in rrule(freq=MONTHLY, dtstart=first_month, until=self.end_date):
+            month_year = month.strftime("%Y/%m")
+            yield scrapy.Request(
+                f"https://doem.org.br/{self.state_city_url_part}/diarios/{month_year}"
+            )
 
-    def parse(self, response, page=1):
+    def parse(self, response):
         """
         Parse each page from the results page and yield the gazette issues available.
         """
@@ -43,39 +45,15 @@ def parse(self, response, page=1):
             date = self.get_gazette_date(gazette_box)
             edition_number = self.get_edition_number(gazette_box)
 
-            if date > self.end_date:
-                continue
-            elif date < self.start_date:
-                return
-
-            yield Gazette(
-                date=date,
-                file_urls=[file_url],
-                edition_number=edition_number,
-                is_extra_edition=False,
-                power="executive_legislative",
-            )
-
-        last_page = self.get_last_page(response)
-        if page < last_page:
-            yield scrapy.Request(
-                url=self.get_url(page + 1), cb_kwargs={"page": page + 1}
-            )
-
-    def get_url(self, page=1):
-        url = f"https://doem.org.br/{self.state_city_url_part}"
-        start_date = self.start_date.strftime("%Y-%m-%d")
-        end_date = self.end_date.strftime("%Y-%m-%d")
-        return f"{url}/pesquisar?data_inicial={start_date}&data_final={end_date}&page={page}"
-
-    def get_last_page(self, response):
-        """
-        Gets the last page number available in the pages navigation menu
-        """
-        pages = response.css("ul.pagination li a::text").getall()
-        if len(pages) == 0:
-            return 1
-        return max([int(page) for page in pages if page.isnumeric()])
+            # Inclusive bounds: editions dated exactly start_date or end_date count
+            if self.start_date <= date <= self.end_date:
+                yield Gazette(
+                    date=date,
+                    file_urls=[file_url],
+                    edition_number=edition_number,
+                    is_extra_edition=False,
+                    power="executive_legislative",
+                )
 
     def get_pdf_url(self, response_item):
         """

From 2b3a163203bc0cfc083122110b17e62156e44ed9 Mon Sep 17 00:00:00 2001
From: trevineju
Date: Fri, 31 May 2024 01:41:14 -0300
Subject: [PATCH 2/4] Atualiza start_date de raspadores DOEM

---
 data_collection/gazette/spiders/ba/ba_acajutiba.py | 2 +-
 data_collection/gazette/spiders/ba/ba_alagoinhas.py | 2 +-
 data_collection/gazette/spiders/ba/ba_alcobaca.py | 2 +-
 data_collection/gazette/spiders/ba/ba_campo_formoso.py | 2 +-
 data_collection/gazette/spiders/ba/ba_cipo.py | 2 +-
data_collection/gazette/spiders/ba/ba_irara.py | 2 +- data_collection/gazette/spiders/ba/ba_itapicuru.py | 2 +- data_collection/gazette/spiders/ba/ba_ituacu.py | 2 +- data_collection/gazette/spiders/ba/ba_juazeiro.py | 2 +- data_collection/gazette/spiders/ba/ba_monte_santo.py | 2 +- data_collection/gazette/spiders/ba/ba_morro_do_chapeu.py | 2 +- data_collection/gazette/spiders/ba/ba_mucuri.py | 2 +- data_collection/gazette/spiders/ba/ba_prado.py | 2 +- data_collection/gazette/spiders/ba/ba_santo_estevao.py | 2 +- data_collection/gazette/spiders/ba/ba_senhor_do_bonfim.py | 2 +- data_collection/gazette/spiders/ba/ba_tucano.py | 2 +- 16 files changed, 16 insertions(+), 16 deletions(-) diff --git a/data_collection/gazette/spiders/ba/ba_acajutiba.py b/data_collection/gazette/spiders/ba/ba_acajutiba.py index 5f560ffd6..ab4a469f2 100644 --- a/data_collection/gazette/spiders/ba/ba_acajutiba.py +++ b/data_collection/gazette/spiders/ba/ba_acajutiba.py @@ -6,5 +6,5 @@ class BaAcajutibaSpider(DoemGazetteSpider): TERRITORY_ID = "2900306" name = "ba_acajutiba" - start_date = date(2018, 1, 2) state_city_url_part = "ba/acajutiba" + start_date = date(2013, 1, 30) diff --git a/data_collection/gazette/spiders/ba/ba_alagoinhas.py b/data_collection/gazette/spiders/ba/ba_alagoinhas.py index e8ce239ca..1a1a5e5f7 100644 --- a/data_collection/gazette/spiders/ba/ba_alagoinhas.py +++ b/data_collection/gazette/spiders/ba/ba_alagoinhas.py @@ -6,5 +6,5 @@ class BaAlagoinhasSpider(DoemGazetteSpider): TERRITORY_ID = "2900702" name = "ba_alagoinhas" - start_date = date(2018, 1, 2) # edition_number 1.950 state_city_url_part = "ba/alagoinhas" + start_date = date(2015, 1, 28) diff --git a/data_collection/gazette/spiders/ba/ba_alcobaca.py b/data_collection/gazette/spiders/ba/ba_alcobaca.py index e38ebb5f5..6d1f268a9 100644 --- a/data_collection/gazette/spiders/ba/ba_alcobaca.py +++ b/data_collection/gazette/spiders/ba/ba_alcobaca.py @@ -7,4 +7,4 @@ class BaAlcobacaSpider(DoemGazetteSpider): 
TERRITORY_ID = "2900801" name = "ba_alcobaca" state_city_url_part = "ba/alcobaca" - start_date = date(2017, 3, 3) + start_date = date(2012, 1, 2) diff --git a/data_collection/gazette/spiders/ba/ba_campo_formoso.py b/data_collection/gazette/spiders/ba/ba_campo_formoso.py index 35ae798ef..d25753ba3 100644 --- a/data_collection/gazette/spiders/ba/ba_campo_formoso.py +++ b/data_collection/gazette/spiders/ba/ba_campo_formoso.py @@ -6,5 +6,5 @@ class BaCampoFormosoSpider(DoemGazetteSpider): TERRITORY_ID = "2906006" name = "ba_campo_formoso" - start_date = date(2018, 1, 3) # edition_number 873 state_city_url_part = "ba/campoformoso" + start_date = date(2013, 1, 31) diff --git a/data_collection/gazette/spiders/ba/ba_cipo.py b/data_collection/gazette/spiders/ba/ba_cipo.py index 9146e6e79..bb2b2d6ee 100644 --- a/data_collection/gazette/spiders/ba/ba_cipo.py +++ b/data_collection/gazette/spiders/ba/ba_cipo.py @@ -6,5 +6,5 @@ class BaCipoSpider(DoemGazetteSpider): TERRITORY_ID = "2907905" name = "ba_cipo" - start_date = date(2021, 1, 4) state_city_url_part = "ba/cipo" + start_date = date(2012, 1, 2) diff --git a/data_collection/gazette/spiders/ba/ba_irara.py b/data_collection/gazette/spiders/ba/ba_irara.py index df93e118f..fc67818c0 100644 --- a/data_collection/gazette/spiders/ba/ba_irara.py +++ b/data_collection/gazette/spiders/ba/ba_irara.py @@ -6,5 +6,5 @@ class BaIraraSpider(DoemGazetteSpider): TERRITORY_ID = "2914505" name = "ba_irara" - start_date = date(2018, 1, 3) state_city_url_part = "ba/irara" + start_date = date(2014, 4, 24) diff --git a/data_collection/gazette/spiders/ba/ba_itapicuru.py b/data_collection/gazette/spiders/ba/ba_itapicuru.py index 0afd01517..bae036652 100644 --- a/data_collection/gazette/spiders/ba/ba_itapicuru.py +++ b/data_collection/gazette/spiders/ba/ba_itapicuru.py @@ -6,5 +6,5 @@ class BaItapicuruSpider(DoemGazetteSpider): TERRITORY_ID = "2916500" name = "ba_itapicuru" - start_date = date(2021, 1, 4) state_city_url_part = "ba/itapicuru" + 
start_date = date(2014, 1, 2) diff --git a/data_collection/gazette/spiders/ba/ba_ituacu.py b/data_collection/gazette/spiders/ba/ba_ituacu.py index 5e97055d8..2a6332f5d 100644 --- a/data_collection/gazette/spiders/ba/ba_ituacu.py +++ b/data_collection/gazette/spiders/ba/ba_ituacu.py @@ -6,5 +6,5 @@ class BaItuacuSpider(DoemGazetteSpider): TERRITORY_ID = "2917201" name = "ba_ituacu" - start_date = date(2018, 1, 2) state_city_url_part = "ba/ituacu" + start_date = date(2015, 2, 4) diff --git a/data_collection/gazette/spiders/ba/ba_juazeiro.py b/data_collection/gazette/spiders/ba/ba_juazeiro.py index 76ee57362..216518b1e 100644 --- a/data_collection/gazette/spiders/ba/ba_juazeiro.py +++ b/data_collection/gazette/spiders/ba/ba_juazeiro.py @@ -6,5 +6,5 @@ class BaJuazeiroSpider(DoemGazetteSpider): TERRITORY_ID = "2918407" name = "ba_juazeiro" - start_date = date(2018, 1, 2) # edition_number 1.135 state_city_url_part = "ba/juazeiro" + start_date = date(2013, 2, 1) diff --git a/data_collection/gazette/spiders/ba/ba_monte_santo.py b/data_collection/gazette/spiders/ba/ba_monte_santo.py index 5725be1d2..a87bf2904 100644 --- a/data_collection/gazette/spiders/ba/ba_monte_santo.py +++ b/data_collection/gazette/spiders/ba/ba_monte_santo.py @@ -6,5 +6,5 @@ class BaMonteSantoSpider(DoemGazetteSpider): TERRITORY_ID = "2921500" name = "ba_monte_santo" - start_date = date(2021, 1, 2) state_city_url_part = "ba/montesanto" + start_date = date(2013, 1, 9) diff --git a/data_collection/gazette/spiders/ba/ba_morro_do_chapeu.py b/data_collection/gazette/spiders/ba/ba_morro_do_chapeu.py index e0803c95b..b173e1e20 100644 --- a/data_collection/gazette/spiders/ba/ba_morro_do_chapeu.py +++ b/data_collection/gazette/spiders/ba/ba_morro_do_chapeu.py @@ -6,5 +6,5 @@ class BaMorroDoChapeuSpider(DoemGazetteSpider): TERRITORY_ID = "2921708" name = "ba_morro_do_chapeu" - start_date = date(2021, 1, 6) state_city_url_part = "ba/morrodochapeu" + start_date = date(2013, 3, 1) diff --git 
a/data_collection/gazette/spiders/ba/ba_mucuri.py b/data_collection/gazette/spiders/ba/ba_mucuri.py index 22decdefa..007426877 100644 --- a/data_collection/gazette/spiders/ba/ba_mucuri.py +++ b/data_collection/gazette/spiders/ba/ba_mucuri.py @@ -6,5 +6,5 @@ class BaMucuriSpider(DoemGazetteSpider): TERRITORY_ID = "2922003" name = "ba_mucuri" - start_date = date(2018, 1, 3) state_city_url_part = "ba/mucuri" + start_date = date(2011, 4, 5) diff --git a/data_collection/gazette/spiders/ba/ba_prado.py b/data_collection/gazette/spiders/ba/ba_prado.py index ada3f9cd5..351951510 100644 --- a/data_collection/gazette/spiders/ba/ba_prado.py +++ b/data_collection/gazette/spiders/ba/ba_prado.py @@ -7,4 +7,4 @@ class BaPradoSpider(DoemGazetteSpider): TERRITORY_ID = "2925501" name = "ba_prado" state_city_url_part = "ba/prado" - start_date = date(2018, 1, 2) + start_date = date(2013, 2, 4) diff --git a/data_collection/gazette/spiders/ba/ba_santo_estevao.py b/data_collection/gazette/spiders/ba/ba_santo_estevao.py index 6eaa2f2dc..e7bed2be0 100644 --- a/data_collection/gazette/spiders/ba/ba_santo_estevao.py +++ b/data_collection/gazette/spiders/ba/ba_santo_estevao.py @@ -7,4 +7,4 @@ class BaSantoEstevaoSpider(DoemGazetteSpider): TERRITORY_ID = "2928802" name = "ba_santo_estevao" state_city_url_part = "ba/santoestevao" - start_date = date(2017, 1, 6) + start_date = date(2009, 11, 9) diff --git a/data_collection/gazette/spiders/ba/ba_senhor_do_bonfim.py b/data_collection/gazette/spiders/ba/ba_senhor_do_bonfim.py index fd6c6c53a..150072a41 100644 --- a/data_collection/gazette/spiders/ba/ba_senhor_do_bonfim.py +++ b/data_collection/gazette/spiders/ba/ba_senhor_do_bonfim.py @@ -6,5 +6,5 @@ class BaSenhorDoBonfimSpider(DoemGazetteSpider): TERRITORY_ID = "2930105" name = "ba_senhor_do_bonfim" - start_date = date(2018, 1, 2) # edition_number 1.503 state_city_url_part = "ba/senhordobonfim" + start_date = date(2013, 1, 3) diff --git a/data_collection/gazette/spiders/ba/ba_tucano.py 
b/data_collection/gazette/spiders/ba/ba_tucano.py index b4b9a8706..e2bc97a89 100644 --- a/data_collection/gazette/spiders/ba/ba_tucano.py +++ b/data_collection/gazette/spiders/ba/ba_tucano.py @@ -7,4 +7,4 @@ class BaTucanoSpider(DoemGazetteSpider): TERRITORY_ID = "2931905" name = "ba_tucano" state_city_url_part = "ba/tucano" - start_date = date(2018, 1, 2) + start_date = date(2013, 1, 4) From 0638df7b66a5d612f591312219fe403c49090a66 Mon Sep 17 00:00:00 2001 From: trevineju Date: Fri, 31 May 2024 01:46:42 -0300 Subject: [PATCH 3/4] Adiciona raspador para Ipiranga-PR e padroniza demais raspadores DOEM --- data_collection/gazette/spiders/ba/ba_angical.py | 2 +- data_collection/gazette/spiders/ba/ba_caetite.py | 2 +- .../gazette/spiders/ba/ba_campo_alegre_de_lourdes.py | 2 +- data_collection/gazette/spiders/ba/ba_canudos.py | 2 +- data_collection/gazette/spiders/ba/ba_cotegipe.py | 2 +- data_collection/gazette/spiders/ba/ba_cristopolis.py | 2 +- .../gazette/spiders/ba/ba_cruz_das_almas.py | 2 +- data_collection/gazette/spiders/ba/ba_esplanada.py | 2 +- .../gazette/spiders/ba/ba_formosa_do_rio_preto.py | 2 +- data_collection/gazette/spiders/ba/ba_itaberaba.py | 2 +- data_collection/gazette/spiders/ba/ba_itamaraju.py | 2 +- data_collection/gazette/spiders/ba/ba_jaguaquara.py | 2 +- data_collection/gazette/spiders/ba/ba_laje.py | 2 +- data_collection/gazette/spiders/ba/ba_lajedao.py | 2 +- data_collection/gazette/spiders/ba/ba_macajuba.py | 2 +- data_collection/gazette/spiders/ba/ba_mascote.py | 2 +- .../gazette/spiders/ba/ba_santa_rita_de_cassia.py | 2 +- data_collection/gazette/spiders/ba/ba_satiro_dias.py | 2 +- data_collection/gazette/spiders/ba/ba_tapiramuta.py | 2 +- data_collection/gazette/spiders/pe/pe_petrolina.py | 4 ++-- data_collection/gazette/spiders/pr/pr_ipiranga.py | 10 ++++++++++ data_collection/gazette/spiders/pr/pr_tamboara.py | 2 +- .../gazette/spiders/se/se_nossa_senhora_do_socorro.py | 2 +- 23 files changed, 33 insertions(+), 23 deletions(-) create 
mode 100644 data_collection/gazette/spiders/pr/pr_ipiranga.py diff --git a/data_collection/gazette/spiders/ba/ba_angical.py b/data_collection/gazette/spiders/ba/ba_angical.py index fdf7efd76..801ba6eff 100644 --- a/data_collection/gazette/spiders/ba/ba_angical.py +++ b/data_collection/gazette/spiders/ba/ba_angical.py @@ -6,5 +6,5 @@ class BaAngicalSpider(DoemGazetteSpider): TERRITORY_ID = "2901403" name = "ba_angical" - start_date = date(2021, 1, 4) state_city_url_part = "ba/angical" + start_date = date(2021, 1, 4) diff --git a/data_collection/gazette/spiders/ba/ba_caetite.py b/data_collection/gazette/spiders/ba/ba_caetite.py index abe940903..ffe3a73be 100644 --- a/data_collection/gazette/spiders/ba/ba_caetite.py +++ b/data_collection/gazette/spiders/ba/ba_caetite.py @@ -6,5 +6,5 @@ class BaCaetiteSpider(DoemGazetteSpider): TERRITORY_ID = "2905206" name = "ba_caetite" - start_date = date(2021, 4, 27) state_city_url_part = "ba/caetite" + start_date = date(2021, 4, 27) diff --git a/data_collection/gazette/spiders/ba/ba_campo_alegre_de_lourdes.py b/data_collection/gazette/spiders/ba/ba_campo_alegre_de_lourdes.py index ad118562d..355032c9b 100644 --- a/data_collection/gazette/spiders/ba/ba_campo_alegre_de_lourdes.py +++ b/data_collection/gazette/spiders/ba/ba_campo_alegre_de_lourdes.py @@ -6,5 +6,5 @@ class BaCampoAlegreDeLourdesSpider(DoemGazetteSpider): TERRITORY_ID = "2905909" name = "ba_campo_alegre_de_lourdes" - start_date = date(2020, 11, 30) # Primeira edição em 30/11/2020 state_city_url_part = "ba/campoalegredelourdes" + start_date = date(2020, 11, 30) diff --git a/data_collection/gazette/spiders/ba/ba_canudos.py b/data_collection/gazette/spiders/ba/ba_canudos.py index eca9e9727..7ca7b2c42 100644 --- a/data_collection/gazette/spiders/ba/ba_canudos.py +++ b/data_collection/gazette/spiders/ba/ba_canudos.py @@ -6,5 +6,5 @@ class BaCanudosSpider(DoemGazetteSpider): TERRITORY_ID = "2906824" name = "ba_canudos" - start_date = date(2013, 1, 4) # edition number 444 
state_city_url_part = "ba/canudos" + start_date = date(2013, 1, 4) diff --git a/data_collection/gazette/spiders/ba/ba_cotegipe.py b/data_collection/gazette/spiders/ba/ba_cotegipe.py index 5c6bdf803..9a2772855 100644 --- a/data_collection/gazette/spiders/ba/ba_cotegipe.py +++ b/data_collection/gazette/spiders/ba/ba_cotegipe.py @@ -6,5 +6,5 @@ class BaCotegipeSpider(DoemGazetteSpider): TERRITORY_ID = "2909406" name = "ba_cotegipe" - start_date = date(2023, 1, 5) state_city_url_part = "ba/cotegipe" + start_date = date(2023, 1, 5) diff --git a/data_collection/gazette/spiders/ba/ba_cristopolis.py b/data_collection/gazette/spiders/ba/ba_cristopolis.py index f9496c898..79e8c5e60 100644 --- a/data_collection/gazette/spiders/ba/ba_cristopolis.py +++ b/data_collection/gazette/spiders/ba/ba_cristopolis.py @@ -6,5 +6,5 @@ class BaCristopolisSpider(DoemGazetteSpider): TERRITORY_ID = "2909703" name = "ba_cristopolis" - start_date = date(2021, 1, 12) state_city_url_part = "ba/cristopolis" + start_date = date(2021, 1, 12) diff --git a/data_collection/gazette/spiders/ba/ba_cruz_das_almas.py b/data_collection/gazette/spiders/ba/ba_cruz_das_almas.py index 40f4297ea..f5e69b800 100644 --- a/data_collection/gazette/spiders/ba/ba_cruz_das_almas.py +++ b/data_collection/gazette/spiders/ba/ba_cruz_das_almas.py @@ -6,5 +6,5 @@ class BaCruzDasAlmasSpider(DoemGazetteSpider): TERRITORY_ID = "2909802" name = "ba_cruz_das_almas" - start_date = date(2021, 4, 1) state_city_url_part = "ba/cruzdasalmas" + start_date = date(2021, 4, 1) diff --git a/data_collection/gazette/spiders/ba/ba_esplanada.py b/data_collection/gazette/spiders/ba/ba_esplanada.py index 5c0089396..a4b67c880 100644 --- a/data_collection/gazette/spiders/ba/ba_esplanada.py +++ b/data_collection/gazette/spiders/ba/ba_esplanada.py @@ -6,5 +6,5 @@ class BaEsplanadaSpider(DoemGazetteSpider): TERRITORY_ID = "2910602" name = "ba_esplanada" - start_date = date(2021, 1, 4) state_city_url_part = "ba/esplanada" + start_date = date(2021, 1, 4) 
diff --git a/data_collection/gazette/spiders/ba/ba_formosa_do_rio_preto.py b/data_collection/gazette/spiders/ba/ba_formosa_do_rio_preto.py index d303a1ac7..e56e0ac8b 100644 --- a/data_collection/gazette/spiders/ba/ba_formosa_do_rio_preto.py +++ b/data_collection/gazette/spiders/ba/ba_formosa_do_rio_preto.py @@ -6,5 +6,5 @@ class BaFormosaDoRioPretoSpider(DoemGazetteSpider): TERRITORY_ID = "2911105" name = "ba_formosa_do_rio_preto" - start_date = date(2021, 1, 4) state_city_url_part = "ba/formosadoriopreto" + start_date = date(2021, 1, 4) diff --git a/data_collection/gazette/spiders/ba/ba_itaberaba.py b/data_collection/gazette/spiders/ba/ba_itaberaba.py index f0208e088..153037d26 100644 --- a/data_collection/gazette/spiders/ba/ba_itaberaba.py +++ b/data_collection/gazette/spiders/ba/ba_itaberaba.py @@ -6,5 +6,5 @@ class BaItaberabaSpider(DoemGazetteSpider): TERRITORY_ID = "2914703" name = "ba_itaberaba" - start_date = date(2022, 7, 4) state_city_url_part = "ba/itaberaba" + start_date = date(2022, 7, 4) diff --git a/data_collection/gazette/spiders/ba/ba_itamaraju.py b/data_collection/gazette/spiders/ba/ba_itamaraju.py index a4b486838..0399e4ad2 100644 --- a/data_collection/gazette/spiders/ba/ba_itamaraju.py +++ b/data_collection/gazette/spiders/ba/ba_itamaraju.py @@ -6,5 +6,5 @@ class BaItamarajuSpider(DoemGazetteSpider): TERRITORY_ID = "2915601" name = "ba_itamaraju" - start_date = date(2008, 3, 28) state_city_url_part = "ba/itamaraju" + start_date = date(2008, 3, 28) diff --git a/data_collection/gazette/spiders/ba/ba_jaguaquara.py b/data_collection/gazette/spiders/ba/ba_jaguaquara.py index 1bcf98aca..16130922a 100644 --- a/data_collection/gazette/spiders/ba/ba_jaguaquara.py +++ b/data_collection/gazette/spiders/ba/ba_jaguaquara.py @@ -6,5 +6,5 @@ class BaJaguaquaraSpider(DoemGazetteSpider): TERRITORY_ID = "2917607" name = "ba_jaguaquara" - start_date = date(2021, 4, 5) state_city_url_part = "ba/jaguaquara" + start_date = date(2021, 4, 5) diff --git 
a/data_collection/gazette/spiders/ba/ba_laje.py b/data_collection/gazette/spiders/ba/ba_laje.py index 6d1bf3d83..4adfddc3d 100644 --- a/data_collection/gazette/spiders/ba/ba_laje.py +++ b/data_collection/gazette/spiders/ba/ba_laje.py @@ -6,5 +6,5 @@ class BaLajeSpider(DoemGazetteSpider): TERRITORY_ID = "2918803" name = "ba_laje" - start_date = date(2020, 1, 8) state_city_url_part = "ba/laje" + start_date = date(2020, 1, 8) diff --git a/data_collection/gazette/spiders/ba/ba_lajedao.py b/data_collection/gazette/spiders/ba/ba_lajedao.py index 9ad0409f9..466b9bcd8 100644 --- a/data_collection/gazette/spiders/ba/ba_lajedao.py +++ b/data_collection/gazette/spiders/ba/ba_lajedao.py @@ -6,5 +6,5 @@ class BaLajedaoSpider(DoemGazetteSpider): TERRITORY_ID = "2918902" name = "ba_lajedao" - start_date = date(2021, 4, 14) state_city_url_part = "ba/lajedao" + start_date = date(2021, 4, 14) diff --git a/data_collection/gazette/spiders/ba/ba_macajuba.py b/data_collection/gazette/spiders/ba/ba_macajuba.py index 2f6a33024..c9ebd90fe 100644 --- a/data_collection/gazette/spiders/ba/ba_macajuba.py +++ b/data_collection/gazette/spiders/ba/ba_macajuba.py @@ -6,5 +6,5 @@ class BaMacajubaSpider(DoemGazetteSpider): TERRITORY_ID = "2919603" name = "ba_macajuba" - start_date = date(2014, 3, 17) state_city_url_part = "ba/macajuba" + start_date = date(2014, 3, 17) diff --git a/data_collection/gazette/spiders/ba/ba_mascote.py b/data_collection/gazette/spiders/ba/ba_mascote.py index 45756be25..e6257073f 100644 --- a/data_collection/gazette/spiders/ba/ba_mascote.py +++ b/data_collection/gazette/spiders/ba/ba_mascote.py @@ -6,5 +6,5 @@ class BaMascoteSpider(DoemGazetteSpider): TERRITORY_ID = "2920908" name = "ba_mascote" - start_date = date(2010, 1, 4) # edition number 1 state_city_url_part = "ba/mascote" + start_date = date(2010, 1, 4) diff --git a/data_collection/gazette/spiders/ba/ba_santa_rita_de_cassia.py b/data_collection/gazette/spiders/ba/ba_santa_rita_de_cassia.py index 3cca849d9..f41559f11 
100644 --- a/data_collection/gazette/spiders/ba/ba_santa_rita_de_cassia.py +++ b/data_collection/gazette/spiders/ba/ba_santa_rita_de_cassia.py @@ -6,5 +6,5 @@ class BaSantaRitaDeCassiaSpider(DoemGazetteSpider): TERRITORY_ID = "2928406" name = "ba_santa_rita_de_cassia" - start_date = date(2021, 1, 4) state_city_url_part = "ba/santaritadecassia" + start_date = date(2021, 1, 4) diff --git a/data_collection/gazette/spiders/ba/ba_satiro_dias.py b/data_collection/gazette/spiders/ba/ba_satiro_dias.py index 1d0dd0704..04d257f0b 100644 --- a/data_collection/gazette/spiders/ba/ba_satiro_dias.py +++ b/data_collection/gazette/spiders/ba/ba_satiro_dias.py @@ -6,5 +6,5 @@ class BaSatiroDiasSpider(DoemGazetteSpider): TERRITORY_ID = "2929701" name = "ba_satiro_dias" - start_date = date(2021, 3, 30) state_city_url_part = "ba/satirodias" + start_date = date(2021, 3, 30) diff --git a/data_collection/gazette/spiders/ba/ba_tapiramuta.py b/data_collection/gazette/spiders/ba/ba_tapiramuta.py index 91f26c42b..5ba0364b4 100644 --- a/data_collection/gazette/spiders/ba/ba_tapiramuta.py +++ b/data_collection/gazette/spiders/ba/ba_tapiramuta.py @@ -6,5 +6,5 @@ class BaTapiramutaSpider(DoemGazetteSpider): TERRITORY_ID = "2931301" name = "ba_tapiramuta" - start_date = date(2021, 1, 4) state_city_url_part = "ba/tapiramuta" + start_date = date(2021, 1, 4) diff --git a/data_collection/gazette/spiders/pe/pe_petrolina.py b/data_collection/gazette/spiders/pe/pe_petrolina.py index 383bfbb51..259cef808 100644 --- a/data_collection/gazette/spiders/pe/pe_petrolina.py +++ b/data_collection/gazette/spiders/pe/pe_petrolina.py @@ -1,4 +1,4 @@ -import datetime as dt +from datetime import date from gazette.spiders.base.doem import DoemGazetteSpider @@ -6,5 +6,5 @@ class PePetrolinaSpider(DoemGazetteSpider): TERRITORY_ID = "2611101" name = "pe_petrolina" - start_date = dt.date(2014, 3, 6) state_city_url_part = "pe/petrolina" + start_date = date(2014, 3, 6) diff --git 
a/data_collection/gazette/spiders/pr/pr_ipiranga.py b/data_collection/gazette/spiders/pr/pr_ipiranga.py new file mode 100644 index 000000000..51eee7052 --- /dev/null +++ b/data_collection/gazette/spiders/pr/pr_ipiranga.py @@ -0,0 +1,10 @@ +from datetime import date + +from gazette.spiders.base.doem import DoemGazetteSpider + + +class PrIpirangaSpider(DoemGazetteSpider): + TERRITORY_ID = "4110508" + name = "pr_ipiranga" + state_city_url_part = "pr/ipiranga" + start_date = date(2015, 9, 28) diff --git a/data_collection/gazette/spiders/pr/pr_tamboara.py b/data_collection/gazette/spiders/pr/pr_tamboara.py index 830216d36..1ab154e94 100644 --- a/data_collection/gazette/spiders/pr/pr_tamboara.py +++ b/data_collection/gazette/spiders/pr/pr_tamboara.py @@ -6,5 +6,5 @@ class PrTamboaraSpider(DoemGazetteSpider): TERRITORY_ID = "4126702" name = "pr_tamboara" - start_date = date(2022, 8, 22) state_city_url_part = "pr/tamboara" + start_date = date(2022, 8, 22) diff --git a/data_collection/gazette/spiders/se/se_nossa_senhora_do_socorro.py b/data_collection/gazette/spiders/se/se_nossa_senhora_do_socorro.py index c1d192706..704eaf47f 100644 --- a/data_collection/gazette/spiders/se/se_nossa_senhora_do_socorro.py +++ b/data_collection/gazette/spiders/se/se_nossa_senhora_do_socorro.py @@ -6,5 +6,5 @@ class SeNossaSenhoraDoSocorroSpider(DoemGazetteSpider): TERRITORY_ID = "2804805" name = "se_nossa_senhora_do_socorro" - start_date = date(2022, 11, 7) # edition_number 1 state_city_url_part = "se/nossasenhoradosocorro" + start_date = date(2022, 11, 7) From b42efdaeadad27eb543debf1c67b6f1658ddfbca Mon Sep 17 00:00:00 2001 From: trevineju Date: Sat, 1 Jun 2024 12:31:39 -0300 Subject: [PATCH 4/4] =?UTF-8?q?Reduz=20o=20start=5Fdate=20de=20raspadores?= =?UTF-8?q?=20com=20s=C3=A9rie=20hist=C3=B3rica=20interrompida?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- data_collection/gazette/spiders/ba/ba_alcobaca.py | 2 +- 
data_collection/gazette/spiders/ba/ba_cipo.py | 2 +- data_collection/gazette/spiders/ba/ba_itapicuru.py | 2 +- data_collection/gazette/spiders/ba/ba_monte_santo.py | 2 +- data_collection/gazette/spiders/ba/ba_morro_do_chapeu.py | 2 +- data_collection/gazette/spiders/ba/ba_santo_estevao.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/data_collection/gazette/spiders/ba/ba_alcobaca.py b/data_collection/gazette/spiders/ba/ba_alcobaca.py index 6d1f268a9..e38ebb5f5 100644 --- a/data_collection/gazette/spiders/ba/ba_alcobaca.py +++ b/data_collection/gazette/spiders/ba/ba_alcobaca.py @@ -7,4 +7,4 @@ class BaAlcobacaSpider(DoemGazetteSpider): TERRITORY_ID = "2900801" name = "ba_alcobaca" state_city_url_part = "ba/alcobaca" - start_date = date(2012, 1, 2) + start_date = date(2017, 3, 3) diff --git a/data_collection/gazette/spiders/ba/ba_cipo.py b/data_collection/gazette/spiders/ba/ba_cipo.py index bb2b2d6ee..e11a2e990 100644 --- a/data_collection/gazette/spiders/ba/ba_cipo.py +++ b/data_collection/gazette/spiders/ba/ba_cipo.py @@ -7,4 +7,4 @@ class BaCipoSpider(DoemGazetteSpider): TERRITORY_ID = "2907905" name = "ba_cipo" state_city_url_part = "ba/cipo" - start_date = date(2012, 1, 2) + start_date = date(2021, 1, 4) diff --git a/data_collection/gazette/spiders/ba/ba_itapicuru.py b/data_collection/gazette/spiders/ba/ba_itapicuru.py index bae036652..95021c643 100644 --- a/data_collection/gazette/spiders/ba/ba_itapicuru.py +++ b/data_collection/gazette/spiders/ba/ba_itapicuru.py @@ -7,4 +7,4 @@ class BaItapicuruSpider(DoemGazetteSpider): TERRITORY_ID = "2916500" name = "ba_itapicuru" state_city_url_part = "ba/itapicuru" - start_date = date(2014, 1, 2) + start_date = date(2021, 1, 4) diff --git a/data_collection/gazette/spiders/ba/ba_monte_santo.py b/data_collection/gazette/spiders/ba/ba_monte_santo.py index a87bf2904..b5d35e0d5 100644 --- a/data_collection/gazette/spiders/ba/ba_monte_santo.py +++ b/data_collection/gazette/spiders/ba/ba_monte_santo.py @@ 
-7,4 +7,4 @@ class BaMonteSantoSpider(DoemGazetteSpider): TERRITORY_ID = "2921500" name = "ba_monte_santo" state_city_url_part = "ba/montesanto" - start_date = date(2013, 1, 9) + start_date = date(2021, 1, 2) diff --git a/data_collection/gazette/spiders/ba/ba_morro_do_chapeu.py b/data_collection/gazette/spiders/ba/ba_morro_do_chapeu.py index b173e1e20..3afa28548 100644 --- a/data_collection/gazette/spiders/ba/ba_morro_do_chapeu.py +++ b/data_collection/gazette/spiders/ba/ba_morro_do_chapeu.py @@ -7,4 +7,4 @@ class BaMorroDoChapeuSpider(DoemGazetteSpider): TERRITORY_ID = "2921708" name = "ba_morro_do_chapeu" state_city_url_part = "ba/morrodochapeu" - start_date = date(2013, 3, 1) + start_date = date(2021, 1, 6) diff --git a/data_collection/gazette/spiders/ba/ba_santo_estevao.py b/data_collection/gazette/spiders/ba/ba_santo_estevao.py index e7bed2be0..6eaa2f2dc 100644 --- a/data_collection/gazette/spiders/ba/ba_santo_estevao.py +++ b/data_collection/gazette/spiders/ba/ba_santo_estevao.py @@ -7,4 +7,4 @@ class BaSantoEstevaoSpider(DoemGazetteSpider): TERRITORY_ID = "2928802" name = "ba_santo_estevao" state_city_url_part = "ba/santoestevao" - start_date = date(2009, 11, 9) + start_date = date(2017, 1, 6)