Skip to content

Commit

Permalink
Modifica spider DOEM para coletar mais diários (#1041) (#1152)
Browse files Browse the repository at this point in the history
  • Loading branch information
trevineju authored Jun 1, 2024
2 parents 29b69e7 + b42efda commit f4e0dcd
Show file tree
Hide file tree
Showing 38 changed files with 74 additions and 86 deletions.
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_acajutiba.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaAcajutibaSpider(DoemGazetteSpider):
TERRITORY_ID = "2900306"
name = "ba_acajutiba"
start_date = date(2018, 1, 2)
state_city_url_part = "ba/acajutiba"
start_date = date(2013, 1, 30)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_alagoinhas.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaAlagoinhasSpider(DoemGazetteSpider):
TERRITORY_ID = "2900702"
name = "ba_alagoinhas"
start_date = date(2018, 1, 2) # edition_number 1.950
state_city_url_part = "ba/alagoinhas"
start_date = date(2015, 1, 28)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_angical.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaAngicalSpider(DoemGazetteSpider):
TERRITORY_ID = "2901403"
name = "ba_angical"
start_date = date(2021, 1, 4)
state_city_url_part = "ba/angical"
start_date = date(2021, 1, 4)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_caetite.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaCaetiteSpider(DoemGazetteSpider):
TERRITORY_ID = "2905206"
name = "ba_caetite"
start_date = date(2021, 4, 27)
state_city_url_part = "ba/caetite"
start_date = date(2021, 4, 27)
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaCampoAlegreDeLourdesSpider(DoemGazetteSpider):
TERRITORY_ID = "2905909"
name = "ba_campo_alegre_de_lourdes"
start_date = date(2020, 11, 30) # Primeira edição em 30/11/2020
state_city_url_part = "ba/campoalegredelourdes"
start_date = date(2020, 11, 30)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_campo_formoso.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaCampoFormosoSpider(DoemGazetteSpider):
TERRITORY_ID = "2906006"
name = "ba_campo_formoso"
start_date = date(2018, 1, 3) # edition_number 873
state_city_url_part = "ba/campoformoso"
start_date = date(2013, 1, 31)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_canudos.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaCanudosSpider(DoemGazetteSpider):
TERRITORY_ID = "2906824"
name = "ba_canudos"
start_date = date(2013, 1, 4) # edition number 444
state_city_url_part = "ba/canudos"
start_date = date(2013, 1, 4)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_cipo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaCipoSpider(DoemGazetteSpider):
TERRITORY_ID = "2907905"
name = "ba_cipo"
start_date = date(2021, 1, 4)
state_city_url_part = "ba/cipo"
start_date = date(2021, 1, 4)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_cotegipe.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaCotegipeSpider(DoemGazetteSpider):
TERRITORY_ID = "2909406"
name = "ba_cotegipe"
start_date = date(2023, 1, 5)
state_city_url_part = "ba/cotegipe"
start_date = date(2023, 1, 5)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_cristopolis.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaCristopolisSpider(DoemGazetteSpider):
TERRITORY_ID = "2909703"
name = "ba_cristopolis"
start_date = date(2021, 1, 12)
state_city_url_part = "ba/cristopolis"
start_date = date(2021, 1, 12)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_cruz_das_almas.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaCruzDasAlmasSpider(DoemGazetteSpider):
TERRITORY_ID = "2909802"
name = "ba_cruz_das_almas"
start_date = date(2021, 4, 1)
state_city_url_part = "ba/cruzdasalmas"
start_date = date(2021, 4, 1)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_esplanada.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaEsplanadaSpider(DoemGazetteSpider):
TERRITORY_ID = "2910602"
name = "ba_esplanada"
start_date = date(2021, 1, 4)
state_city_url_part = "ba/esplanada"
start_date = date(2021, 1, 4)
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaFormosaDoRioPretoSpider(DoemGazetteSpider):
TERRITORY_ID = "2911105"
name = "ba_formosa_do_rio_preto"
start_date = date(2021, 1, 4)
state_city_url_part = "ba/formosadoriopreto"
start_date = date(2021, 1, 4)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_irara.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaIraraSpider(DoemGazetteSpider):
TERRITORY_ID = "2914505"
name = "ba_irara"
start_date = date(2018, 1, 3)
state_city_url_part = "ba/irara"
start_date = date(2014, 4, 24)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_itaberaba.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaItaberabaSpider(DoemGazetteSpider):
TERRITORY_ID = "2914703"
name = "ba_itaberaba"
start_date = date(2022, 7, 4)
state_city_url_part = "ba/itaberaba"
start_date = date(2022, 7, 4)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_itamaraju.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaItamarajuSpider(DoemGazetteSpider):
TERRITORY_ID = "2915601"
name = "ba_itamaraju"
start_date = date(2008, 3, 28)
state_city_url_part = "ba/itamaraju"
start_date = date(2008, 3, 28)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_itapicuru.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaItapicuruSpider(DoemGazetteSpider):
TERRITORY_ID = "2916500"
name = "ba_itapicuru"
start_date = date(2021, 1, 4)
state_city_url_part = "ba/itapicuru"
start_date = date(2021, 1, 4)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_ituacu.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaItuacuSpider(DoemGazetteSpider):
TERRITORY_ID = "2917201"
name = "ba_ituacu"
start_date = date(2018, 1, 2)
state_city_url_part = "ba/ituacu"
start_date = date(2015, 2, 4)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_jaguaquara.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaJaguaquaraSpider(DoemGazetteSpider):
TERRITORY_ID = "2917607"
name = "ba_jaguaquara"
start_date = date(2021, 4, 5)
state_city_url_part = "ba/jaguaquara"
start_date = date(2021, 4, 5)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_juazeiro.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaJuazeiroSpider(DoemGazetteSpider):
TERRITORY_ID = "2918407"
name = "ba_juazeiro"
start_date = date(2018, 1, 2) # edition_number 1.135
state_city_url_part = "ba/juazeiro"
start_date = date(2013, 2, 1)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_laje.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaLajeSpider(DoemGazetteSpider):
TERRITORY_ID = "2918803"
name = "ba_laje"
start_date = date(2020, 1, 8)
state_city_url_part = "ba/laje"
start_date = date(2020, 1, 8)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_lajedao.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaLajedaoSpider(DoemGazetteSpider):
TERRITORY_ID = "2918902"
name = "ba_lajedao"
start_date = date(2021, 4, 14)
state_city_url_part = "ba/lajedao"
start_date = date(2021, 4, 14)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_macajuba.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaMacajubaSpider(DoemGazetteSpider):
TERRITORY_ID = "2919603"
name = "ba_macajuba"
start_date = date(2014, 3, 17)
state_city_url_part = "ba/macajuba"
start_date = date(2014, 3, 17)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_mascote.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaMascoteSpider(DoemGazetteSpider):
TERRITORY_ID = "2920908"
name = "ba_mascote"
start_date = date(2010, 1, 4) # edition number 1
state_city_url_part = "ba/mascote"
start_date = date(2010, 1, 4)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_monte_santo.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaMonteSantoSpider(DoemGazetteSpider):
TERRITORY_ID = "2921500"
name = "ba_monte_santo"
start_date = date(2021, 1, 2)
state_city_url_part = "ba/montesanto"
start_date = date(2021, 1, 2)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_morro_do_chapeu.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaMorroDoChapeuSpider(DoemGazetteSpider):
TERRITORY_ID = "2921708"
name = "ba_morro_do_chapeu"
start_date = date(2021, 1, 6)
state_city_url_part = "ba/morrodochapeu"
start_date = date(2021, 1, 6)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_mucuri.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaMucuriSpider(DoemGazetteSpider):
TERRITORY_ID = "2922003"
name = "ba_mucuri"
start_date = date(2018, 1, 3)
state_city_url_part = "ba/mucuri"
start_date = date(2011, 4, 5)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_prado.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ class BaPradoSpider(DoemGazetteSpider):
TERRITORY_ID = "2925501"
name = "ba_prado"
state_city_url_part = "ba/prado"
start_date = date(2018, 1, 2)
start_date = date(2013, 2, 4)
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaSantaRitaDeCassiaSpider(DoemGazetteSpider):
TERRITORY_ID = "2928406"
name = "ba_santa_rita_de_cassia"
start_date = date(2021, 1, 4)
state_city_url_part = "ba/santaritadecassia"
start_date = date(2021, 1, 4)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_satiro_dias.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaSatiroDiasSpider(DoemGazetteSpider):
TERRITORY_ID = "2929701"
name = "ba_satiro_dias"
start_date = date(2021, 3, 30)
state_city_url_part = "ba/satirodias"
start_date = date(2021, 3, 30)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_senhor_do_bonfim.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaSenhorDoBonfimSpider(DoemGazetteSpider):
TERRITORY_ID = "2930105"
name = "ba_senhor_do_bonfim"
start_date = date(2018, 1, 2) # edition_number 1.503
state_city_url_part = "ba/senhordobonfim"
start_date = date(2013, 1, 3)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_tapiramuta.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class BaTapiramutaSpider(DoemGazetteSpider):
TERRITORY_ID = "2931301"
name = "ba_tapiramuta"
start_date = date(2021, 1, 4)
state_city_url_part = "ba/tapiramuta"
start_date = date(2021, 1, 4)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/ba/ba_tucano.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,4 +7,4 @@ class BaTucanoSpider(DoemGazetteSpider):
TERRITORY_ID = "2931905"
name = "ba_tucano"
state_city_url_part = "ba/tucano"
start_date = date(2018, 1, 2)
start_date = date(2013, 1, 4)
76 changes: 27 additions & 49 deletions data_collection/gazette/spiders/base/doem.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import datetime as dt

import dateparser
import scrapy
from dateutil.rrule import MONTHLY, rrule

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider
Expand All @@ -12,27 +11,31 @@ class DoemGazetteSpider(BaseGazetteSpider):
Base spider for all cities listed on https://doem.org.br
"""

allowed_domains = ["doem.org.br"]

# Must be defined in child class
state_city_url_part = None
start_date = None

custom_settings = {
"DOWNLOAD_FAIL_ON_DATALOSS": False,
}

allowed_domains = ["doem.org.br"]
start_date = dt.date(2009, 1, 1)

def start_requests(self):
yield scrapy.Request(self.get_url())

def parse_pagination(self, response):
"""
This parse function is used to get all the pages available and
return request object for each one
"""
return [
scrapy.Request(self.get_url(page), callback=self.parse)
for page in range(1, 1 + self.get_last_page(response))
# Build one "YYYY/MM" listing key per month between start_date and end_date.
# The recurrence is anchored on the 1st of the start month: dateutil's rrule
# skips (rather than clamps) occurrences that land on a non-existent day, so
# anchoring on a start_date that falls on the 29th-31st would silently drop
# shorter months (e.g. date(2013, 1, 31) never produces "2013/02").
# Anchoring on day 1 also means the end_date month is always generated when
# start_date <= end_date, because `until` is inclusive.
month_years = [
    month_start.strftime("%Y/%m")
    for month_start in rrule(
        freq=MONTHLY,
        dtstart=self.start_date.replace(day=1),
        until=self.end_date,
    )
]

def parse(self, response, page=1):
if self.end_date.strftime("%Y/%m") not in month_years:
month_years.append(self.end_date.strftime("%Y/%m"))

for month_year in month_years:
yield scrapy.Request(
f"https://doem.org.br/{self.state_city_url_part}/diarios/{month_year}"
)

def parse(self, response):
"""
Parse each page from the results page and yield the gazette issues available.
"""
Expand All @@ -43,39 +46,14 @@ def parse(self, response, page=1):
date = self.get_gazette_date(gazette_box)
edition_number = self.get_edition_number(gazette_box)

if date > self.end_date:
continue
elif date < self.start_date:
return

yield Gazette(
date=date,
file_urls=[file_url],
edition_number=edition_number,
is_extra_edition=False,
power="executive_legislative",
)

last_page = self.get_last_page(response)
if page < last_page:
yield scrapy.Request(
url=self.get_url(page + 1), cb_kwargs={"page": page + 1}
)

def get_url(self, page=1):
url = f"https://doem.org.br/{self.state_city_url_part}"
start_date = self.start_date.strftime("%Y-%m-%d")
end_date = self.end_date.strftime("%Y-%m-%d")
return f"{url}/pesquisar?data_inicial={start_date}&data_final={end_date}&page={page}"

def get_last_page(self, response):
"""
Gets the last page number available in the pages navigation menu
"""
pages = response.css("ul.pagination li a::text").getall()
if len(pages) == 0:
return 1
return max([int(page) for page in pages if page.isnumeric()])
if self.start_date < date < self.end_date:
yield Gazette(
date=date,
file_urls=[file_url],
edition_number=edition_number,
is_extra_edition=False,
power="executive_legislative",
)

def get_pdf_url(self, response_item):
"""
Expand Down
4 changes: 2 additions & 2 deletions data_collection/gazette/spiders/pe/pe_petrolina.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
import datetime as dt
from datetime import date

from gazette.spiders.base.doem import DoemGazetteSpider


class PePetrolinaSpider(DoemGazetteSpider):
TERRITORY_ID = "2611101"
name = "pe_petrolina"
start_date = dt.date(2014, 3, 6)
state_city_url_part = "pe/petrolina"
start_date = date(2014, 3, 6)
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/pr/pr_ipiranga.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from datetime import date

from gazette.spiders.base.doem import DoemGazetteSpider


# Gazette spider for the "pr/ipiranga" section of doem.org.br. It only sets
# configuration attributes; request generation and parsing are inherited
# from DoemGazetteSpider.
class PrIpirangaSpider(DoemGazetteSpider):
# NOTE(review): presumably the IBGE municipality code for Ipiranga (PR) - confirm.
TERRITORY_ID = "4110508"
# NOTE(review): presumably the Scrapy spider name used to select this spider - confirm.
name = "pr_ipiranga"
# Path segment the base spider interpolates into
# https://doem.org.br/{state_city_url_part}/diarios/{YYYY/MM}.
state_city_url_part = "pr/ipiranga"
# Earliest date collected; the base spider requests monthly listings from
# this date's month onwards.
start_date = date(2015, 9, 28)
2 changes: 1 addition & 1 deletion data_collection/gazette/spiders/pr/pr_tamboara.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class PrTamboaraSpider(DoemGazetteSpider):
TERRITORY_ID = "4126702"
name = "pr_tamboara"
start_date = date(2022, 8, 22)
state_city_url_part = "pr/tamboara"
start_date = date(2022, 8, 22)
Original file line number Diff line number Diff line change
Expand Up @@ -6,5 +6,5 @@
class SeNossaSenhoraDoSocorroSpider(DoemGazetteSpider):
TERRITORY_ID = "2804805"
name = "se_nossa_senhora_do_socorro"
start_date = date(2022, 11, 7) # edition_number 1
state_city_url_part = "se/nossasenhoradosocorro"
start_date = date(2022, 11, 7)

0 comments on commit f4e0dcd

Please sign in to comment.