Skip to content

Commit

Permalink
Adiciona base para o sistema modernizacao (#1236)
Browse files Browse the repository at this point in the history
  • Loading branch information
trevineju authored Aug 5, 2024
2 parents ca62905 + 9062924 commit 262f95e
Show file tree
Hide file tree
Showing 8 changed files with 137 additions and 68 deletions.
78 changes: 78 additions & 0 deletions data_collection/gazette/spiders/base/modernizacao.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
import re
from datetime import date, datetime

import scrapy
from dateutil.rrule import MONTHLY, rrule

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class BaseModernizacaoSpider(BaseGazetteSpider):
    """Base spider for municipalities served by the "Modernizacao" transparency portal.

    Subclasses must define ``allowed_domains`` with a single domain and may
    override ``power``, ``ver_subpath``, ``start_date`` and ``end_date``.
    """

    # Most of these portals mix acts from both powers; subclasses override when not.
    power = "executive_legislative"
    # Version path segment of the attachment-download endpoint; varies per portal.
    ver_subpath = "ver20230623"

    custom_settings = {
        "CONCURRENT_REQUESTS": 4,
        "DOWNLOAD_DELAY": 0.75,
    }

    def start_requests(self):
        """Query the monthly gazette listing endpoint for each month in range."""
        domain = self.allowed_domains[0]
        base_url = f"https://{domain}/diario_oficial_get.php"
        # Anchor on the first day of the month so rrule yields the start month too.
        initial_date = date(self.start_date.year, self.start_date.month, 1)

        for monthly_date in rrule(
            freq=MONTHLY, dtstart=initial_date, until=self.end_date
        ):
            # The endpoint expects "M/YYYY" without a leading zero (e.g. "9/2022").
            month_year = monthly_date.strftime("%m/%Y").lstrip("0")
            yield scrapy.FormRequest(
                method="GET",
                url=base_url,
                formdata={"mesano": month_year},
            )

    def parse(self, response):
        """Parse the JSON month listing and request each gazette file for validation."""
        for gazette_data in response.json():
            raw_gazette_date = gazette_data["Data_Formatada"]
            gazette_date = datetime.strptime(raw_gazette_date, "%d/%m/%Y").date()
            if not self.start_date <= gazette_date <= self.end_date:
                continue

            gazette_code = gazette_data["Codigo_ANEXO"]
            gazette_url = response.urljoin(
                f"{self.ver_subpath}/WEB-ObterAnexo.rule?sys=LAI&codigo={gazette_code}"
            )

            # The edition number is the first run of digits in "ANEXO", when present.
            raw_edition_number = gazette_data["ANEXO"]
            gazette_edition_number = re.search(r"\d+", raw_edition_number)

            if gazette_edition_number is None:
                gazette_edition_number = ""
            else:
                gazette_edition_number = gazette_edition_number.group(0)

            # Markers like "extra", "supl(emento)", "ee" or "esp(ecial)" flag
            # an extra edition.
            is_extra_edition = bool(
                re.search(r"extra|supl|ee|esp", raw_edition_number, re.IGNORECASE)
            )

            yield scrapy.Request(
                gazette_url,
                method="GET",
                callback=self.parse_valid_gazette_file,
                cb_kwargs={
                    "gazette": Gazette(
                        date=gazette_date,
                        edition_number=gazette_edition_number,
                        file_urls=[gazette_url],
                        is_extra_edition=is_extra_edition,
                        power=self.power,
                    )
                },
            )

    def parse_valid_gazette_file(self, response, gazette):
        """Yield the gazette only when its PDF is actually available."""
        # The header only has Content-Length when the PDF is unavailable.
        if not response.headers.getlist("Content-Length"):
            yield gazette
44 changes: 4 additions & 40 deletions data_collection/gazette/spiders/rj/rj_belford_roxo.py
Original file line number Diff line number Diff line change
@@ -1,47 +1,11 @@
from datetime import date, datetime
from datetime import date

import scrapy
from dateutil.rrule import MONTHLY, rrule
from gazette.spiders.base.modernizacao import BaseModernizacaoSpider

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class RjBelfordRoxoSpider(BaseGazetteSpider):
class RjBelfordRoxoSpider(BaseModernizacaoSpider):
TERRITORY_ID = "3300456"
name = "rj_belford_roxo"
allowed_domains = ["transparencia.prefeituradebelfordroxo.rj.gov.br"]
BASE_URL = "https://transparencia.prefeituradebelfordroxo.rj.gov.br/webrun/WEB-ObterAnexo.rule?sys=LAI&codigo={ATTACHMENT_CODE}"

start_date = date(2019, 1, 2)

def start_requests(self):
url = "https://transparencia.prefeituradebelfordroxo.rj.gov.br/diario_oficial_get.php"
initial_date = date(self.start_date.year, self.start_date.month, 1)

for monthly_date in rrule(
freq=MONTHLY, dtstart=initial_date, until=self.end_date
):
month_year = monthly_date.strftime("%m/%Y").lstrip("0") # like 9/2022
yield scrapy.FormRequest(
url=url,
formdata={"mesano": month_year},
)

def parse(self, response):
for gazette_data in response.json():
raw_gazette_date = gazette_data["Data_Formatada"]
gazette_date = datetime.strptime(raw_gazette_date, "%d/%m/%Y").date()
if gazette_date < self.start_date or self.end_date < gazette_date:
continue
gazette_code = gazette_data["Codigo_ANEXO"]
gazette_edition_number = gazette_data["ANEXO"]
gazette_url = self.BASE_URL.format(ATTACHMENT_CODE=gazette_code)

yield Gazette(
date=gazette_date,
edition_number=gazette_edition_number,
file_urls=[gazette_url],
is_extra_edition=False,
power="executive",
)
power = "executive"
12 changes: 12 additions & 0 deletions data_collection/gazette/spiders/rj/rj_mesquita.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
import datetime as dt

from gazette.spiders.base.modernizacao import BaseModernizacaoSpider


class RjMesquitaSpider(BaseModernizacaoSpider):
    """Gazette spider for Mesquita/RJ (Modernizacao transparency portal)."""

    name = "rj_mesquita"
    TERRITORY_ID = "3302858"
    power = "executive"
    allowed_domains = ["transparencia.mesquita.rj.gov.br"]
    # This portal serves attachments under a newer version path.
    ver_subpath = "ver20240713"
    # Oldest date this spider collects from.
    start_date = dt.date(2015, 7, 15)
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/rj/rj_miguel_pereira.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import datetime as dt

from gazette.spiders.base.modernizacao import BaseModernizacaoSpider


class RjMiguelPereiraSpider(BaseModernizacaoSpider):
    """Gazette spider for Miguel Pereira/RJ (Modernizacao transparency portal)."""

    name = "rj_miguel_pereira"
    TERRITORY_ID = "3302908"
    allowed_domains = ["transparencia.miguelpereira.rj.gov.br"]
    # Oldest date this spider collects from.
    start_date = dt.date(2021, 9, 3)
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/rj/rj_quatis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import datetime as dt

from gazette.spiders.base.modernizacao import BaseModernizacaoSpider


class RjQuatisSpider(BaseModernizacaoSpider):
    """Gazette spider for Quatis/RJ (Modernizacao transparency portal)."""

    name = "rj_quatis"
    TERRITORY_ID = "3304128"
    allowed_domains = ["transparencia.quatis.rj.gov.br"]
    # Oldest date this spider collects from.
    start_date = dt.date(2021, 1, 11)
10 changes: 10 additions & 0 deletions data_collection/gazette/spiders/rj/rj_queimados.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
import datetime as dt

from gazette.spiders.base.modernizacao import BaseModernizacaoSpider


class RjQueimadosSpider(BaseModernizacaoSpider):
    """Gazette spider for Queimados/RJ (Modernizacao transparency portal)."""

    name = "rj_queimados"
    TERRITORY_ID = "3304144"
    allowed_domains = ["transparencia.queimados.rj.gov.br"]
    # Oldest date this spider collects from.
    start_date = dt.date(2018, 1, 3)
30 changes: 2 additions & 28 deletions data_collection/gazette/spiders/rj/rj_sao_joao_de_meriti.py
Original file line number Diff line number Diff line change
@@ -1,36 +1,10 @@
import datetime as dt

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider
from gazette.spiders.base.modernizacao import BaseModernizacaoSpider


class RjSaoJoaoDeMeritiSpider(BaseGazetteSpider):
class RjSaoJoaoDeMeritiSpider(BaseModernizacaoSpider):
TERRITORY_ID = "3305109"
name = "rj_sao_joao_de_meriti"
allowed_domains = ["transparencia.meriti.rj.gov.br"]
start_urls = ["https://transparencia.meriti.rj.gov.br/diario_oficial_get.php"]
BASE_URL = "https://transparencia.meriti.rj.gov.br/ver20230623/WEB-ObterAnexo.rule?sys=LAI&codigo="
start_date = dt.date(2017, 1, 1)
custom_settings = {"DOWNLOAD_DELAY": 0.5, "RANDOMIZE_DOWNLOAD_DELAY": True}

def parse(self, response):
for gazette_data in response.json():
raw_gazette_date = gazette_data["Data_Formatada"]
gazette_date = dt.datetime.strptime(raw_gazette_date, "%d/%m/%Y").date()

if not self.start_date <= gazette_date <= self.end_date:
continue
gazette_code = gazette_data["Codigo_ANEXO"]
# links quebrados no portal de transparência
if gazette_code == 1:
continue
gazette_edition_number = gazette_data["ANEXO"]
gazette_url = f"{self.BASE_URL}{gazette_code}"

yield Gazette(
date=gazette_date,
edition_number=gazette_edition_number,
file_urls=[gazette_url],
is_extra_edition=False,
power="executive_legislative",
)
11 changes: 11 additions & 0 deletions data_collection/gazette/spiders/rj/rj_sao_pedro_da_aldeia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import datetime as dt

from gazette.spiders.base.modernizacao import BaseModernizacaoSpider


class RjSaoPedroDaAldeiaSpider(BaseModernizacaoSpider):
    """Gazette spider for Sao Pedro da Aldeia/RJ (Modernizacao transparency portal)."""

    name = "rj_sao_pedro_da_aldeia"
    TERRITORY_ID = "3305208"
    allowed_domains = ["transparencia.pmspa.rj.gov.br"]
    # This portal serves attachments under a newer version path.
    ver_subpath = "ver20240713"
    # Oldest date this spider collects from.
    start_date = dt.date(2018, 1, 15)

0 comments on commit 262f95e

Please sign in to comment.