Skip to content

Commit

Permalink
cria base portalgov
Browse files Browse the repository at this point in the history
  • Loading branch information
slfabio authored and trevineju committed Sep 6, 2024
1 parent 7ef3f3a commit befda9f
Show file tree
Hide file tree
Showing 3 changed files with 69 additions and 0 deletions.
46 changes: 46 additions & 0 deletions data_collection/gazette/spiders/base/portalgov.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
import re
from datetime import datetime as dt

import scrapy

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class BasePortalGovSpider(BaseGazetteSpider):
    """Base spider for portals that list gazettes via ``class_diario.php``.

    The spider reads ``self.domain`` (host name used to build both the listing
    request and the file URLs) and ``self.start_date`` / ``self.end_date``
    (date window applied in ``parse``); none of them is defined in this class,
    so they must be provided by a subclass or a parent class.
    """

    power = "executive"

    def start_requests(self):
        """Yield the single form request that asks the portal for its gazette list."""
        yield scrapy.FormRequest(
            url=f"https://{self.domain}/controllers/diario_oficial/class_diario.php",
            # NOTE(review): the meaning of these two values is not visible in
            # this file; presumably they select the "list gazettes" action.
            formdata={
                "func": "5",
                "param": "1",
            },
        )

    def parse(self, response):
        """Convert the JSON gazette listing into ``Gazette`` items.

        Each record is read through its ``data`` (``dd/mm/YYYY`` date),
        ``numero`` (edition text), ``descricao`` (description) and ``arquivo``
        (file name) keys. Records newer than ``self.end_date`` are skipped and
        iteration stops at the first record older than ``self.start_date``.

        A record whose edition text has no digits, or whose edition or
        description is missing or null, is still yielded (with an empty
        ``edition_number`` where applicable) instead of aborting the listing.
        """
        for gazette_data in response.json():
            gazette_date = dt.strptime(gazette_data["data"], "%d/%m/%Y").date()
            if gazette_date > self.end_date:
                continue
            if gazette_date < self.start_date:
                # NOTE(review): stopping here assumes the listing is ordered
                # newest-first - confirm against the endpoint.
                return

            # Normalise missing/null text so the regex calls below cannot fail.
            gazette_edition = gazette_data.get("numero") or ""
            gazette_desc = gazette_data.get("descricao") or ""

            # re.search returns None when the edition text has no digits.
            edition_match = re.search(r"\d+", gazette_edition)
            gazette_edition_number = edition_match.group(0) if edition_match else ""

            is_extra_edition = bool(
                re.search(r"extra|supl", gazette_edition + gazette_desc, re.IGNORECASE)
            )

            gazette_url = (
                f"https://{self.domain}/arquivos/diario_oficial/"
                f"{gazette_data['arquivo']}"
            )

            yield Gazette(
                date=gazette_date,
                edition_number=gazette_edition_number,
                file_urls=[gazette_url],
                is_extra_edition=is_extra_edition,
                power=self.power,
            )
11 changes: 11 additions & 0 deletions data_collection/gazette/spiders/rj/rj_sao_joao_da_barra.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from datetime import date

from gazette.spiders.base.portalgov import BasePortalGovSpider


class RjSaoJoaoDaBarraSpider(BasePortalGovSpider):
    """Spider configuration for the ``sjb.rj.gov.br`` gazette portal.

    Only class-level settings are declared here; the request and parsing
    logic is inherited from ``BasePortalGovSpider``.
    """

    # Spider identity.
    # NOTE(review): TERRITORY_ID is presumably the municipality's IBGE code - confirm.
    TERRITORY_ID = "3305000"
    name = "rj_sao_joao_da_barra"

    # Host used to build URLs, and the domain the crawl is restricted to.
    domain = "www.sjb.rj.gov.br"
    allowed_domains = ["sjb.rj.gov.br"]

    # Lower bound of the date window applied to the listing.
    start_date = date(year=2013, month=7, day=15)
12 changes: 12 additions & 0 deletions data_collection/gazette/spiders/rj/rj_varre_sai.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from datetime import date

from gazette.spiders.base.portalgov import BasePortalGovSpider


class RjVarreSaiSpider(BasePortalGovSpider):
    """Spider configuration for the ``varresai.rj.gov.br`` gazette portal.

    Only class-level settings are declared here; the request and parsing
    logic is inherited from ``BasePortalGovSpider``.
    """

    # Spider identity.
    # NOTE(review): TERRITORY_ID is presumably the municipality's IBGE code - confirm.
    TERRITORY_ID = "3306156"
    name = "rj_varre_sai"

    # Host used to build URLs, and the domain the crawl is restricted to.
    domain = "varresai.rj.gov.br"
    allowed_domains = ["varresai.rj.gov.br"]

    # Overrides the base class default of "executive".
    power = "executive_legislative"

    # Lower bound of the date window applied to the listing.
    start_date = date(year=2019, month=9, day=21)

0 comments on commit befda9f

Please sign in to comment.