Skip to content

Commit

Permalink
Cria Spider para Macaé/RJ (#1254)
Browse files Browse the repository at this point in the history
  • Loading branch information
trevineju authored Sep 25, 2024
2 parents 2efc8b9 + f023d01 commit 324d769
Showing 1 changed file with 55 additions and 0 deletions.
55 changes: 55 additions & 0 deletions data_collection/gazette/spiders/rj/rj_macae.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
import re
from datetime import date, datetime as dt

import scrapy

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class RjMacaeSpider(BaseGazetteSpider):
TERRITORY_ID = "3302403"
name = "rj_macae"
allowed_domains = ["macae.rj.gov.br"]
start_date = date(2020, 5, 22)

def start_requests(self):
yield scrapy.FormRequest(
url="https://sistemas.macae.rj.gov.br:840/diariooficial/index/listarajax",
method="POST",
formdata={
"periodode": self.start_date.strftime("%d/%m/%Y"),
"periodoate": self.end_date.strftime("%d/%m/%Y"),
},
)

def parse(self, response):
for data in response.json()["data"]:
gazette_code = data["DT_RowId"]
gazette_url = f"https://sistemas.macae.rj.gov.br:840/diariooficial/index/download?id={gazette_code}"

gazette_edition = data["edicao"]
gazette_edition_number = re.search(r"\d+", gazette_edition).group(0)

raw_gazette_date = re.search(
r"\d{2}\/\d{2}\/\d{4}", data["publicacao"]
).group()
gazette_date = dt.strptime(raw_gazette_date, "%d/%m/%Y").date()

gazette_item = {
"date": gazette_date,
"edition_number": gazette_edition_number,
"is_extra_edition": "EXTRA" in gazette_edition.upper(),
}

yield scrapy.Request(
url=gazette_url,
method="HEAD",
callback=self.parse_pdf_url,
cb_kwargs={"gazette_item": gazette_item},
)

def parse_pdf_url(self, response, gazette_item):
yield Gazette(
**gazette_item, file_urls=[response.url], power="executive_legislative"
)

0 comments on commit 324d769

Please sign in to comment.