Skip to content

Commit

Permalink
feat: novo spider custom para arraial do cabo (okfn-brasil#1261)
Browse files Browse the repository at this point in the history
  • Loading branch information
jjpaulo2 committed Sep 21, 2024
1 parent c3096ac commit 97b8cb4
Showing 1 changed file with 37 additions and 6 deletions.
43 changes: 37 additions & 6 deletions data_collection/gazette/spiders/rj/rj_arraial_do_cabo.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,42 @@
import datetime
from datetime import date, datetime

from gazette.spiders.base.instar import BaseInstarSpider
from scrapy import Request
from scrapy.http.response.html import HtmlResponse

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider

# NOTE(review): this region is a rendered diff whose +/- markers and
# indentation were stripped, so pre-commit and post-commit lines are
# interleaved. The page header reports 6 deletions; the lines tagged "old"
# below appear to be the removed ones (they depend on BaseInstarSpider and on
# the module-style `import datetime`), and the lines tagged "new" replace them.
class RjArraialdoCabopider(BaseInstarSpider):  # old: Instar-based class header

# NOTE(review): "Cabopider" looks like a typo for "CaboSpider" — confirm
# nothing refers to the class by name before renaming it.
class RjArraialdoCabopider(BaseGazetteSpider):  # new: custom spider class header
TERRITORY_ID = "3300258"  # sent as territory_id on every Gazette yielded by parse()

name = "rj_arraial_do_cabo"
allowed_domains = ["arraial.rj.gov.br"]  # old
base_url = "https://www.arraial.rj.gov.br/portal/diario-oficial"  # old
start_date = datetime.date(2019, 2, 7)  # old: only valid with `import datetime`
allowed_domains = ["portal.arraial.rj.gov.br"]  # new
start_urls = ["https://portal.arraial.rj.gov.br/diarios_oficiais_web"]  # new: gazette listing page
start_date = date(2019, 5, 7)  # new: lower bound of the date filter in parse()

def parse(self, response: HtmlResponse):
    """Yield one Gazette per edition card on a listing page, then paginate.

    Each ``.row .card.card-margin`` element is read for its edition number
    (from the ``h5.card-title`` text), its file link and its publication
    date. Cards dated outside ``[self.start_date, self.end_date]`` are
    skipped. After the cards, the ``rel="next"`` link (if any) is followed
    with this same method as callback.

    Args:
        response: HTML response of a gazette listing page.

    Yields:
        ``Gazette`` items for the in-range cards, followed by at most one
        ``Request`` for the next listing page.

    Raises:
        ValueError: if a card's date text does not match ``"%d %b %Y"``.
    """
    # NOTE(review): self.end_date is not defined in this class; presumably it
    # is supplied by the spider base class — confirm.
    for entry in response.css(".row .card.card-margin"):
        edition = entry.css("h5.card-title").re_first(r"(\d*) \/ \d{4}")
        file_url = entry.css(
            ".widget-49-meeting-action.mt-2 a::attr(href)"
        ).extract_first()
        raw_date = entry.css(".widget-49-date-day::text").extract_first()

        # A card without a date or a download link cannot produce a usable
        # item: strptime(None) raises TypeError and file_urls=[None] is not
        # downloadable. Skip it, but leave a trace so layout changes show up.
        if not raw_date or not file_url:
            self.logger.warning(
                "Skipping gazette card with missing date or file link on %s",
                response.url,
            )
            continue

        # NOTE(review): %b is locale-dependent; this assumes the month
        # abbreviations printed by the site match the process locale — confirm.
        publish_date = datetime.strptime(raw_date.strip(), "%d %b %Y").date()

        if not self.start_date <= publish_date <= self.end_date:
            continue

        yield Gazette(
            date=publish_date,
            # urljoin leaves absolute URLs untouched and resolves relative
            # hrefs against the page URL.
            file_urls=[response.urljoin(file_url)],
            edition_number=edition,
            is_extra_edition=False,
            territory_id=self.TERRITORY_ID,
            power="executive",
        )

    if next_page := response.xpath(
        '//a[contains(@rel, "next")]/@href'
    ).extract_first():
        # Request() rejects URLs without a scheme, so resolve the href first.
        yield Request(response.urljoin(next_page), callback=self.parse)

0 comments on commit 97b8cb4

Please sign in to comment.