from datetime import date, datetime

from scrapy import Request
from scrapy.http.response.html import HtmlResponse

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class RjArraialdoCabopider(BaseGazetteSpider):
    """Spider for the official gazette of Arraial do Cabo, RJ.

    Crawls the paginated edition listing at
    https://portal.arraial.rj.gov.br/diarios_oficiais_web and yields one
    ``Gazette`` item per edition card whose publication date falls inside
    the crawl window [``start_date``, ``end_date``].
    """

    # IBGE territory code for Arraial do Cabo (RJ).
    TERRITORY_ID = "3300258"
    name = "rj_arraial_do_cabo"
    allowed_domains = ["portal.arraial.rj.gov.br"]
    start_urls = ["https://portal.arraial.rj.gov.br/diarios_oficiais_web"]
    # Earliest edition known to be available on the portal.
    start_date = date(2019, 5, 7)

    def parse(self, response: HtmlResponse):
        """Extract gazette editions from one listing page and follow pagination.

        Each ``.card.card-margin`` element is one edition: the edition
        number is scraped from the card title, the PDF link from the
        card's action anchor, and the publication date from the card's
        date widget.
        """
        for entry in response.css(".row .card.card-margin"):
            # Card title holds "<edition> / <year>"; capture only the edition part.
            edition = entry.css("h5.card-title").re_first(r"(\d*) \/ \d{4}")
            file_url = entry.css(
                ".widget-49-meeting-action.mt-2 a::attr(href)"
            ).extract_first()
            publish_date = entry.css(".widget-49-date-day::text").extract_first()
            # NOTE(review): "%b" matches locale-dependent month abbreviations;
            # this assumes the portal renders dates with abbreviations the
            # running process locale can parse — confirm against live pages
            # (Portuguese month names would fail under the default C locale).
            publish_date = datetime.strptime(publish_date, "%d %b %Y").date()

            # Skip editions outside the requested crawl window.
            if not self.start_date <= publish_date <= self.end_date:
                continue

            yield Gazette(
                date=publish_date,
                file_urls=[file_url],
                edition_number=edition,
                is_extra_edition=False,
                territory_id=self.TERRITORY_ID,
                power="executive",
            )

        # Follow the rel="next" pagination link, if any.
        # NOTE(review): assumes the href is an absolute URL; if the portal
        # emits relative hrefs, response.follow(...) would be needed — verify.
        if next_page := response.xpath(
            '//a[contains(@rel, "next")]/@href'
        ).extract_first():
            yield Request(next_page, callback=self.parse)