diff --git a/data_collection/gazette/spiders/to/to_araguaina.py b/data_collection/gazette/spiders/to/to_araguaina.py
index 0d44ec621..5d5b5f6bd 100644
--- a/data_collection/gazette/spiders/to/to_araguaina.py
+++ b/data_collection/gazette/spiders/to/to_araguaina.py
@@ -1,51 +1,12 @@
-import datetime as dt
+from datetime import date
 
-import scrapy
+from gazette.spiders.base.barcodigital import BarcoDigitalSpider
 
 
-from gazette.items import Gazette
-from gazette.spiders.base import BaseGazetteSpider
-
-
-class ToAraguainaSpider(BaseGazetteSpider):
-    zyte_smartproxy_enabled = True
+class ToAraguainaSpider(BarcoDigitalSpider):
     name = "to_araguaina"
     TERRITORY_ID = "1702109"
-    allowed_domains = [
-        "diariooficial.araguaina.to.gov.br",
-        "diariooficial.araguaina.tk",
-    ]
-    start_date = dt.date(2011, 12, 6)
-
-    def start_requests(self):
-        formatted_start_date = self.start_date.strftime("%d/%m/%Y")
-        formatted_end_date = self.end_date.strftime("%d/%m/%Y")
-        yield scrapy.Request(
-            f"https://diariooficial.araguaina.to.gov.br/Pesquisa/?De={formatted_start_date}&Ate={formatted_end_date}"
-        )
-
-    def parse(self, response):
-        editions = response.css("#ctl00_ContentPlaceHolder1_gvResultado tbody tr")
-        for edition in editions:
-            raw_date = edition.xpath(".//td[2]/text()").get()
-            date = dt.datetime.strptime(raw_date, "%d/%m/%Y").date()
-            edition_number = edition.xpath(".//td[1]/text()").re_first(r"\d+")
-
-            gazette_item = Gazette(
-                date=date,
-                edition_number=edition_number,
-                is_extra_edition=False,
-                power="executive_legislative",
-            )
-
-            download_url = response.urljoin(edition.xpath(".//td[6]/a/@href").get())
-            yield scrapy.Request(
-                download_url,
-                method="HEAD",
-                callback=self.parse_gazette_download_url,
-                cb_kwargs={"gazette_item": gazette_item},
-            )
+    allowed_domains = ["api-araguaina.barcodigital.com.br"]
+    base_url = "https://api-araguaina.barcodigital.com.br"
 
-    def parse_gazette_download_url(self, response, gazette_item):
-        gazette_item["file_urls"] = [response.url]
-        yield gazette_item
+    start_date = date(2011, 12, 6)