From d8126df38da60ae9bc2c89bf32ea109708b62aac Mon Sep 17 00:00:00 2001
From: Renne Rocha
Date: Mon, 5 Sep 2022 21:31:38 -0300
Subject: [PATCH 1/3] =?UTF-8?q?Uberl=C3=A2ndia-MG=20spider?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../gazette/spiders/mg_uberlandia.py          | 95 +++++++++++++++++++
 1 file changed, 95 insertions(+)
 create mode 100644 data_collection/gazette/spiders/mg_uberlandia.py

diff --git a/data_collection/gazette/spiders/mg_uberlandia.py b/data_collection/gazette/spiders/mg_uberlandia.py
new file mode 100644
index 000000000..471b2d68d
--- /dev/null
+++ b/data_collection/gazette/spiders/mg_uberlandia.py
@@ -0,0 +1,95 @@
+import datetime
+
+import dateparser
+import scrapy
+import w3lib.url
+from dateutil.rrule import MONTHLY, rrule
+
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
+
+
+class MgUberlandiaSpider(BaseGazetteSpider):
+    """Spider for the official gazette of Uberlândia-MG.
+
+    Requests one listing page per month between ``start_date`` and
+    ``end_date`` and follows the pagination links of each listing.
+    """
+
+    TERRITORY_ID = "3170206"
+    name = "mg_uberlandia"
+    start_date = datetime.date(2005, 1, 3)
+
+    def start_requests(self):
+        """Yield one request per month in the collection interval."""
+        first_day_of_start_date_month = datetime.date(
+            self.start_date.year, self.start_date.month, 1
+        )
+        months_of_interest = rrule(
+            MONTHLY, dtstart=first_day_of_start_date_month, until=self.end_date
+        )
+        for month_date in months_of_interest:
+            yield scrapy.Request(
+                f"https://www.uberlandia.mg.gov.br/{month_date.year}/{month_date.month}/?post_type=diariooficial",
+                errback=self.on_error,
+            )
+
+    def on_error(self, failure):
+        """Retry a month listing that returned 404 with the other post type."""
+        # Only HTTP errors carry a response; DNS failures, timeouts and
+        # similar errors do not, so ``failure.value.response`` cannot be
+        # read unconditionally.
+        response = getattr(failure.value, "response", None)
+        if response is None:
+            self.logger.error(repr(failure))
+            return
+
+        # month/year URLs have two different valid query parameters:
+        # post_type=diariooficial or post_type=diario_oficial
+        # so if the first is not found, it will retry with the second type
+        if response.status == 404:
+            alternative_url = w3lib.url.add_or_replace_parameter(
+                response.url, "post_type", "diario_oficial"
+            )
+            yield scrapy.Request(alternative_url)
+
+    def parse(self, response):
+        """Yield the gazettes of a listing page and follow its pagination."""
+        gazettes = response.css("article.elementor-post")
+        for gazette in gazettes:
+            raw_date = gazette.css(
+                ".elementor-post-date::text, .ee-post__metas__date::text"
+            ).get()
+            # dateparser.parse() rejects non-string input and returns None
+            # for text it cannot understand; skip the entry instead of
+            # aborting the whole page.
+            parsed_date = dateparser.parse(raw_date) if raw_date else None
+            if parsed_date is None:
+                self.logger.warning(
+                    "Could not parse gazette date %r at %s", raw_date, response.url
+                )
+                continue
+
+            gazette_date = parsed_date.date()
+            if gazette_date < self.start_date or gazette_date > self.end_date:
+                continue
+
+            edition = gazette.css("h3 a::text, h5::text")
+            edition_number = edition.re_first(r"(\d+)")
+            is_extra_edition = bool(edition.re(r"\d+.*?([A-Za-z]+)"))
+
+            gazette_url = gazette.css("a::attr(href)").get()
+            if not gazette_url:
+                # Without a link there is no file to download.
+                continue
+
+            yield Gazette(
+                date=gazette_date,
+                edition_number=edition_number,
+                is_extra_edition=is_extra_edition,
+                file_urls=[gazette_url],
+                power="executive",
+            )
+
+        for page_url in response.css("nav a.page-numbers::attr(href)").getall():
+            yield scrapy.Request(page_url)

From 46342f625a1f4a5ad87bb550f7b64c0ca86cf2a1 Mon Sep 17 00:00:00 2001
From: trevineju
Date: Tue, 16 Apr 2024 15:43:33 -0300
Subject: [PATCH 2/3] =?UTF-8?q?Move=20spider=20de=20Uberl=C3=A2ndia=20para?=
 =?UTF-8?q?=20diret=C3=B3rio=20MG?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 data_collection/gazette/spiders/{ => mg}/mg_uberlandia.py | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename data_collection/gazette/spiders/{ => mg}/mg_uberlandia.py (100%)

diff --git a/data_collection/gazette/spiders/mg_uberlandia.py b/data_collection/gazette/spiders/mg/mg_uberlandia.py
similarity index 100%
rename from data_collection/gazette/spiders/mg_uberlandia.py
rename to data_collection/gazette/spiders/mg/mg_uberlandia.py

From 2b1d8f5e26635be11ad53d59ec30e3616a23a665 Mon Sep 17 00:00:00 2001
From: trevineju
Date: Tue, 16 Apr 2024 15:45:29 -0300
Subject: [PATCH 3/3] Adiciona w3lib ao requirements.in

---
 data_collection/requirements.in | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/data_collection/requirements.in b/data_collection/requirements.in
index 48dda1918..41cd69666 100644
--- a/data_collection/requirements.in
+++ b/data_collection/requirements.in
@@ -12,4 +12,5 @@ python-decouple
 scrapy
 scrapy-zyte-smartproxy
 SQLAlchemy
-spidermon
\ No newline at end of file
+spidermon
+w3lib
\ No newline at end of file