Skip to content

Commit

Permalink
Uberlândia-MG spider (#638)
Browse files Browse the repository at this point in the history
Fix #621
  • Loading branch information
trevineju authored Apr 16, 2024
2 parents c6947f5 + 2b1d8f5 commit a6e515a
Show file tree
Hide file tree
Showing 2 changed files with 68 additions and 1 deletion.
66 changes: 66 additions & 0 deletions data_collection/gazette/spiders/mg/mg_uberlandia.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
import datetime

import dateparser
import scrapy
import w3lib
from dateutil.rrule import MONTHLY, rrule

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class MgUberlandiaSpider(BaseGazetteSpider):
    """Scraper for the official gazette of Uberlândia-MG (IBGE 3170206).

    Gazettes are published on the city's WordPress site in monthly archive
    pages, so one request is generated per month in the requested range.
    The archive URL accepts two different ``post_type`` query values; a 404
    on the first form is retried with the alternative form (see on_error).
    """

    TERRITORY_ID = "3170206"
    name = "mg_uberlandia"
    # Date of the earliest gazette available on the website.
    start_date = datetime.date(2005, 1, 3)

    def start_requests(self):
        """Yield one monthly-archive request per month in the date range."""
        # rrule iterates from dtstart in whole-month steps, so snap to the
        # first day of the start month to ensure the start month itself is
        # included even when start_date is not day 1.
        first_day_of_start_date_month = datetime.date(
            self.start_date.year, self.start_date.month, 1
        )
        months_of_interest = rrule(
            MONTHLY, dtstart=first_day_of_start_date_month, until=self.end_date
        )
        for month_date in months_of_interest:
            yield scrapy.Request(
                f"https://www.uberlandia.mg.gov.br/{month_date.year}/{month_date.month}/?post_type=diariooficial",
                errback=self.on_error,
            )

    def on_error(self, failure):
        # month/year URLs have two different valid query parameters:
        # post_type=diario_oficial or post_type=diariooficial
        # so if the first is not found, it will retry with the second type
        #
        # NOTE: this errback receives *all* request failures, not only HTTP
        # errors. Non-HTTP failures (DNS error, timeout, connection lost)
        # have no ``response`` attribute on failure.value, so guard before
        # dereferencing it to avoid an AttributeError inside the errback.
        response = getattr(failure.value, "response", None)
        if response is not None and response.status == 404:
            alternative_url = w3lib.url.add_or_replace_parameter(
                response.url, "post_type", "diario_oficial"
            )
            yield scrapy.Request(alternative_url)

    def parse(self, response):
        """Extract gazette items from a monthly archive page.

        Also follows pagination links within the month, which are parsed
        by this same callback (Scrapy's default).
        """
        gazettes = response.css("article.elementor-post")
        for gazette in gazettes:
            # The site uses two different themes/layouts across the years,
            # hence the pairs of alternative CSS selectors below.
            gazette_date = dateparser.parse(
                gazette.css(
                    ".elementor-post-date::text, .ee-post__metas__date::text"
                ).get()
            ).date()
            # Monthly archives can include items outside the requested
            # window; filter them out here.
            if gazette_date < self.start_date or gazette_date > self.end_date:
                continue

            edition = gazette.css("h3 a::text, h5::text")
            edition_number = edition.re_first(r"(\d+)")
            # Letters after the edition number (e.g. "6543 A") mark an
            # extra edition.
            is_extra_edition = bool(edition.re(r"\d+.*?([A-Za-z]+)"))

            gazette_url = gazette.css("a::attr(href)").get()

            yield Gazette(
                date=gazette_date,
                edition_number=edition_number,
                is_extra_edition=is_extra_edition,
                file_urls=[gazette_url],
                power="executive",
            )

        for page_url in response.css("nav a.page-numbers::attr(href)").getall():
            yield scrapy.Request(page_url)
3 changes: 2 additions & 1 deletion data_collection/requirements.in
Original file line number Diff line number Diff line change
Expand Up @@ -12,4 +12,5 @@ python-decouple
scrapy
scrapy-zyte-smartproxy
SQLAlchemy
spidermon
spidermon
w3lib

0 comments on commit a6e515a

Please sign in to comment.