Updates Uberlândia-MG scraper (#1129)
trevineju authored Apr 17, 2024
2 parents 8b2f1cf + d0ad000 commit 73035aa
Showing 1 changed file with 26 additions and 9 deletions.
35 changes: 26 additions & 9 deletions data_collection/gazette/spiders/mg/mg_uberlandia.py
@@ -1,4 +1,5 @@
import datetime
import re

import dateparser
import scrapy
@@ -10,11 +11,14 @@


class MgUberlandiaSpider(BaseGazetteSpider):
zyte_smartproxy_enabled = True

TERRITORY_ID = "3170206"
name = "mg_uberlandia"
start_date = datetime.date(2005, 1, 3)
allowed_domains = ["uberlandia.mg.gov.br"]

custom_settings = {
"USER_AGENT": "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/123.0.0.0 Safari/537.36"
}

def start_requests(self):
first_day_of_start_date_month = datetime.date(
@@ -54,15 +58,28 @@ def parse(self, response):
edition_number = edition.re_first(r"(\d+)")
is_extra_edition = bool(edition.re(r"\d+.*?([A-Za-z]+)"))

gazette_url = gazette.css("a::attr(href)").get()
intermediary_page_url = gazette.css("a::attr(href)").get()

yield Gazette(
date=gazette_date,
edition_number=edition_number,
is_extra_edition=is_extra_edition,
file_urls=[gazette_url],
power="executive",
gazette_item = {
"date": gazette_date,
"edition_number": edition_number,
"is_extra_edition": is_extra_edition,
}

yield scrapy.Request(
intermediary_page_url,
callback=self.intermediary_page,
cb_kwargs={"gazette_item": gazette_item},
)

for page_url in response.css("nav a.page-numbers::attr(href)").getall():
yield scrapy.Request(page_url)

def intermediary_page(self, response, gazette_item):
gazette_url = re.search(r'location="(.*)";', response.text).group(1)

yield Gazette(
**gazette_item,
file_urls=[gazette_url],
power="executive",
)
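
With this change, parse no longer takes the file URL straight from the listing page. It builds a partial gazette_item dict, forwards it to the intermediary_page callback through cb_kwargs, and that callback extracts the final file URL from a JavaScript redirect of the form location="...";, yielding the complete Gazette via **gazette_item. A minimal sketch of the extraction step, using a made-up intermediary page body (the real markup and PDF path are assumptions, not taken from the site):

import re

# Hypothetical intermediary page body; the actual markup and file path are assumed.
sample_body = '<script>location="https://uberlandia.mg.gov.br/wp-content/uploads/diario-oficial.pdf";</script>'

# Same pattern the spider uses: capture whatever URL the page redirects to.
match = re.search(r'location="(.*)";', sample_body)
if match:
    gazette_url = match.group(1)
    print(gazette_url)  # the URL that would go into file_urls

Note that the spider itself calls re.search(...).group(1) without a guard, so an intermediary page missing that redirect would raise an AttributeError and no item would be yielded for that request.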
