Skip to content

Commit

Permalink
Habilita Zyte Smart Proxy em Florianópolis-SC (#1039)
Browse files Browse the repository at this point in the history
- Spider funciona localmente, mas não funciona na Scrapy Cloud.
- Ajuste de URL inicial para usar HTTPS ao invés de HTTP
- Substituir métodos antigos (extract() e extract_first()) por get() e
getall(), considerados o padrão do Scrapy
  • Loading branch information
Giulio Carvalho authored Nov 21, 2023
2 parents a579344 + 55093f1 commit d35f6a4
Showing 1 changed file with 7 additions and 6 deletions.
13 changes: 7 additions & 6 deletions data_collection/gazette/spiders/sc/sc_florianopolis.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,6 @@
class ScFlorianopolisSpider(BaseGazetteSpider):
name = "sc_florianopolis"
TERRITORY_ID = "4205407"

start_date = date(2009, 6, 1)

def start_requests(self):
Expand All @@ -25,7 +24,7 @@ def start_requests(self):
for year, month in periods_of_interest:
data = dict(ano=str(year), mes=str(month), passo="1", enviar="")
yield FormRequest(
"http://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial",
"https://www.pmf.sc.gov.br/governo/index.php?pagina=govdiariooficial",
formdata=data,
)

Expand All @@ -42,22 +41,24 @@ def parse(self, response):
yield Gazette(
date=gazette_date,
edition_number=gazette_edition_number,
file_urls=(url,),
file_urls=[
url,
],
is_extra_edition=self.is_extra(link),
power="executive_legislative",
)

@staticmethod
def get_pdf_url(response, link):
relative_url = link.css("::attr(href)").extract_first()
relative_url = link.css("::attr(href)").get()
if not relative_url.lower().endswith(".pdf"):
return None

return response.urljoin(relative_url)

@staticmethod
def get_date(link):
text = " ".join(link.css("::text").extract())
text = " ".join(link.css("::text").getall())
pattern = r"\d{1,2}\s+de\s+\w+\s+de\s+\d{4}"
match = re.search(pattern, text)
if not match:
Expand All @@ -67,5 +68,5 @@ def get_date(link):

@staticmethod
def is_extra(link):
text = " ".join(link.css("::text").extract())
text = " ".join(link.css("::text").getall())
return "extra" in text.lower()

0 comments on commit d35f6a4

Please sign in to comment.