Skip to content

Commit

Permalink
Adiciona novo spider base dioenet (#1259)
Browse files Browse the repository at this point in the history
  • Loading branch information
trevineju authored Sep 5, 2024
2 parents c5ebe3b + 9c3e9dc commit 7ef3f3a
Show file tree
Hide file tree
Showing 5 changed files with 135 additions and 0 deletions.
91 changes: 91 additions & 0 deletions data_collection/gazette/spiders/base/dioenet.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
import re
from collections import deque
from datetime import datetime
from itertools import islice

from dateutil.rrule import WEEKLY, rrule
from scrapy.http import FormRequest, Request

from gazette.items import Gazette
from gazette.spiders.base import BaseGazetteSpider


class DioenetGazetteSpider(BaseGazetteSpider):
    """
    Base spider for all cities listed on https://plenussistemas.dioenet.com.br

    Concrete spiders are expected to provide ``BASE_URL`` (the city listing
    page), ``start_date`` and ``power``; ``end_date`` is read from the
    instance as well. The listing is queried in date ranges, each results
    page is walked through its pagination, and every gazette detail page is
    visited to extract the real file URL.
    """

    allowed_domains = ["plenussistemas.dioenet.com.br"]

    def start_requests(self):
        """Yield one listing request per consecutive pair of dates.

        The dates are the weekly steps between ``start_date`` and
        ``end_date``, with ``end_date`` appended so the final partial week
        is covered too.
        """
        dates_of_interest = list(
            rrule(freq=WEEKLY, dtstart=self.start_date, until=self.end_date)
        )

        # NOTE(review): rrule yields datetime objects; if end_date is a plain
        # date this membership test never matches and end_date is always
        # appended -- confirm whether that is intended.
        if self.end_date not in dates_of_interest:
            dates_of_interest.append(self.end_date)

        for start, end in self._sliding_window(dates_of_interest, 2):
            params = {
                "d": f"{start.strftime('%d/%m/%Y')} a {end.strftime('%d/%m/%Y')}",
                "pagina": "1",
            }

            yield FormRequest(
                url=self.BASE_URL,
                method="GET",
                formdata=params,
                cb_kwargs={"params": params},
            )

    def parse(self, response, params):
        """Parse one page of the gazette listing.

        Yields a request to each gazette's detail page (carrying the metadata
        scraped from the listing) and, when a "next" pagination link exists,
        a request for the following page of the same date range.

        ``params`` is the query dict used to fetch ``response``; it is not
        modified.
        """
        for gazette in response.css("ul.lista-diarios li"):
            # can return ['Edição nº 841'] or ['Edição nº 842', 'Extra']
            raw_edition = gazette.css(".col-one span::text").getall()
            gazette_number = re.findall(r"\d+", raw_edition[0])[0]
            gazette_extra = "Extra" in raw_edition

            elem = gazette.css(".col-two a.btn")
            gazette_url = elem.attrib["href"]
            # The publication date is taken from the button's title attribute.
            raw_date = re.findall(r"(\d{2}/\d{2}/\d{4})", elem.attrib["title"])[0]
            gazette_date = datetime.strptime(raw_date, "%d/%m/%Y").date()

            gazette_item = {
                "date": gazette_date,
                "edition_number": gazette_number,
                "is_extra_edition": gazette_extra,
                "power": self.power,
            }

            yield Request(
                gazette_url,
                callback=self.get_gazette_url,
                cb_kwargs={"gazette_item": gazette_item},
            )

        if response.css("ul.pagination li.next.page"):
            # Build a fresh dict so the params attached to the request that
            # produced this response are left untouched.
            next_params = {**params, "pagina": str(int(params["pagina"]) + 1)}

            yield FormRequest(
                url=self.BASE_URL,
                method="GET",
                formdata=next_params,
                cb_kwargs={"params": next_params},
            )

    def get_gazette_url(self, response, gazette_item):
        """Extract the gazette file URL from the detail page and yield the item.

        The file URL is whatever follows ``file=`` in the ``src`` of the first
        iframe. If the iframe or that parameter is missing, an error is
        logged and nothing is yielded.
        """
        iframe_src = response.xpath("//iframe/@src").get()
        match = re.search(r"file=(.*)", iframe_src) if iframe_src else None
        if match is None:
            self.logger.error(
                "Could not extract gazette file URL from %s", response.url
            )
            return

        yield Gazette(
            **gazette_item,
            file_urls=[match.group(1)],
        )

    def _sliding_window(self, iterable, n):
        """Yield overlapping tuples of ``n`` consecutive items of ``iterable``.

        Nothing is yielded when ``iterable`` has fewer than ``n`` items.
        """
        it = iter(iterable)
        # Pre-fill with n - 1 items; maxlen makes each append drop the oldest.
        window = deque(islice(it, n - 1), maxlen=n)
        for x in it:
            window.append(x)
            yield tuple(window)
11 changes: 11 additions & 0 deletions data_collection/gazette/spiders/pr/pr_marilandia_do_sul.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from datetime import date

from gazette.spiders.base.dioenet import DioenetGazetteSpider


class PrMarilandiaDoSulSpider(DioenetGazetteSpider):
    """Dioenet-based spider configured for the "marilandia-do-sul" listing."""

    name = "pr_marilandia_do_sul"
    TERRITORY_ID = "4114906"

    # Listing page queried by the base spider.
    BASE_URL = "https://plenussistemas.dioenet.com.br/list/marilandia-do-sul"

    # First date used when building the weekly search ranges.
    start_date = date(2019, 12, 17)

    power = "executive_legislative"
11 changes: 11 additions & 0 deletions data_collection/gazette/spiders/rj/rj_nova_friburgo.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from datetime import date

from gazette.spiders.base.dioenet import DioenetGazetteSpider


class RjNovaFriburgoSpider(DioenetGazetteSpider):
    """Dioenet-based spider configured for the "nova-friburgo" listing."""

    name = "rj_nova_friburgo"
    TERRITORY_ID = "3303401"

    # Listing page queried by the base spider.
    BASE_URL = "https://plenussistemas.dioenet.com.br/list/nova-friburgo"

    # First date used when building the weekly search ranges.
    start_date = date(2019, 10, 17)

    power = "executive_legislative"
11 changes: 11 additions & 0 deletions data_collection/gazette/spiders/rj/rj_sumidouro.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from datetime import date

from gazette.spiders.base.dioenet import DioenetGazetteSpider


class RjSumidouroSpider(DioenetGazetteSpider):
    """Dioenet-based spider configured for the "sumidouro" listing."""

    name = "rj_sumidouro"
    TERRITORY_ID = "3305703"

    # Listing page queried by the base spider.
    BASE_URL = "https://plenussistemas.dioenet.com.br/list/sumidouro"

    # First date used when building the weekly search ranges.
    start_date = date(2021, 7, 26)

    power = "executive_legislative"
11 changes: 11 additions & 0 deletions data_collection/gazette/spiders/sp/sp_taubate.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
from datetime import date

from gazette.spiders.base.dioenet import DioenetGazetteSpider


class SpTaubateSpider(DioenetGazetteSpider):
    """Dioenet-based spider configured for the "taubate" listing."""

    name = "sp_taubate"
    TERRITORY_ID = "3554102"

    # Listing page queried by the base spider.
    BASE_URL = "https://plenussistemas.dioenet.com.br/list/taubate"

    # First date used when building the weekly search ranges.
    start_date = date(2022, 12, 8)

    power = "executive_legislative"

0 comments on commit 7ef3f3a

Please sign in to comment.