From 6f3368fdafc93c2214b5429361c405401ea52cd0 Mon Sep 17 00:00:00 2001
From: daniel carvalho
Date: Wed, 3 Jul 2024 11:52:00 -0300
Subject: [PATCH 1/5] =?UTF-8?q?Create=20BarcoDigital=20spider=20Co-authore?=
 =?UTF-8?q?d-by:=20Adelly=20Lima=20?=
 =?UTF-8?q?=20Co-authored-by:=20L=C3=A9lis=20=20Co-authored-by:=20Heitor=20Carvalho=20=20Co-authored-by:=20Claudio=20Magalh=C3=A3e?=
 =?UTF-8?q?s=20=20Co-authored-by:=20Jos?=
 =?UTF-8?q?=C3=A9=20Carlos=20Menezes=20=20Co-authore?=
 =?UTF-8?q?d-by:=20Miho=20Yamawaki=20?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../gazette/spiders/base/barcodigital.py | 40 +++++++++++++++++++
 1 file changed, 40 insertions(+)
 create mode 100644 data_collection/gazette/spiders/base/barcodigital.py

diff --git a/data_collection/gazette/spiders/base/barcodigital.py b/data_collection/gazette/spiders/base/barcodigital.py
new file mode 100644
index 000000000..d4cf05492
--- /dev/null
+++ b/data_collection/gazette/spiders/base/barcodigital.py
@@ -0,0 +1,40 @@
+from datetime import date, datetime
+
+from dateutil.rrule import MONTHLY, rrule
+from scrapy import Request
+
+from gazette.items import Gazette
+from gazette.spiders.base import BaseGazetteSpider
+
+
+class BarcoDigitalSpider(BaseGazetteSpider):
+    EDITION_TYPE_NORMAL = 1
+    EDITION_TYPE_EXTRA = 2
+    EDITION_TYPE_SUPPLEMENT = 3
+
+    def start_requests(self):
+        initial_date = date(self.start_date.year, self.start_date.month, 1)
+        end_date = self.end_date
+
+        periods_of_interest = [
+            (date.year, date.month)
+            for date in rrule(freq=MONTHLY, dtstart=initial_date, until=end_date)
+        ]
+
+        for year, month in periods_of_interest:
+            url = (
+                f"{self.base_url}/api/publico/diario/calendario?mes={month}&ano={year}"
+            )
+            yield Request(url)
+
+    def parse(self, response):
+        for documents in response.json().values():
+            for document in documents:
+                yield Gazette(
+                    date=datetime.strptime(document.get("data"), "%Y-%m-%d").date(),
+                    edition_number=document.get("edicao"),
+                    is_extra_edition=document.get("tipo_edicao_id")
+                    != self.EDITION_TYPE_NORMAL,
+                    file_urls=[f"{self.base_url}/arquivo/{document.get('url')}"],
+                    power="executive",
+                )

From 90766e4ea6c33b46d6dba671f3abd4fcd23ef189 Mon Sep 17 00:00:00 2001
From: daniel carvalho
Date: Wed, 3 Jul 2024 11:54:21 -0300
Subject: [PATCH 2/5] =?UTF-8?q?Create=20Lagoa=20do=20Tocatins=20Spider=20?=
 =?UTF-8?q?=20=20=20=20Co-authored-by:=20Adelly=20Lima=20=20=20=20=20=20Co-authored-by:=20L=C3=A9lis?=
 =?UTF-8?q?=20=20=20=20=20=20Co-authored?=
 =?UTF-8?q?-by:=20Heitor=20Carvalho=20?=
 =?UTF-8?q?=20=20=20=20=20Co-authored-by:=20Claudio=20Magalh=C3=A3es=20=20=20=20=20=20Co-authored-by:=20J?=
 =?UTF-8?q?os=C3=A9=20Carlos=20Menezes=20=20=20=20?=
 =?UTF-8?q?=20=20Co-authored-by:=20Miho=20Yamawaki=20?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../gazette/spiders/to/to_lagoa_de_tocantins.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 data_collection/gazette/spiders/to/to_lagoa_de_tocantins.py

diff --git a/data_collection/gazette/spiders/to/to_lagoa_de_tocantins.py b/data_collection/gazette/spiders/to/to_lagoa_de_tocantins.py
new file mode 100644
index 000000000..2b3895da9
--- /dev/null
+++ b/data_collection/gazette/spiders/to/to_lagoa_de_tocantins.py
@@ -0,0 +1,12 @@
+from datetime import date
+
+from gazette.spiders.base.barcodigital import BarcoDigitalSpider
+
+
+class ToLagoaDeTocatinsSpider(BarcoDigitalSpider):
+    name = "to_lagoa_de_tocantins"
+    TERRITORY_ID = "1711951"
+    allowed_domains = ["api-lagoadotocantins.barcodigital.com.br"]
+    base_url = "https://api-lagoadotocantins.barcodigital.com.br"
+
+    start_date = date(year=2018, month=5, day=1)

From 70d5b1031015e51dfebcc4dc582cb0bb0dfb74e1 Mon Sep 17 00:00:00 2001
From: daniel carvalho
Date: Wed, 3 Jul 2024 13:08:19 -0300
Subject: [PATCH 3/5] =?UTF-8?q?Add=20Recursol=C3=A2ndia=20spider?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../gazette/spiders/to/to_recursolandia.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)
 create mode 100644 data_collection/gazette/spiders/to/to_recursolandia.py

diff --git a/data_collection/gazette/spiders/to/to_recursolandia.py b/data_collection/gazette/spiders/to/to_recursolandia.py
new file mode 100644
index 000000000..a5b2e8494
--- /dev/null
+++ b/data_collection/gazette/spiders/to/to_recursolandia.py
@@ -0,0 +1,12 @@
+from datetime import date
+
+from gazette.spiders.base.barcodigital import BarcoDigitalSpider
+
+
+class ToRecursolandiaSpider(BarcoDigitalSpider):
+    name = "to_recursolandia"
+    TERRITORY_ID = "1718501"
+    allowed_domains = ["api-recursolandia.barcodigital.com.br"]
+    base_url = "https://api-recursolandia.barcodigital.com.br"
+
+    start_date = date(year=2019, month=11, day=11)

From 8c275fbd913fc64b7dc03a3e83fa2d04714cb415 Mon Sep 17 00:00:00 2001
From: daniel carvalho
Date: Wed, 3 Jul 2024 13:54:20 -0300
Subject: [PATCH 4/5] Set interval limit for end date

---
 data_collection/gazette/spiders/base/barcodigital.py | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/data_collection/gazette/spiders/base/barcodigital.py b/data_collection/gazette/spiders/base/barcodigital.py
index d4cf05492..009497eee 100644
--- a/data_collection/gazette/spiders/base/barcodigital.py
+++ b/data_collection/gazette/spiders/base/barcodigital.py
@@ -30,8 +30,15 @@ def start_requests(self):
     def parse(self, response):
         for documents in response.json().values():
             for document in documents:
+                document_date = datetime.strptime(
+                    document.get("data"), "%Y-%m-%d"
+                ).date()
+
+                if document_date > self.end_date:
+                    continue
+
                 yield Gazette(
-                    date=datetime.strptime(document.get("data"), "%Y-%m-%d").date(),
+                    date=document_date,
                     edition_number=document.get("edicao"),
                     is_extra_edition=document.get("tipo_edicao_id")
                     != self.EDITION_TYPE_NORMAL,

From e25c8d38917acaada3a0448c874fda569afc547f Mon Sep 17 00:00:00 2001
From: daniel carvalho
Date: Wed, 7 Aug 2024 15:56:56 -0300
Subject: [PATCH 5/5] Add elif pra evitar o drop de itens no pipeline

---
 data_collection/gazette/spiders/base/barcodigital.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/data_collection/gazette/spiders/base/barcodigital.py b/data_collection/gazette/spiders/base/barcodigital.py
index 009497eee..c59e75c68 100644
--- a/data_collection/gazette/spiders/base/barcodigital.py
+++ b/data_collection/gazette/spiders/base/barcodigital.py
@@ -36,6 +36,9 @@ def parse(self, response):
 
                 if document_date > self.end_date:
                     continue
+                elif document_date < self.start_date:
+                    # Skip only this document; the month may still hold valid ones.
+                    continue
 
                 yield Gazette(
                     date=document_date,