Skip to content

Commit

Permalink
Upgrade libraries and fix validation fields (#945)
Browse files (browse the repository at this point in the history)
  • Loading branch information
rennerocha authored Oct 4, 2023
2 parents 32938aa + 68f8b1a commit bc30466
Show file tree
Hide file tree
Showing 14 changed files with 1,905 additions and 2,149 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/deploy.yml
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.9
python-version: "3.10"
- name: Install shub
run: |
python -m pip install --upgrade pip
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/monthly_crawl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.9
python-version: "3.10"
- name: Prepare environment
run: |
python -m pip install --upgrade pip
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/periodic_crawl.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.9
python-version: "3.10"
- name: Prepare environment
run: |
python -m pip install --upgrade pip
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/schedule_spider.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.9
python-version: "3.10"
- name: Prepare environment
run: |
python -m pip install --upgrade pip
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/schedule_spider_by_date.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -26,7 +26,7 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.9
python-version: "3.10"
- name: Prepare environment
run: |
python -m pip install --upgrade pip
Expand Down
2 changes: 1 addition & 1 deletion .github/workflows/schedule_spider_date_range.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ jobs:
- uses: actions/checkout@v2
- uses: actions/setup-python@v2
with:
python-version: 3.9
python-version: "3.10"
- name: Prepare environment
run: |
python -m pip install --upgrade pip
Expand Down
5 changes: 5 additions & 0 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,8 @@ shell:

run_spider_since:
cd $(SRC_DIRS) && scrapy crawl -a start_date=$(START_DATE) $(SPIDER)

# Recompile requirements.in and requirements-dev.in with pip-compile, run from
# inside data_collection/.
# The steps are chained with && (matching run_spider_since above) so that a
# failed cd or a failed first pip-compile aborts the target instead of being
# masked by the exit status of a later command.
compile:
	cd data_collection && \
	pip-compile --upgrade --no-annotate --allow-unsafe --generate-hashes requirements.in && \
	pip-compile --upgrade --no-annotate --allow-unsafe --generate-hashes requirements-dev.in
1 change: 1 addition & 0 deletions data_collection/gazette/items.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,3 +15,4 @@ class Gazette(scrapy.Item):
scraped_at = scrapy.Field()
file_urls = scrapy.Field()
files = scrapy.Field()
_validation = scrapy.Field()
30 changes: 17 additions & 13 deletions data_collection/gazette/pipelines.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
from scrapy import spiderloader
from scrapy.exceptions import DropItem
from scrapy.http import Request
from scrapy.http.request import NO_CALLBACK
from scrapy.pipelines.files import FilesPipeline
from scrapy.settings import Settings
from scrapy.utils import project
Expand All @@ -23,17 +24,13 @@ def process_item(self, item, spider):


class DefaultValuesPipeline:
"""Add defaults values field, if not already set in the item"""
def process_item(self, item, spider):
item["territory_id"] = getattr(spider, "TERRITORY_ID")

default_field_values = {
"territory_id": lambda spider: getattr(spider, "TERRITORY_ID"),
"scraped_at": lambda spider: dt.datetime.utcnow(),
}
# Date manipulation to allow jsonschema to validate correctly
item["date"] = str(item["date"])
item["scraped_at"] = dt.datetime.utcnow().isoformat("T") + "Z"

def process_item(self, item, spider):
for field in self.default_field_values:
if field not in item:
item[field] = self.default_field_values.get(field)(spider)
return item


Expand Down Expand Up @@ -83,6 +80,12 @@ def process_item(self, item, spider):
"territory_id",
]
gazette_item = {field: item.get(field) for field in fields}
gazette_item["date"] = dt.datetime.strptime(
gazette_item["date"], "%Y-%m-%d"
).date()
gazette_item["scraped_at"] = dt.datetime.strptime(
gazette_item["scraped_at"], "%Y-%m-%dT%H:%M:%S.%fZ"
)

for file_info in item.get("files", []):
already_downloaded = file_info["status"] == "uptodate"
Expand Down Expand Up @@ -138,7 +141,10 @@ def get_media_requests(self, item, info):
"""Makes requests from urls and/or lets through ready requests."""
urls = ItemAdapter(item).get(self.files_urls_field, [])
download_file_headers = getattr(info.spider, "download_file_headers", {})
yield from (Request(u, headers=download_file_headers) for u in urls)
yield from (
Request(u, callback=NO_CALLBACK, headers=download_file_headers)
for u in urls
)

requests = ItemAdapter(item).get(self.files_requests_field, [])
yield from requests
Expand All @@ -161,10 +167,8 @@ def file_path(self, request, response=None, info=None, item=None):
Path to save the files, modified to organize the gazettes in directories.
The files will be under <territory_id>/<gazette date>/.
"""

filepath = super().file_path(request, response=response, info=info, item=item)
# The default path from the scrapy class begins with "full/". In this
# class we replace that with the territory_id and gazette date.
datestr = item["date"].strftime("%Y-%m-%d")
filename = Path(filepath).name
return str(Path(item["territory_id"], datestr, filename))
return str(Path(item["territory_id"], item["date"], filename))
4 changes: 3 additions & 1 deletion data_collection/gazette/resources/gazette_schema.json
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,9 @@
]
},
"scraped_at":{
"type": "string"
"description": "When the gazzete was scraped",
"type": "string",
"format": "date-time"
},
"files": {
"type": "array",
Expand Down
Loading

0 comments on commit bc30466

Please sign in to comment.