
Revert "Fix date and datetime validation to allow upgrade of Spidermon"
trevineju authored Sep 22, 2023
1 parent 31822a0 commit 3a54d61
Showing 8 changed files with 1,994 additions and 1,627 deletions.
1 change: 0 additions & 1 deletion data_collection/gazette/items.py
@@ -15,4 +15,3 @@ class Gazette(scrapy.Item):
     scraped_at = scrapy.Field()
     file_urls = scrapy.Field()
     files = scrapy.Field()
-    _validation = scrapy.Field()
24 changes: 13 additions & 11 deletions data_collection/gazette/pipelines.py
@@ -5,7 +5,6 @@
 from scrapy import spiderloader
 from scrapy.exceptions import DropItem
 from scrapy.http import Request
-from scrapy.http.request import NO_CALLBACK
 from scrapy.pipelines.files import FilesPipeline
 from scrapy.settings import Settings
 from scrapy.utils import project
@@ -24,13 +23,17 @@ def process_item(self, item, spider):
 
 
 class DefaultValuesPipeline:
-    def process_item(self, item, spider):
-        item["territory_id"] = getattr(spider, "TERRITORY_ID")
+    """Add defaults values field, if not already set in the item"""
 
-        # Date manipulation to allow jsonschema to validate correctly
-        item["date"] = str(item["date"])
-        item["scraped_at"] = dt.datetime.utcnow().isoformat("T") + "Z"
+    default_field_values = {
+        "territory_id": lambda spider: getattr(spider, "TERRITORY_ID"),
+        "scraped_at": lambda spider: dt.datetime.utcnow(),
+    }
 
+    def process_item(self, item, spider):
+        for field in self.default_field_values:
+            if field not in item:
+                item[field] = self.default_field_values.get(field)(spider)
         return item


@@ -135,10 +138,7 @@ def get_media_requests(self, item, info):
         """Makes requests from urls and/or lets through ready requests."""
         urls = ItemAdapter(item).get(self.files_urls_field, [])
         download_file_headers = getattr(info.spider, "download_file_headers", {})
-        yield from (
-            Request(u, callback=NO_CALLBACK, headers=download_file_headers)
-            for u in urls
-        )
+        yield from (Request(u, headers=download_file_headers) for u in urls)
 
         requests = ItemAdapter(item).get(self.files_requests_field, [])
         yield from requests
@@ -161,8 +161,10 @@ def file_path(self, request, response=None, info=None, item=None):
         Path to save the files, modified to organize the gazettes in directories.
         The files will be under <territory_id>/<gazette date>/.
         """
-
         filepath = super().file_path(request, response=response, info=info, item=item)
+        # The default path from the scrapy class begins with "full/". In this
+        # class we replace that with the territory_id and gazette date.
+        datestr = item["date"].strftime("%Y-%m-%d")
         filename = Path(filepath).name
-        return str(Path(item["territory_id"], item["date"], filename))
+        return str(Path(item["territory_id"], datestr, filename))
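
With this revert, DefaultValuesPipeline no longer casts the item's date to a string and scraped_at stays a datetime, so file_path goes back to formatting the date itself before building the storage path. A minimal sketch of the restored path construction, with illustrative values that are not part of the commit:

    # Illustrative sketch, not part of the commit: "date" stays a datetime.date,
    # so the restored file_path formats it before joining the path segments.
    import datetime as dt
    from pathlib import Path

    item = {"territory_id": "3550308", "date": dt.date(2023, 9, 22)}  # illustrative values
    datestr = item["date"].strftime("%Y-%m-%d")
    print(Path(item["territory_id"], datestr, "gazette.pdf"))
    # -> 3550308/2023-09-22/gazette.pdf
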
4 changes: 1 addition & 3 deletions data_collection/gazette/resources/gazette_schema.json
@@ -35,9 +35,7 @@
         ]
     },
     "scraped_at":{
-        "description": "When the gazzete was scraped",
-        "type": "string",
-        "format": "date-time"
+        "type": "string"
     },
     "files": {
         "type": "array",
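
The scraped_at property drops the description and the date-time format introduced by the change being reverted and goes back to a bare string type. A minimal sketch of what the restored schema does and does not enforce; it is not part of the commit and only assumes the jsonschema package pulled in by the project's validation stack:

    # Illustrative sketch, not part of the commit: under the restored schema,
    # scraped_at only has to be a JSON string, so a raw datetime fails validation
    # and must be serialised to a string somewhere before the check runs.
    import datetime as dt
    import jsonschema

    schema = {"type": "object", "properties": {"scraped_at": {"type": "string"}}}
    jsonschema.validate({"scraped_at": dt.datetime.utcnow().isoformat("T") + "Z"}, schema)  # passes
    try:
        jsonschema.validate({"scraped_at": dt.datetime.utcnow()}, schema)
    except jsonschema.ValidationError as err:
        print(err.message)  # ... is not of type 'string'
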
2 changes: 1 addition & 1 deletion data_collection/gazette/settings.py
@@ -28,7 +28,7 @@
 }
 SPIDERMON_ENABLED = config("SPIDERMON_ENABLED", default=True, cast=bool)
 SPIDERMON_VALIDATION_SCHEMAS = [
-    str(importlib.resources.files("gazette") / "resources/gazette_schema.json")
+    importlib.resources.files("gazette") / "resources/gazette_schema.json"
 ]
 
 SPIDERMON_VALIDATION_ADD_ERRORS_TO_ITEMS = True
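
The revert passes the object returned by importlib.resources.files() straight to SPIDERMON_VALIDATION_SCHEMAS, while the change being reverted wrapped it in str() to hand Spidermon a plain path string. A small sketch of the difference, not part of the commit and assuming the gazette package is importable as a regular on-disk package:

    # Illustrative sketch, not part of the commit: importlib.resources.files()
    # returns a path-like object; str() turns it into a plain filesystem path.
    import importlib.resources

    schema_path = importlib.resources.files("gazette") / "resources/gazette_schema.json"
    print(repr(schema_path))  # a Path-like object for a regular on-disk package
    print(str(schema_path))   # absolute path ending in resources/gazette_schema.json
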
1,864 changes: 1,023 additions & 841 deletions data_collection/requirements-dev.txt

Large diffs are not rendered by default.

4 changes: 2 additions & 2 deletions data_collection/requirements.in
@@ -6,10 +6,10 @@ dateparser
 itemadapter
 jinja2
 psycopg2-binary
+pyyaml==5.3.1
 python-dateutil
 python-decouple
 scrapy
 scrapy-zyte-smartproxy
-schematics
 SQLAlchemy
-spidermon
+spidermon[monitoring]
1,720 changes: 953 additions & 767 deletions data_collection/requirements.txt

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion data_collection/scrapinghub.yml
@@ -1,4 +1,4 @@
 project: 631875
-stack: scrapy:2.11
+stack: scrapy:2.7
 requirements:
   file: requirements.txt
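
Rolling the Scrapy Cloud stack back to scrapy:2.7 lines up with dropping the NO_CALLBACK import in pipelines.py above: scrapy.http.request.NO_CALLBACK only exists from Scrapy 2.8 onwards, so the download requests go back to plain Request objects. A quick illustrative check, not part of the commit:

    # Illustrative sketch, not part of the commit: NO_CALLBACK is only importable
    # on Scrapy >= 2.8, so code targeting the scrapy:2.7 stack cannot use it.
    import scrapy

    print(scrapy.__version__)
    try:
        from scrapy.http.request import NO_CALLBACK  # noqa: F401
        print("NO_CALLBACK available (Scrapy >= 2.8)")
    except ImportError:
        print("NO_CALLBACK not available on this Scrapy version")
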
