From 4834b4236fd47da5ca3ddfd6ca7b35fa602fff84 Mon Sep 17 00:00:00 2001 From: Gerald Rich Date: Tue, 10 Sep 2024 11:34:05 -0700 Subject: [PATCH] fix: sdpd case_id pagination --- clean/ca/san_diego_pd.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/clean/ca/san_diego_pd.py b/clean/ca/san_diego_pd.py index e28968d6..1efa680c 100644 --- a/clean/ca/san_diego_pd.py +++ b/clean/ca/san_diego_pd.py @@ -1,3 +1,4 @@ +import re import time import urllib.parse from pathlib import Path @@ -88,14 +89,18 @@ def _get_asset_links(self) -> Path: # Save links to files, videos, etc with relevant metadata # for downstream processing for link in links: + # Remove pagination part from html_file name + case_id = re.sub( + r"_page=\d+\.html$", + "", + str(html_file).split(f"{self.agency_slug}/")[-1], + ) payload: MetadataDict = { "title": title, "parent_page": str(html_file), "asset_url": link["href"].replace("\n", ""), "name": link.text.strip().replace("\n", ""), - "case_id": str(html_file) - .split(f"{self.agency_slug}/")[-1] - .rstrip(".html"), + "case_id": case_id, } metadata.append(payload) # Store the metadata in a JSON file in the data directory