diff --git a/clean/ca/san_diego_pd.py b/clean/ca/san_diego_pd.py index e28968d6..94d413a1 100644 --- a/clean/ca/san_diego_pd.py +++ b/clean/ca/san_diego_pd.py @@ -1,3 +1,4 @@ +import re import time import urllib.parse from pathlib import Path @@ -93,9 +94,7 @@ def _get_asset_links(self) -> Path: "parent_page": str(html_file), "asset_url": link["href"].replace("\n", ""), "name": link.text.strip().replace("\n", ""), - "case_id": str(html_file) - .split(f"{self.agency_slug}/")[-1] - .rstrip(".html"), + "case_id": re.sub(r"_page=\d+$", "", html_file.stem), } metadata.append(payload) # Store the metadata in a JSON file in the data directory