Skip to content

Commit

Permalink
fix: sdpd case_id pagination
Browse files Browse the repository at this point in the history
  • Loading branch information
newsroomdev committed Sep 10, 2024
1 parent b139e7c commit 4834b42
Showing 1 changed file with 8 additions and 3 deletions.
11 changes: 8 additions & 3 deletions clean/ca/san_diego_pd.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import re
import time
import urllib.parse
from pathlib import Path
Expand Down Expand Up @@ -88,14 +89,18 @@ def _get_asset_links(self) -> Path:
# Save links to files, videos, etc. with relevant metadata
# for downstream processing
for link in links:
# Strip the trailing "_page=<N>.html" pagination suffix from the html_file name to get the case_id
case_id = re.sub(
r"_page=\d+\.html$",
"",
str(html_file).split(f"{self.agency_slug}/")[-1],
)
payload: MetadataDict = {
"title": title,
"parent_page": str(html_file),
"asset_url": link["href"].replace("\n", ""),
"name": link.text.strip().replace("\n", ""),
"case_id": str(html_file)
.split(f"{self.agency_slug}/")[-1]
.rstrip(".html"),
"case_id": case_id,
}
metadata.append(payload)
# Store the metadata in a JSON file in the data directory
Expand Down

0 comments on commit 4834b42

Please sign in to comment.