Skip to content

Commit

Permalink
simplify case_id
Browse files Browse the repository at this point in the history
  • Loading branch information
newsroomdev committed Sep 16, 2024
1 parent 473d4c7 commit 0078d1a
Showing 1 changed file with 2 additions and 3 deletions.
5 changes: 2 additions & 3 deletions clean/ca/san_diego_pd.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
+ import re
import time
import urllib.parse
from pathlib import Path
Expand Down Expand Up
@@ -93,9 +94,7 @@ def _get_asset_links(self) -> Path:
"parent_page": str(html_file),
"asset_url": link["href"].replace("\n", ""),
"name": link.text.strip().replace("\n", ""),
- "case_id": str(html_file)
- .split(f"{self.agency_slug}/")[-1]
- .rstrip(".html"),
+ "case_id": re.sub(r"_page=\d+$", "", html_file.stem),
}
metadata.append(payload)
# Store the metadata in a JSON file in the data directory
Expand Down

0 comments on commit 0078d1a

Please sign in to comment.