Skip to content

Commit

Permalink
Rework URL handling; clean up a little more text
Browse files Browse the repository at this point in the history
  • Loading branch information
stucka committed Sep 3, 2024
1 parent 65d26b5 commit 839d460
Showing 1 changed file with 24 additions and 35 deletions.
59 changes: 24 additions & 35 deletions clean/ca/chula_vista_pd.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,35 +73,34 @@ def scrape_meta(self, throttle=0):
for link in links:
link_href = link.get("href", None)
case_id = link.find_previous("p").text
case_id = case_id.replace("\u00a0", " ")
case_id = case_id.replace("\u00a0", " ").replace("\u2014", "--")
if link_href:
title = link.string
title = title.replace("\u00a0", " ")
if "splash" not in link_href:
link_href = f"https://www.chulavistaca.gov{link_href}"
name = link_href.split("/")[-1]
payload = {
"asset_url": link_href,
"case_id": case_id,
"name": name,
"title": title,
"parent_page": str(filename),
"details": {"case_type": case_type},
}
metadata.append(payload)
title = title.replace("\u00a0", " ").replace("\u2014", "--")
redirect_start = "/?splash="
redirect_end = "&____isexternal=true"

# Clean up links. Check to see if it's a redirect:
if redirect_start in link_href:
link_href = link_href.replace(redirect_start, "").replace(redirect_end, "")
link_href = urllib.parse.unquote(link_href)
name = title
else:
link_href = f"https://www.chulavistaca.gov{link_href}"
link_href = self._convert_splash_link(link_href)
name = link_href.split("/")[-1]
payload = {
"asset_url": link_href,
"case_id": case_id,
"name": name,
"title": title,
"parent_page": str(filename),
"details": {"case_type": case_type},
}
metadata.append(payload)

# See if it's a relative link
if urllib.parse.urlparse(link_href).netloc == "":
link_href = f"https://www.chulavistaca.gov{link_href}"

payload = {
"asset_url": link_href,
"case_id": case_id,
"name": name,
"title": title,
"parent_page": str(filename),
"details": {"case_type": case_type},
}
metadata.append(payload)

time.sleep(throttle)
else:
Expand All @@ -110,13 +109,3 @@ def scrape_meta(self, throttle=0):
outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
self.cache.write_json(outfile, metadata)
return outfile

def _convert_splash_link(self, link):
    """Return the real destination encoded in a "splash" redirect link.

    The link's query string is parsed and the first value of its
    ``splash`` parameter is returned, percent-decoded exactly once.

    Args:
        link: URL whose query string may carry a ``splash`` parameter
            holding the percent-encoded target URL.

    Returns:
        The decoded target URL, or ``link`` unchanged when it has no
        non-empty ``splash`` query parameter.
    """
    query = urllib.parse.urlparse(link).query
    # parse_qs already percent-decodes the values it returns, so no extra
    # unquote() is applied: a second pass would corrupt escapes that belong
    # to the target URL itself (e.g. "%2520" -> "%20" -> " ").
    splash_values = urllib.parse.parse_qs(query).get("splash")
    if not splash_values:
        # Not a splash redirect; hand the link back instead of raising a
        # KeyError.
        return link
    return splash_values[0]

0 comments on commit 839d460

Please sign in to comment.