added corona_pd #73 (#155)
Co-authored-by: Gerald Rich <[email protected]>
naumansharifwork and newsroomdev authored Oct 23, 2024
1 parent af63b04 commit ad68694
Showing 2 changed files with 125 additions and 0 deletions.
29 changes: 29 additions & 0 deletions clean/ca/config/corona_pd.py
@@ -0,0 +1,29 @@
index_request_headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9",
    "cache-control": "max-age=0",
    "if-modified-since": "Mon, 12 Aug 2024 09:15:55 GMT",
    "priority": "u=0, i",
    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
}

download_request_headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9",
    "priority": "u=0, i",
    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
}
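These headers mimic a real browser session (Chrome 127 on Windows, per the sec-ch-ua values), which helps the scraper avoid being blocked by the city site. As a minimal sketch of how such headers attach to a request — assuming the `requests` library and that the repository root is on the import path, neither of which this diff confirms:

import requests

from clean.ca.config.corona_pd import index_request_headers

# Hypothetical direct fetch; the scraper itself passes these headers to
# Cache.download() instead (see clean/ca/corona_pd.py below).
url = "https://www.coronaca.gov/government/departments-divisions/police-department/trust-and-transparency"
response = requests.get(url, headers=index_request_headers, timeout=30)
response.raise_for_status()
# Note: the if-modified-since header can elicit a 304 with an empty body
html = response.text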
96 changes: 96 additions & 0 deletions clean/ca/corona_pd.py
@@ -0,0 +1,96 @@
import re
import time
from pathlib import Path

from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
from .config.corona_pd import index_request_headers


class Site:
    """Scrape file metadata and download files for the Corona Police Department.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Corona Police Department"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.base_url = "https://www.coronaca.gov/government/departments-divisions/police-department/trust-and-transparency#Records%20and%20Community%20Briefing%20Videos"
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_corona_pd

    def scrape_meta(self, throttle=0):
        """Gather metadata on the files listed on the transparency page.

        Args:
            throttle (int): Number of seconds to wait between loop iterations. Defaults to 0.

        Returns:
            Path: Local path of the JSON file containing the file metadata
        """
        # Construct a local filename relative to the cache directory:
        # agency slug + page slug (ca_corona_pd/trust-and-transparency.html)
        base_name = f"{self.base_url.split('/')[-1].split('#')[0]}.html"
        filename = f"{self.agency_slug}/{base_name}"
        # Download the index page (if not already cached) under a sensible name
        self.cache.download(
            filename,
            self.base_url,
            headers=index_request_headers,
        )
        metadata = []
        html = self.cache.read(filename)
        soup = BeautifulSoup(html, "html.parser")
        # The records and community briefing videos live in an accordion widget
        body = soup.find("div", class_="accordion_widget mn-accordion")
        links = body.find_all("a")
        for link in links:
            # Skip the generic public-records-request link; everything else
            # points at a case asset (video, PDF, etc.)
            if "public-records-request" not in link["href"]:
                case_num = self._get_clean_case_num(link)
                name = link.string
                title_element = link.find_previous("div", class_="title")
                title = title_element.string
                asset_url = link["href"]
                # Prepend the domain to relative links; YouTube and absolute
                # coronaca.gov URLs are left alone
                if "youtu" not in asset_url and "coronaca.gov" not in asset_url:
                    asset_url = f"https://www.coronaca.gov{asset_url}"
                asset_url = asset_url.strip()
                payload = {
                    "asset_url": asset_url,
                    "case_id": case_num,
                    "name": name,
                    "title": title,
                    "parent_page": str(filename),
                }
                metadata.append(payload)
            time.sleep(throttle)
        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile

    def _get_clean_case_num(self, element):
        """Extract a case number (LI#, CR#, or PI format) from the text around a link."""
        # Prefer the full text of the enclosing paragraph or table cell, since
        # the case number often sits outside the <a> tag itself
        parent_tag = element.find_parent(["p", "td"])
        if parent_tag:
            complete_text = parent_tag.get_text(strip=True)
        else:
            complete_text = element.get_text(strip=True)

        case_number_pattern = r"(LI#\s?\d{2}-\d+|CR#\s?\d{2}-\d+|PI\s?\d{2}-\d{3})"
        matches = re.findall(case_number_pattern, complete_text)
        # Return the first match, stripped of surrounding whitespace
        for match in matches:
            return match.strip()
        return None
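The case-number pattern recognizes three ID formats (LI#, CR#, and PI, each with a two-digit year prefix and an optional space). An illustrative check of the pattern — the sample strings below are invented, not taken from the Corona PD site:

import re

case_number_pattern = r"(LI#\s?\d{2}-\d+|CR#\s?\d{2}-\d+|PI\s?\d{2}-\d{3})"

# Invented examples covering each alternative in the pattern
samples = [
    "Officer-Involved Shooting LI# 21-1234 Video",
    "Use of Force CR#19-567 Report",
    "Pursuit PI 22-003 Community Briefing",
]
for text in samples:
    matches = re.findall(case_number_pattern, text)
    print(matches[0] if matches else None)
# Prints: LI# 21-1234, CR#19-567, PI 22-003

End to end, a hypothetical invocation of the new scraper — the `Site` constructor and `scrape_meta` signature come from the diff above, but the run itself and its output are illustrative:

import json

from clean.ca.corona_pd import Site

site = Site()  # uses the default data and cache directories
outfile = site.scrape_meta(throttle=2)  # pause 2 seconds between loop iterations
metadata = json.loads(outfile.read_text())
print(f"Wrote {len(metadata)} asset records to {outfile}")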
