Pull request: added scrape meta for chula_vista_pd #94 (#95)
Changes from 1 commit
ca/chula_vista_pd.py

@@ -0,0 +1,111 @@
import time
import urllib.parse
from pathlib import Path

from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
from .config.chula_vista_pd import index_request_headers


class Site:
    """Scrape file metadata and download files for the City of Chula Vista Police Department.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Chula Vista Police Department"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.base_url = "https://www.chulavistaca.gov/departments/police-department/senate-bill-1421"
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_chula_vista_pd

    def scrape_meta(self, throttle=0):
        # Construct a local filename relative to the cache directory: agency
        # slug + page name (ca_chula_vista_pd/senate-bill-1421.html), download
        # the page (if not already cached), and save the index page to the
        # cache under that sensible name
        base_name = f"{self.base_url.split('/')[-1]}.html"
        filename = f"{self.agency_slug}/{base_name}"
        self.cache.download(filename, self.base_url, headers=index_request_headers)
        metadata = []
        html = self.cache.read(filename)
        soup = BeautifulSoup(html, "html.parser")
        # Find the content area that sits directly under the "Documents" heading
        content_areas = soup.find_all("div", class_="content_area clearfix")
        desired_element = None
        for content_area in content_areas:
            previous_h2 = content_area.find_previous("h2")
            if previous_h2 and previous_h2.text == "Documents":
                desired_element = content_area
                break
Review comment on lines +62 to +68: "I like this methodology a lot, except we should probably log an error if there is no desired_element."

Reply: "Good idea."
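A minimal sketch of the suggested guard, assuming the standard library logging module (the repo's actual logging setup may differ):

import logging

logger = logging.getLogger(__name__)

if desired_element is None:
    # Surface a clear error when the page layout changes and the
    # "Documents" content area can no longer be found
    logger.error("No 'Documents' content area found at %s", self.base_url)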
        if desired_element:
            sections = desired_element.find_all("div", class_="accordion-item")
            for section in sections:
                title = section.find("div", class_="title").get_text(strip=True)
                links = section.find_all("a")
                for link in links:
                    link_href = link.get("href", None)
                    # Replace non-breaking spaces so case IDs are plain text
                    case_id = link.get_text().replace("\u00a0", " ")
                    if link_href:
                        link_href = f"https://www.chulavistaca.gov{link_href}"
                        # "Splash" links are interstitial redirect pages;
                        # unwrap them to recover the real asset URL
                        if "splash" in link_href:
                            link_href = self._convert_splash_link(link_href)
                        name = link_href.split("/")[-1]
                        payload = {
                            "asset_url": link_href,
                            "case_id": case_id,
                            "name": name,
                            "title": title,
                            "parent_page": str(filename),
                        }
                        metadata.append(payload)

        time.sleep(throttle)
        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile

    def _convert_splash_link(self, link):
        # Takes a splash link as input and returns the actual link after decoding
        parsed_url = urllib.parse.urlparse(link)
        parsed_params = urllib.parse.parse_qs(parsed_url.query)

        # Decode the target URL carried in the "splash" query parameter
        decoded_splash_link = urllib.parse.unquote(parsed_params["splash"][0])
        return decoded_splash_link
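For illustration, a usage sketch of the class above. The splash URL and local paths here are hypothetical, and the example assumes the package imports resolve as in the module:

from pathlib import Path

site = Site(data_dir=Path("data"), cache_dir=Path("cache"))

# Unwrap a hypothetical interstitial "splash" link; the real asset URL
# is carried URL-encoded in the splash query parameter
wrapped = "https://www.chulavistaca.gov/splash.aspx?splash=https%3A%2F%2Fexample.com%2Freport.pdf"
print(site._convert_splash_link(wrapped))  # https://example.com/report.pdf

# Scrape metadata for all documents, sleeping 2 seconds before writing output
outfile = site.scrape_meta(throttle=2)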
ca/config/chula_vista_pd.py

@@ -0,0 +1,15 @@
# Browser-like request headers sent when fetching the index page
index_request_headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9,nl;q=0.8,ur;q=0.7,ru;q=0.6",
    "cache-control": "max-age=0",
    "priority": "u=0, i",
    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
}
Review comment: "There's only the one index page here -- I think this should include force = True to force a rescrape on each run."
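A sketch of the suggested change to the download call in scrape_meta, assuming Cache.download accepts a force keyword argument as the review comment implies:

# Always re-fetch the single index page so newly posted records are
# picked up on each run instead of reading a stale cached copy
self.cache.download(
    filename,
    self.base_url,
    headers=index_request_headers,
    force=True,
)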