-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Co-authored-by: Gerald Rich <[email protected]>
- Loading branch information
1 parent
dca4286
commit 9c5c1db
Showing
3 changed files
with
185 additions
and
4 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
import re | ||
import time | ||
import urllib.parse | ||
from pathlib import Path | ||
|
||
from bs4 import BeautifulSoup | ||
|
||
from .. import utils | ||
from ..cache import Cache | ||
|
||
|
||
class Site:
    """Scrape file metadata and download files for the City of Napa Police Department.

    Attributes:
        name (str): The official name of the agency
    """

    name = "City of Napa Police Department"

    # Case numbers embedded in link titles, e.g. "NPD12345678", "NPD21-123456",
    # "NSD21-123456" or "10-1234". Compiled once rather than per link.
    _CASE_ID_PATTERN = re.compile(r"(NPD\d{8}|NPD\d{2}-\d{6}|NSD\d{2}-\d{6}|10-\d{4})")

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.base_url = "https://www.cityofnapa.org/1260/Penal-Code-Section-8327-b"
        # Ajax endpoint that lists the child folders of a Document Center folder.
        self.loading_url = "https://www.cityofnapa.org/admin/DocumentCenter/Home/_AjaxLoadingReact?type=0"
        self.loading_payload = {
            "value": "865",
            "expandTree": True,
            "loadSource": 7,
            "selectedFolder": 865,
        }
        # Ajax endpoint that lists the documents inside a leaf folder.
        self.folder_doc_req = "https://www.cityofnapa.org/Admin/DocumentCenter/Home/Document_AjaxBinding?renderMode=0&loadSource=7"
        self.folder_doc_req_payload = {
            "folderId": 865,
            "getDocuments": 1,
            "imageRepo": False,
            "renderMode": 0,
            "loadSource": 7,
            "requestingModuleID": 75,
            "searchString": "",
            "pageNumber": 1,
            "rowsPerPage": 10000,
            "sortColumn": "DisplayName",
            "sortOrder": 0,
        }
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_napa_pd

    def scrape_meta(self, throttle=0):
        """Gather metadata on downloadable assets linked from the agency index page.

        Downloads (and caches) the index page, then walks every case link and
        records one metadata payload per downloadable asset (YouTube video,
        Document Center document, or direct file link).

        Args:
            throttle (int): Seconds to wait between link requests. Defaults to 0.

        Returns:
            Path: Local path of the JSON file containing the gathered metadata.
        """
        # Cache the index page under a sensible name, e.g.
        # ca_napa_pd/Penal-Code-Section-8327-b.html
        base_name = f"{self.base_url.split('/')[-1]}.html"
        filename = f"{self.agency_slug}/{base_name}"
        self.cache.download(filename, self.base_url)
        metadata = []
        html = self.cache.read(filename)
        soup = BeautifulSoup(html, "html.parser")
        body = soup.find("div", class_="moduleContentNew")
        sections = body.find_all("div", class_="row outer wide")
        # The first section is page boilerplate; case links start in the second.
        for section in sections[1:]:
            li_items = section.find_all("li", class_="widgetItem")
            links = [li.find("a") for li in li_items if li.find("a")]
            for link in links:
                link_href = link.get("href", None)
                if not link_href:
                    continue
                title = link.get_text(strip=True)
                match = self._CASE_ID_PATTERN.search(title)
                # Fall back to the full title when no case number is present.
                case_id = match.group() if match else title
                if "#" in link_href:
                    # In-page anchor: no downloadable asset behind it.
                    continue
                # elif chain so a YouTube link is not also treated as a
                # direct link (separate ifs would append it twice).
                if "youtube" in link_href:
                    metadata.extend(
                        self._youtube_assets(link_href, case_id, title, filename)
                    )
                elif "DocumentCenter" in link_href:
                    # Only folder-index links are expanded into their documents.
                    if "Index" in link_href:
                        folder_id = link_href.split("/")[-1]
                        metadata.extend(
                            self._document_center_assets(folder_id, case_id, title)
                        )
                else:
                    metadata.append(
                        self._direct_asset(link_href, case_id, title, filename)
                    )
                time.sleep(throttle)
        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile

    def _youtube_assets(self, link_href, case_id, title, parent_page):
        """Build one metadata payload per video found behind a YouTube link."""
        payloads = []
        for yt_data in utils.get_youtube_url_with_metadata(link_href):
            payloads.append(
                {
                    "asset_url": yt_data["url"],
                    "case_id": case_id,
                    "name": yt_data["name"],
                    "title": title,
                    "parent_page": str(parent_page),
                }
            )
        return payloads

    def _document_center_assets(self, folder_id, case_id, title):
        """Build one metadata payload per document under a Document Center folder."""
        payloads = []
        # The folder id doubles as the cache file name for the folder listing.
        for document in self.process_document_center(folder_id, folder_id):
            payloads.append(
                {
                    "asset_url": f'https://www.cityofnapa.org{document.get("URL", "")}',
                    "case_id": case_id,
                    "name": document.get("DisplayName"),
                    "title": title,
                    "parent_page": str(document.get("parent_filename")),
                }
            )
        return payloads

    def _direct_asset(self, link_href, case_id, title, parent_page):
        """Build a metadata payload for a plain (non-YouTube, non-folder) file link."""
        if "cityofnapa.org" not in link_href:
            # Relative link: qualify it against the agency's domain.
            link_href = f"https://www.cityofnapa.org{link_href}"
        name = urllib.parse.unquote(link_href.split("/")[-1])
        return {
            "asset_url": link_href,
            "case_id": case_id,
            "name": name,
            "title": title,
            "parent_page": str(parent_page),
        }

    def process_document_center(self, folder_id, folder_name):
        """Recursively collect document records for a Document Center folder.

        A folder whose listing response has a non-empty "Data" array contains
        subfolders, which are recursed into; otherwise it is a leaf folder and
        its documents are fetched directly.

        Args:
            folder_id: CivicPlus folder id to expand.
            folder_name: Name used for the cached JSON listing file.

        Returns:
            list: Document dicts, each annotated with a "parent_filename" key
            naming the cached JSON file it came from.
        """
        documents_list = []
        # NOTE(review): the payload dicts are shared instance state mutated on
        # each call — fine for sequential scraping, not safe for concurrent use.
        self.loading_payload["value"] = folder_id
        self.loading_payload["selectedFolder"] = int(folder_id)
        filename = f"{self.agency_slug}/{folder_name}.json"
        output_json = self.cache_dir.joinpath(filename)
        with utils.post_url(self.loading_url, json=self.loading_payload) as r:
            self.cache.write_json(output_json, r.json())
        folder_json = self.cache.read_json(output_json)
        folder_list = folder_json.get("Data", [])
        if folder_list:
            # Subfolders present: gather documents from each child folder.
            for folder in folder_list:
                documents_list.extend(
                    self.process_document_center(folder.get("Value"), folder.get("Text"))
                )
        else:
            # Leaf folder: fetch its document listing (overwrites the cached
            # folder listing under the same name, as before).
            self.folder_doc_req_payload["folderId"] = folder_id
            with utils.post_url(
                self.folder_doc_req, json=self.folder_doc_req_payload
            ) as r:
                self.cache.write_json(output_json, r.json())
            folder_json = self.cache.read_json(output_json)
            documents_list = folder_json.get("Documents", [])
            for document in documents_list:
                document["parent_filename"] = filename
        return documents_list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,35 @@ | ||
-i https://pypi.org/simple | ||
beautifulsoup4==4.12.3; python_full_version >= '3.6.0' | ||
brotli==1.1.0; implementation_name == 'cpython' | ||
bs4==0.0.2 | ||
certifi==2024.7.4; python_version >= '3.6' | ||
cffi==1.16.0; python_version >= '3.8' | ||
charset-normalizer==3.3.2; python_full_version >= '3.7.0' | ||
click==8.1.7; python_version >= '3.7' | ||
colorama==0.4.6; platform_system == 'Windows' | ||
cryptography==43.0.1; python_version >= '3.7' | ||
decorator==5.1.1; python_version >= '3.5' | ||
html5lib==1.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' | ||
idna==3.7; python_version >= '3.5' | ||
jellyfish==1.1.0; python_version >= '3.7' | ||
mutagen==1.47.0; python_version >= '3.7' | ||
pdfminer.six==20231228; python_version >= '3.6' | ||
pdfplumber==0.11.2; python_version >= '3.8' | ||
pillow==10.4.0; python_version >= '3.8' | ||
py==1.11.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' | ||
pycparser==2.22; python_version >= '3.8' | ||
pycryptodomex==3.21.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' | ||
pypdfium2==4.30.0; python_version >= '3.6' | ||
python-dotenv==1.0.1; python_version >= '3.8' | ||
pytube==15.0.0; python_version >= '3.7' | ||
requests==2.32.3; python_version >= '3.8' | ||
retry==0.9.2 | ||
six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2' | ||
six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' | ||
soupsieve==2.5; python_version >= '3.8' | ||
tenacity==9.0.0; python_version >= '3.8' | ||
typing-extensions==4.12.2; python_version >= '3.8' | ||
urllib3==1.26.18; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' | ||
us==3.2.0; python_version >= '3.8' | ||
webencodings==0.5.1 | ||
websockets==13.1; python_version >= '3.8' | ||
yt-dlp==2024.10.22; python_version >= '3.8' |