-
Notifications
You must be signed in to change notification settings - Fork 10
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Co-authored-by: Gerald Rich <[email protected]>
- Loading branch information
1 parent
dca4286
commit 9c5c1db
Showing
3 changed files
with
185 additions
and
4 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,175 @@ | ||
import re | ||
import time | ||
import urllib.parse | ||
from pathlib import Path | ||
|
||
from bs4 import BeautifulSoup | ||
|
||
from .. import utils | ||
from ..cache import Cache | ||
|
||
|
||
class Site:
    """Scrape file metadata and download files for the City of Napa Police Department.

    Attributes:
        name (str): The official name of the agency
    """

    name = "City of Napa Police Department"

    # Case numbers embedded in link titles, e.g. "NPD12345678", "NPD21-123456",
    # "NSD21-123456" or "10-1234". Compiled once rather than per link.
    _CASE_ID_PATTERN = re.compile(r"(NPD\d{8}|NPD\d{2}-\d{6}|NSD\d{2}-\d{6}|10-\d{4})")

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.base_url = "https://www.cityofnapa.org/1260/Penal-Code-Section-8327-b"
        # Ajax endpoint that lists the child folders of a Document Center folder.
        self.loading_url = "https://www.cityofnapa.org/admin/DocumentCenter/Home/_AjaxLoadingReact?type=0"
        self.loading_payload = {
            "value": "865",
            "expandTree": True,
            "loadSource": 7,
            "selectedFolder": 865,
        }
        # Ajax endpoint that lists the documents inside a leaf folder.
        self.folder_doc_req = "https://www.cityofnapa.org/Admin/DocumentCenter/Home/Document_AjaxBinding?renderMode=0&loadSource=7"
        self.folder_doc_req_payload = {
            "folderId": 865,
            "getDocuments": 1,
            "imageRepo": False,
            "renderMode": 0,
            "loadSource": 7,
            "requestingModuleID": 75,
            "searchString": "",
            "pageNumber": 1,
            "rowsPerPage": 10000,
            "sortColumn": "DisplayName",
            "sortOrder": 0,
        }
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_napa_pd

    def scrape_meta(self, throttle=0):
        """Gather metadata on downloadable assets linked from the agency index page.

        Downloads (and caches) the index page, then walks every case link and
        records one metadata payload per downloadable asset (YouTube video,
        Document Center document, or direct file link).

        Args:
            throttle (int): Seconds to wait between link requests. Defaults to 0.

        Returns:
            Path: Local path of the JSON file containing the gathered metadata.
        """
        # Cache the index page under a sensible name, e.g.
        # ca_napa_pd/Penal-Code-Section-8327-b.html
        base_name = f"{self.base_url.split('/')[-1]}.html"
        filename = f"{self.agency_slug}/{base_name}"
        self.cache.download(filename, self.base_url)
        metadata = []
        html = self.cache.read(filename)
        soup = BeautifulSoup(html, "html.parser")
        body = soup.find("div", class_="moduleContentNew")
        sections = body.find_all("div", class_="row outer wide")
        # The first section is page boilerplate; case links start in the second.
        for section in sections[1:]:
            li_items = section.find_all("li", class_="widgetItem")
            links = [li.find("a") for li in li_items if li.find("a")]
            for link in links:
                link_href = link.get("href", None)
                if not link_href:
                    continue
                title = link.get_text(strip=True)
                match = self._CASE_ID_PATTERN.search(title)
                # Fall back to the full title when no case number is present.
                case_id = match.group() if match else title
                if "#" in link_href:
                    # In-page anchor: no downloadable asset behind it.
                    continue
                # elif chain so a YouTube link is not also treated as a
                # direct link (separate ifs would append it twice).
                if "youtube" in link_href:
                    metadata.extend(
                        self._youtube_assets(link_href, case_id, title, filename)
                    )
                elif "DocumentCenter" in link_href:
                    # Only folder-index links are expanded into their documents.
                    if "Index" in link_href:
                        folder_id = link_href.split("/")[-1]
                        metadata.extend(
                            self._document_center_assets(folder_id, case_id, title)
                        )
                else:
                    metadata.append(
                        self._direct_asset(link_href, case_id, title, filename)
                    )
                time.sleep(throttle)
        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile

    def _youtube_assets(self, link_href, case_id, title, parent_page):
        """Build one metadata payload per video found behind a YouTube link."""
        payloads = []
        for yt_data in utils.get_youtube_url_with_metadata(link_href):
            payloads.append(
                {
                    "asset_url": yt_data["url"],
                    "case_id": case_id,
                    "name": yt_data["name"],
                    "title": title,
                    "parent_page": str(parent_page),
                }
            )
        return payloads

    def _document_center_assets(self, folder_id, case_id, title):
        """Build one metadata payload per document under a Document Center folder."""
        payloads = []
        # The folder id doubles as the cache file name for the folder listing.
        for document in self.process_document_center(folder_id, folder_id):
            payloads.append(
                {
                    "asset_url": f'https://www.cityofnapa.org{document.get("URL", "")}',
                    "case_id": case_id,
                    "name": document.get("DisplayName"),
                    "title": title,
                    "parent_page": str(document.get("parent_filename")),
                }
            )
        return payloads

    def _direct_asset(self, link_href, case_id, title, parent_page):
        """Build a metadata payload for a plain (non-YouTube, non-folder) file link."""
        if "cityofnapa.org" not in link_href:
            # Relative link: qualify it against the agency's domain.
            link_href = f"https://www.cityofnapa.org{link_href}"
        name = urllib.parse.unquote(link_href.split("/")[-1])
        return {
            "asset_url": link_href,
            "case_id": case_id,
            "name": name,
            "title": title,
            "parent_page": str(parent_page),
        }

    def process_document_center(self, folder_id, folder_name):
        """Recursively collect document records for a Document Center folder.

        A folder whose listing response has a non-empty "Data" array contains
        subfolders, which are recursed into; otherwise it is a leaf folder and
        its documents are fetched directly.

        Args:
            folder_id: CivicPlus folder id to expand.
            folder_name: Name used for the cached JSON listing file.

        Returns:
            list: Document dicts, each annotated with a "parent_filename" key
            naming the cached JSON file it came from.
        """
        documents_list = []
        # NOTE(review): the payload dicts are shared instance state mutated on
        # each call — fine for sequential scraping, not safe for concurrent use.
        self.loading_payload["value"] = folder_id
        self.loading_payload["selectedFolder"] = int(folder_id)
        filename = f"{self.agency_slug}/{folder_name}.json"
        output_json = self.cache_dir.joinpath(filename)
        with utils.post_url(self.loading_url, json=self.loading_payload) as r:
            self.cache.write_json(output_json, r.json())
        folder_json = self.cache.read_json(output_json)
        folder_list = folder_json.get("Data", [])
        if folder_list:
            # Subfolders present: gather documents from each child folder.
            for folder in folder_list:
                documents_list.extend(
                    self.process_document_center(folder.get("Value"), folder.get("Text"))
                )
        else:
            # Leaf folder: fetch its document listing (overwrites the cached
            # folder listing under the same name, as before).
            self.folder_doc_req_payload["folderId"] = folder_id
            with utils.post_url(
                self.folder_doc_req, json=self.folder_doc_req_payload
            ) as r:
                self.cache.write_json(output_json, r.json())
            folder_json = self.cache.read_json(output_json)
            documents_list = folder_json.get("Documents", [])
            for document in documents_list:
                document["parent_filename"] = filename
        return documents_list
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,29 +1,35 @@ | ||
-i https://pypi.org/simple | ||
beautifulsoup4==4.12.3; python_full_version >= '3.6.0' | ||
brotli==1.1.0; implementation_name == 'cpython' | ||
bs4==0.0.2 | ||
certifi==2024.7.4; python_version >= '3.6' | ||
cffi==1.16.0; python_version >= '3.8' | ||
charset-normalizer==3.3.2; python_full_version >= '3.7.0' | ||
click==8.1.7; python_version >= '3.7' | ||
colorama==0.4.6; platform_system == 'Windows' | ||
cryptography==43.0.1; python_version >= '3.7' | ||
decorator==5.1.1; python_version >= '3.5' | ||
html5lib==1.1; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' | ||
idna==3.7; python_version >= '3.5' | ||
jellyfish==1.1.0; python_version >= '3.7' | ||
mutagen==1.47.0; python_version >= '3.7' | ||
pdfminer.six==20231228; python_version >= '3.6' | ||
pdfplumber==0.11.2; python_version >= '3.8' | ||
pillow==10.4.0; python_version >= '3.8' | ||
py==1.11.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4' | ||
pycparser==2.22; python_version >= '3.8' | ||
pycryptodomex==3.21.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' | ||
pypdfium2==4.30.0; python_version >= '3.6' | ||
python-dotenv==1.0.1; python_version >= '3.8' | ||
pytube==15.0.0; python_version >= '3.7' | ||
requests==2.32.3; python_version >= '3.8' | ||
retry==0.9.2 | ||
six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2' | ||
six==1.16.0; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3' | ||
soupsieve==2.5; python_version >= '3.8' | ||
tenacity==9.0.0; python_version >= '3.8' | ||
typing-extensions==4.12.2; python_version >= '3.8' | ||
urllib3==1.26.18; python_version >= '2.7' and python_version not in '3.0, 3.1, 3.2, 3.3, 3.4, 3.5' | ||
us==3.2.0; python_version >= '3.8' | ||
webencodings==0.5.1 | ||
websockets==13.1; python_version >= '3.8' | ||
yt-dlp==2024.10.22; python_version >= '3.8' |