added scrape meta for chula_vista_pd biglocalnews#94

naumansharifwork · Aug 25, 2024 · e384e9d · e384e9d
1 parent 38b610c
commit e384e9d
Show file tree

Hide file tree

Showing 2 changed files with 126 additions and 0 deletions.
diff --git a/clean/ca/chula_vista_pd.py b/clean/ca/chula_vista_pd.py
@@ -0,0 +1,111 @@
+import time
+import urllib.parse
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+
+from .. import utils
+from ..cache import Cache
+from .config.chula_vista_pd import index_request_headers
+
+
+class Site:
+    """Scrape file metadata and download files for the City of Chula Vista Police Department.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "Chula Vista Police Department"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.base_url = "https://www.chulavistaca.gov/departments/police-department/senate-bill-1421"
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.cache = Cache(cache_dir)
+
+    @property
+    def agency_slug(self) -> str:
+        """Construct the agency slug."""
+        # Use module path to construct agency slug, which we'll use downstream
+        mod = Path(__file__)
+        state_postal = mod.parent.stem
+        return f"{state_postal}_{mod.stem}"  # ca_chula_vista_pd
+
+    def scrape_meta(self, throttle=0):
+        # construct a local filename relative to the cache directory - agency slug + page url (ca_chula_vista_pd/senate-bill-1421.html)
+        # download the page (if not already cached)
+        # save the index page url to cache (sensible name)
+        base_name = f"{self.base_url.split('/')[-1]}.html"
+        filename = f"{self.agency_slug}/{base_name}"
+        self.cache.download(filename, self.base_url, headers=index_request_headers)
+        metadata = []
+        html = self.cache.read(filename)
+        soup = BeautifulSoup(html, "html.parser")
+        content_areas = soup.find_all("div", class_="content_area clearfix")
+        desired_element = None
+        for content_area in content_areas:
+            previous_h2 = content_area.find_previous("h2")
+            if previous_h2 and previous_h2.text == "Documents":
+                desired_element = content_area
+                break
+
+        if desired_element:
+            sections = desired_element.find_all("div", class_="accordion-item")
+            for section in sections:
+                title = section.find("div", class_="title").get_text(strip=True)
+                links = section.find_all("a")
+                for link in links:
+                    link_href = link.get("href", None)
+
+                    case_id = link.get_text().replace("\u00a0", " ")
+                    # case_id = encoded_text.encode('latin1').decode('unicode_escape').encode('latin1').decode('utf-8')
+                    if link_href:
+                        if "splash" not in link_href:
+                            link_href = f"https://www.chulavistaca.gov{link_href}"
+                            name = link_href.split("/")[-1]
+                            payload = {
+                                "asset_url": link_href,
+                                "case_id": case_id,
+                                "name": name,
+                                "title": title,
+                                "parent_page": str(filename),
+                            }
+                            metadata.append(payload)
+                        else:
+                            link_href = f"https://www.chulavistaca.gov{link_href}"
+                            link_href = self._convert_splash_link(link_href)
+                            name = link_href.split("/")[-1]
+                            payload = {
+                                "asset_url": link_href,
+                                "case_id": case_id,
+                                "name": name,
+                                "title": title,
+                                "parent_page": str(filename),
+                            }
+                            metadata.append(payload)
+
+                    time.sleep(throttle)
+            outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
+            self.cache.write_json(outfile, metadata)
+            return outfile
+
+    def _convert_splash_link(self, link):
+        # Takes a splash link as input and return the actual link after converting
+        print(link)
+        parsed_url = urllib.parse.urlparse(link)
+        parsed_params = urllib.parse.parse_qs(parsed_url.query)
+
+        # Decode the splash URL
+        decoded_splash_link = urllib.parse.unquote(parsed_params["splash"][0])
+        return decoded_splash_link
diff --git a/clean/ca/config/chula_vista_pd.py b/clean/ca/config/chula_vista_pd.py
@@ -0,0 +1,15 @@
+index_request_headers = {
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "accept-language": "en-US,en;q=0.9,nl;q=0.8,ur;q=0.7,ru;q=0.6",
+    "cache-control": "max-age=0",
+    "priority": "u=0, i",
+    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": '"Windows"',
+    "sec-fetch-dest": "document",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-site": "none",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/127.0.0.0 Safari/537.36",
+}