NextRequest #84 ; BART #96 ; LAPD #18 (#105)
Co-authored-by: Gerald Rich <[email protected]>
stucka and newsroomdev authored Oct 23, 2024
1 parent effda8e commit d603207
Showing 3 changed files with 333 additions and 4 deletions.
73 changes: 73 additions & 0 deletions clean/ca/bay_area_rapid_transit_pd.py
@@ -0,0 +1,73 @@
import logging
from pathlib import Path
from typing import Dict, List

from .. import utils
from ..cache import Cache
from ..platforms.nextrequest import process_nextrequest

# from ..utils import MetadataDict

logger = logging.getLogger(__name__)


class Site:
"""Scrape file metadata for the Bay Area Rapid Transit Police Department -- BART PD.
Attributes:
name (str): The official name of the agency
"""

name = "Bay Area Rapid Transit Police Department"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_bay_area_rapid_transit_pd"
self.base_url = "https://bart.nextrequest.com"
# The initial disclosure page (i.e., where the agency starts complying with the law) contains a list of "detail"/child pages
# linking to the SB 16/SB 1421/AB 748 videos and files, along with additional index pages
self.disclosure_url = "https://bart.nextrequest.com/requests/21-107"
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.cache = Cache(cache_dir)
for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

def scrape_meta(self, throttle: int = 2) -> Path:
"""Gather metadata on downloadable files (videos, etc.).
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 2.
Returns:
Path: Local path of JSON file containing metadata on downloadable files
"""
to_be_scraped: Dict = {
"https://bart.nextrequest.com/requests/21-107": True,
}

metadata: List = []

subpages_dir = self.subpages_dir

for start_url in to_be_scraped:
force = to_be_scraped[start_url]
local_metadata = process_nextrequest(
subpages_dir, start_url, force, throttle
)
metadata.extend(local_metadata)

json_filename = self.data_dir / (self.site_slug + ".json")
self.cache.write_json(json_filename, metadata)

return json_filename
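For orientation, here is a minimal driver sketch for the BART scraper above. The import path and the standalone invocation are assumptions for illustration (the project may normally run these Site classes through its own runner), not part of this commit:

# Illustrative only; assumes the package layout shown in this diff.
from clean.ca.bay_area_rapid_transit_pd import Site

site = Site()  # defaults to utils.CLEAN_DATA_DIR and utils.CLEAN_CACHE_DIR
json_path = site.scrape_meta(throttle=2)  # writes ca_bay_area_rapid_transit_pd.json
print(json_path)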
257 changes: 257 additions & 0 deletions clean/ca/los_angeles_pd.py
@@ -0,0 +1,257 @@
import logging
from pathlib import Path
from time import sleep
from typing import Dict, List, Set
from urllib.parse import unquote, urlparse

from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
from ..platforms.nextrequest import process_nextrequest

logger = logging.getLogger(__name__)


"""
To-do:
Not doing, as there's no persistence:
-- Track which subpage files have been read through the indexes, but also check whether any
subpage files were NOT indexed, and read those as well
"""


class Site:
"""Scrape file metadata for the Los Angeles Police Department -- LAPD.
Attributes:
name (str): The official name of the agency
"""

name = "Los Angeles Police Department"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_los_angeles_pd"
self.first_url = (
"https://www.lapdonline.org/senate-bill-1421-senate-bill-16-sb-16/"
)
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.indexes_dir = cache_dir / self.site_slug
self.cache = Cache(cache_dir)
self.rescrape_all_case_files = False # Do we need to rescrape all the subpages?

for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

self.detail_urls = self.indexes_dir / "url_details.json"
self.indexes_scraped = self.indexes_dir / "indexes-scraped.json"

# Build a list of URLs that should not be scraped
# FIXME: Remove in favor of Cosmos DB's existing list & its notification system
self.broken_urls = [
"https://lacity.nextrequest.com/documents?folder_filter=F009-01",
"https://lacity.nextrequest.com/documents?folder_filter=F050-20",
"https://lacity.nextrequest.com/documents?folder_filter=F025-15",
]

# Build a dict of URLs that need to be patched up
self.url_fixes = {
"https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/risk-management-division__trashed/sustained-complaints-of-unlawful-arrest-unlawful-search/": "https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/sustained-complaints-of-unlawful-arrest-unlawful-search/",
"F118-04 November 22, 2004": "https://lacity.nextrequest.com/documents?folder_filter=F118-04",
" https://lacity.nextrequest.com/documents?folder_filter=CF01-3445": "https://lacity.nextrequest.com/documents?folder_filter=CF01-3445",
}

def scrape_meta(self, throttle: int = 2) -> Path:
"""Gather metadata on downloadable files (videos, etc.).
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 2.
Returns:
Path: Local path of JSON file containing metadata on downloadable files
"""
lookup = self.fetch_indexes(throttle)
json_filename, metadata = self.fetch_subpages(throttle)

logger.debug("Adding origin details to metadata")
for i, entry in enumerate(metadata):
if entry["case_id"] in lookup:
metadata[i]["details"]["bln_source"] = lookup[entry["case_id"]]
self.cache.write_json(json_filename, metadata)

return json_filename

def url_to_filename(self, url: str):
"""Turn a URL into a proposed filename."""
# We really really really need a slugify thing
path = urlparse(url).path
if path.startswith("/"):
path = path[1:]
if path.endswith("/"):
path = path[:-1]
path = path.replace("/", "_")
path += ".html"
return path
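A worked example of the filename slugging above (illustrative: the URL is the index page referenced in this file, the output is derived from the code rather than from a recorded run, and the import path is an assumption based on the file layout in this diff):

from clean.ca.los_angeles_pd import Site  # assumed import path

Site().url_to_filename("https://www.lapdonline.org/senate-bill-1421-senate-bill-16-sb-16/")
# -> "senate-bill-1421-senate-bill-16-sb-16.html"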

def clean_url(self, page_url, local_url):
"""Correct bad URLs.
Args:
page_url: The URL of the page that got us the link
local_url: The proposed URL we're trying to clean up
Returns:
Cleaned URL, with full domain and scheme as needed.
The URL is checked against the replacements in self.url_fixes (set in __init__).
"""
if local_url in self.url_fixes:
local_url = self.url_fixes[local_url]
if urlparse(local_url).netloc == "":
local_url = urlparse(page_url).netloc + local_url
if urlparse(local_url).scheme == "":
local_url = "https://" + local_url  # prepend a scheme so the result is an absolute URL
return local_url
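And a sketch of how clean_url repairs a relative link lifted from an index page, assuming the "https://" prefix fix above (values are illustrative, continuing the assumed import from the previous sketch):

Site().clean_url(
    "https://www.lapdonline.org/senate-bill-1421-senate-bill-16-sb-16/",
    "/office-of-the-chief-of-police/constitutional-policing/",
)
# -> "https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/"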

def fetch_indexes(self, throttle: int = 2):
"""Recursively download LAPD index pages to find subpage URLs.
Args:
throttle (int): Time to wait between requests
Returns:
lookup (dict): Supplemental data to add to metadata details
Writes:
url_details.json
indexes-scraped.json
"""
scraping_complete = False

detail_urls: Dict = {}
indexes_scraped: Dict = {}
indexes_todo: Set = set()
index_passes = 0

indexes_todo.add(self.first_url)

# Need to add sleep between calls

while not scraping_complete:
index_passes += 1
for page_url in list(
indexes_todo
): # work with a copy so we're not thrashing the original
filename = self.url_to_filename(page_url)
filename = self.indexes_dir / filename
indexes_scraped[page_url] = {
"subindexes": [],
"details": 0,
}
cleaned_page_url = self.clean_url(page_url, page_url)
logger.debug(f"Trying {cleaned_page_url}")
r = utils.get_url(cleaned_page_url)

self.cache.write_binary(filename, r.content)

sleep(throttle)

# Parse the page we just fetched and cached
soup = BeautifulSoup(r.content, features="html.parser")

page_title = soup.title
if page_title:
page_title = unquote(page_title.text.strip()) # type: ignore

content_divs = soup.findAll("div", {"class": "grid-content"})
content_divs.extend(soup.findAll("div", {"class": "link-box"}))
for content_div in content_divs:
links = content_div.findAll("a")
for link in links:
original_href = link["href"]
href = self.clean_url(page_url, original_href)
if urlparse(href).netloc.endswith(".nextrequest.com"):
if original_href in self.broken_urls:
logger.debug(f"Not scraping broken URL {original_href}")
else:
if href not in detail_urls:
detail_urls[href] = []
detail_urls[href].append(
{"page_title": page_title, "page_url": page_url}
)
indexes_scraped[page_url]["details"] += 1
else:
if original_href not in indexes_scraped:
indexes_todo.add(original_href)
indexes_scraped[page_url]["subindexes"].append(
original_href
)

for url in indexes_scraped:
if url in indexes_todo:
indexes_todo.remove(url)
if len(indexes_todo) == 0:
logger.debug(
f"Index scraping complete, after {len(indexes_scraped):,} indexes reviewed."
)
logger.debug(f"{len(detail_urls):,} case URLs found.")
scraping_complete = True
else:
logger.debug(
f"Index scraping pass {index_passes:,}: {len(indexes_scraped):,} indexes scraped, {len(detail_urls):,} case URLs found"
)

self.cache.write_json(self.detail_urls, detail_urls)

self.cache.write_json(self.indexes_scraped, indexes_scraped)

lookup: Dict = {}
for entry in detail_urls:
lookup[entry.split("=")[-1]] = detail_urls[entry]

return lookup
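The lookup returned here keys each NextRequest folder ID (the value after folder_filter= in the case URL) to the index pages that linked to it; roughly, with illustrative values:

# e.g. "https://lacity.nextrequest.com/documents?folder_filter=F118-04" yields the key "F118-04"
lookup = {
    "F118-04": [
        {"page_title": "Example LAPD index page", "page_url": "https://www.lapdonline.org/example-index/"},
    ],
}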

def fetch_subpages(self, throttle):
"""Download all subpage URLs as needed; parse all pages.
Args:
throttle: Time to wait between requests
Notes:
self.rescrape_all_case_files decides whether already cached subpage files should be re-downloaded
Returns:
Filename of JSON metadata
Metadata
"""
# Determine whether everything needs to be rescraped
force = self.rescrape_all_case_files

detail_urls = self.cache.read_json(self.detail_urls)

# Let's not do anything but reads to detail_urls
to_be_scraped: Dict = {}
for detail_url in detail_urls.keys():
to_be_scraped[detail_url] = force

metadata: List = []

subpages_dir = self.subpages_dir

for start_url in to_be_scraped:
force = to_be_scraped[start_url]
local_metadata = process_nextrequest(
subpages_dir, start_url, force, throttle
)
metadata.extend(local_metadata)

json_filename = self.data_dir / (self.site_slug + ".json")
self.cache.write_json(json_filename, metadata)
return json_filename, metadata
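Putting the two passes together: fetch_indexes records which LAPD index page pointed at each NextRequest folder, fetch_subpages pulls the per-file metadata via process_nextrequest, and scrape_meta stitches the provenance into details["bln_source"]. A rough sketch of one resulting entry is below; apart from case_id, details, and bln_source, the field names are assumptions, since the NextRequest metadata schema isn't shown in this diff:

example_entry = {
    "case_id": "F118-04",  # folder ID parsed from the NextRequest URL
    "asset_url": "https://lacity.nextrequest.com/...",  # assumed field name, placeholder URL
    "details": {
        "bln_source": [
            {"page_title": "Example LAPD index page", "page_url": "https://www.lapdonline.org/example-index/"},
        ],
    },
}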
7 changes: 3 additions & 4 deletions clean/platforms/nextrequest.py
@@ -277,19 +277,18 @@ def fingerprint_nextrequest(start_url: str):
"""
line = None
parsed_url = urlparse(start_url)
folder_id = parse_qs(parsed_url.query)["folder_filter"][0]
if parsed_url.path == "/documents": # LAPDish type
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
line = {
"site_type": "lapdish", # LAPDish type
"base_url": base_url,
"folder_id": folder_id,
"folder_id": parse_qs(parsed_url.query)["folder_filter"][0],
"page_size": 50,
"doc_limit": 9950, # Max number of accessible docs in a folder
"tally_field": "total_count",
"bln_page_url": "bln_page_url",
"bln_total_documents": "bln_total_documents",
"json_url": f"{base_url}/client/documents?sort_field=count&sort_order=desc&page_size=50&folder_filter={folder_id}&page_number=", # type: ignore
"json_url": f"{base_url}/client/documents?sort_field=count&sort_order=desc&page_size=50&folder_filter={line['folder_id']}&page_number=", # type: ignore
"details": {
"document_path": "document_path",
"description": "description",
@@ -354,4 +353,4 @@ def fingerprint_nextrequest(start_url: str):


def find_max_pages(item_count: int, page_size: int):
return ceil(item_count / page_size) # type: ignore
return ceil(item_count, page_size) # type: ignore
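For reference, the paging arithmetic this helper is meant to perform: with the lapdish fingerprint's page_size of 50 and doc_limit of 9,950, the page count works out as below (a minimal sketch assuming the single-argument math.ceil form, i.e. ceil(item_count / page_size)):

from math import ceil

# 9,950 documents at 50 per page -> 199 pages; 101 documents -> 3 pages
assert ceil(9950 / 50) == 199
assert ceil(101 / 50) == 3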
