refactor: deprecate scrape method (#82)
* refactor: deprecate scrape method

* fix: case_number >> case_id

---------

Co-authored-by: Gerald Rich <[email protected]>
newsroomdev committed Aug 19, 2024
1 parent ec8bd12 commit 9de55ae
Showing 10 changed files with 8 additions and 226 deletions.
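The per-agency `scrape` methods removed in this commit all followed the same pattern: read the metadata JSON written by `scrape_meta`, then download each `asset_url` with a throttle. Callers who still need local copies can do the equivalent against that JSON directly. The snippet below is a minimal sketch, not part of this commit: it assumes the metadata keeps the fields shown in the diffs (`asset_url`, `name`, `case_id`) and uses plain `requests` where the project uses its own `Cache.download` helper.

```python
import json
import time
from pathlib import Path

import requests  # stand-in for the project's Cache.download helper


def download_assets(
    metadata_json: Path, out_dir: Path, throttle: int = 4, filter: str = ""
) -> list[Path]:
    """Rough, unofficial replacement for the deprecated Site.scrape methods."""
    assets = json.loads(metadata_json.read_text())
    downloaded = []
    for asset in assets:
        url = asset["asset_url"]
        # Same filter semantics the removed methods used: skip non-matching URLs
        if filter and filter not in url:
            continue
        # Mirror the case_id/name folder layout used by _make_download_path
        dl_path = out_dir / asset["case_id"] / asset["name"].replace(" ", "_")
        dl_path.parent.mkdir(parents=True, exist_ok=True)
        dl_path.write_bytes(requests.get(url, timeout=60).content)
        downloaded.append(dl_path)
        time.sleep(throttle)
    return downloaded
```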
21 changes: 4 additions & 17 deletions clean/ca/humboldt_pd.py
@@ -1,6 +1,5 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup, Tag

@@ -93,7 +92,7 @@ def _get_asset_links(self, pages, parent_page) -> list:
name = link.string
payload = {
"title": title,
"case_number": name,
"case_id": name,
"parent_page": str(parent_page),
"asset_url": f"{'https://humboldtgov.org'}{href}",
"name": name,
@@ -108,12 +107,12 @@ def _get_asset_links(self, pages, parent_page) -> list:
if soup.title and isinstance(soup.title.string, str)
else None
)
case_number = page["page_name"].split("/")[-1].split("_")[0]
case_id = page["page_name"].split("/")[-1].split("_")[0]
header = soup.find("h1")
name = header.get_text(strip=True) if header else None
payload = {
"title": title,
"case_number": case_number,
"case_id": case_id,
"parent_page": str(parent_page),
"download_page": str(page["page_name"]),
"asset_url": f"https://humboldtgov.nextrequest.com{link['href']}",
@@ -122,20 +121,8 @@ def _get_asset_links(self, pages, parent_page) -> list:
metadata.append(payload)
return metadata

def scrape(self, throttle: int = 4, filter: str = "") -> List[Path]:
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
dl_assets = []
for asset in metadata:
url = asset["asset_url"]
dl_path = self._make_download_path(asset)
time.sleep(throttle)
dl_assets.append(self.cache.download(str(dl_path), url))
return dl_assets

def _make_download_path(self, asset):
folder_name = asset["case_number"]
folder_name = asset["case_id"]
name = asset["name"]
# If name has no extension, mark it as pdf since it's a document format per metadata
if len(name.split(".")) == 1:
20 changes: 0 additions & 20 deletions clean/ca/orange_county_sheriff.py
@@ -1,6 +1,4 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup

@@ -32,24 +30,6 @@ def scrape_meta(self, throttle: int = 0) -> Path:
downloadable_files = self._create_json()
return downloadable_files

def scrape(self, throttle: int = 0, filter: str = "") -> List[Path]:
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
downloaded_assets = []
for asset in metadata:
url = asset["asset_url"]
if filter and filter not in url:
continue
index_dir = (
asset["parent_page"].split(f"{self.agency_slug}/")[-1].rstrip(".html")
)
asset_name = asset["name"].replace(" ", "_")
download_path = Path(self.agency_slug, "assets", index_dir, asset_name)
time.sleep(throttle)
downloaded_assets.append(self.cache.download(str(download_path), url))
return downloaded_assets

def _create_json(self) -> Path:
metadata = []
file_stem = self.disclosure_url.split("/")[-1]
2 changes: 1 addition & 1 deletion clean/ca/sacramento_pd.py
@@ -19,7 +19,7 @@

class Site:
"""
Scrape file metadata and download files for the Sacramento Police Department for SB16/SB1421/AB748 data.
Scrape file metadata and asset_urls for the Sacramento Police Department for SB16/SB1421/AB748 data.
Attributes:
name (str): The official name of the agency
30 changes: 0 additions & 30 deletions clean/ca/san_diego_pd.py
@@ -71,36 +71,6 @@ def scrape_meta(self, throttle: int = 0) -> Path:
downloadable_files = self._get_asset_links()
return downloadable_files

def scrape(self, throttle: int = 0, filter: str = "") -> List[Path]:
"""Download file assets from agency.
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 0.
filter (str): Only download URLs that match the filter. Defaults to None.
Returns:
List[Path]: List of local paths to downloaded files
"""
# Get metadata on downloadable files
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
downloaded_assets = []
for asset in metadata:
url = asset["asset_url"]
# Skip non-matching files if filter applied
if filter and filter not in url:
continue
# Get relative path to parent index_page directory
index_dir = asset["case_id"]
asset_name = asset["name"].replace(" ", "_")
download_path = Path(self.agency_slug, "assets", index_dir, asset_name)
# Download the file to agency directory/assets/index_page_dir/case_name/file_name
# Example: 'ca_san_diego_pd/assets/sb16-sb1421-ab748/11-21-2022_IA_2022-013/November_21,_2022_IA_#2022-013_Audio_Interview_Complainant_Redacted_KM.wav'
time.sleep(throttle)
downloaded_assets.append(self.cache.download(str(download_path), url))
return downloaded_assets

# Helper functions
def _get_asset_links(self) -> Path:
"""Extract link to files and videos from child pages."""
13 changes: 0 additions & 13 deletions clean/ca/santa_rosa.py
@@ -1,6 +1,5 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup

@@ -69,18 +68,6 @@ def scrape_meta(self, throttle=0):
self.cache.write_json(outfile, metadata)
return outfile

def scrape(self, throttle: int = 4, filter: str = "") -> List[Path]:
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
dl_assets = []
for asset in metadata:
url = asset["asset_url"]
dl_path = self._make_download_path(asset)
time.sleep(throttle)
dl_assets.append(self.cache.download(str(dl_path), url))
return dl_assets

def _make_download_path(self, asset):
url = asset["asset_url"]
# If name ends in `pdf?dl=1`, handle one way
14 changes: 0 additions & 14 deletions clean/ca/sonoma_county_sheriff.py
@@ -1,6 +1,4 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup

@@ -64,18 +62,6 @@ def scrape_meta(self, throttle=0):
self.cache.write_json(outfile, metadata)
return outfile

def scrape(self, throttle: int = 4, filter: str = "") -> List[Path]:
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
dl_assets = []
for asset in metadata:
url = asset["asset_url"]
dl_path = self._make_download_path(asset)
time.sleep(throttle)
dl_assets.append(self.cache.download(str(dl_path), url))
return dl_assets

def _make_download_path(self, asset):
# TODO: Update the logic to gracefully handle PDFs in addition to zip files
url = asset["asset_url"]
91 changes: 3 additions & 88 deletions clean/cli.py
@@ -16,17 +16,16 @@ def cli():
def list_agencies():
"""List all available agencies and their slugs.
Agency slugs can then be used with the scrape-meta and scrape subcommands
Agency slugs can then be used with the scrape-meta subcommand
"""
for state, agencies in utils.get_all_scrapers().items():
click.echo(f"{state.upper()}:")
for record in sorted(agencies, key=lambda x: x["slug"]):
click.echo(f" - {record['slug']} ({record['agency']})")
message = (
"\nTo scrape an agency's file metadata or download files, pass an "
"agency slug (e.g. ca_san_diego_pd) as the argument to the scrape-meta or scrape subcommands: \n\n"
"\nTo scrape an agency's file metadata, pass an "
"agency slug (e.g. ca_san_diego_pd) as the argument to the scrape-meta subcommand: \n\n"
"\tclean-scraper scrape-meta ca_san_diego_pd\n"
"\tclean-scraper scrape ca_san_diego_pd\n"
)
click.echo(message)

@@ -108,92 +107,8 @@ def scrape_meta(
runner.scrape_meta(agency)


@click.command()
@click.argument("agency")
@click.option(
"--data-dir",
default=utils.CLEAN_DATA_DIR,
type=click.Path(),
help="The Path were generated data/intermediate files will be saved",
)
@click.option(
"--cache-dir",
default=utils.CLEAN_CACHE_DIR,
type=click.Path(),
help="The Path where results can be cached",
)
@click.option(
"--filter",
"-f",
default="",
type=str,
help="Only download files that match a filter str",
)
@click.option(
"--delete/--no-delete",
default=False,
help="Delete generated files from the cache",
)
@click.option(
"--log-level",
"-l",
default="INFO",
type=click.Choice(
("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"), case_sensitive=False
),
help="Set the logging level",
)
@click.option(
"--throttle",
"-t",
default=0,
help="Set throttle on scraping in seconds. Default is no delay on file downloads.",
)
def scrape(
agency: str,
data_dir: Path,
cache_dir: Path,
filter: str,
delete: bool,
log_level: str,
throttle: int,
):
"""
Command-line interface for downloading CLEAN files.
AGENCY -- An agency slug (e.g. ca_san_diego_pd) to scrape.
Use the 'list' command to see available agencies and their slugs.
clean-scraper list
The 'scrape-meta' command must be run first to generate a JSON file containing metadata on downloadable files.
"""
# Set higher log-level on third-party libs that use DEBUG logging,
# In order to limit debug logging to our library
logging.getLogger("urllib3").setLevel(logging.ERROR)

# Local logging config
logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(message)s")
logger = logging.getLogger(__name__)

# Runner config
data_dir = Path(data_dir)
cache_dir = Path(cache_dir)
runner = Runner(data_dir, cache_dir, throttle)

# Delete files, if asked
if delete:
logger.info("Deleting files generated from previous scraper run.")
runner.delete()

# Try running the scraper
runner.scrape(agency, filter=filter)


cli.add_command(list_agencies)
cli.add_command(scrape_meta)
cli.add_command(scrape)

if __name__ == "__main__":
cli()
29 changes: 0 additions & 29 deletions clean/runner.py
@@ -65,35 +65,6 @@ def scrape_meta(self, agency_slug: str) -> Path:
logger.info(f"Generated {data_path}")
return data_path

def scrape(self, agency_slug: str, filter: str = "") -> Path:
"""Run the scraper for the provided agency.
This method will operate on the metadata JSON file generated by the scrape_meta method.
Args:
agency_slug (str): Unique scraper slug composed of two-letter state postal code and agency slug: e.g. ca_san_diego_pd
filter (str): Filter to limit which files are downloaded. Defaults to None.
Returns: a Path object leading to directory where downloaded files are stored.
"""
# Get the module
if agency_slug[2] != "_":
message = "Scraper slugs must be prefixed with the state postal code and an underscore. "
message += "Example: clean-scraper scrape ca_san_diego_pd. "
message += f"Your supplied agency, {agency_slug}, has no state prefix."
logger.critical(message)

state = agency_slug[:2].strip().lower()
slug = agency_slug[3:].strip().lower()
state_mod = import_module(f"clean.{state}.{slug}")
# Run the scrape method
logger.info(f"Download files for {agency_slug}")
site = state_mod.Site(self.data_dir, self.cache_dir)
data_path = site.scrape(throttle=self.throttle, filter=filter)
# Run the path to the data file
logger.info(f"Generated {data_path}")
return data_path

def delete(self):
"""Delete the files in the output directories."""
logger.debug(f"Deleting files in {self.data_dir}")
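With `Runner.scrape` removed, the runner's public surface is effectively `scrape_meta` plus `delete`. A minimal sketch of driving it programmatically, mirroring the wiring the retained `scrape-meta` CLI command uses; the module path and positional constructor arguments are assumptions read off this diff, not documented API:

```python
from pathlib import Path

from clean.runner import Runner  # module path assumed from clean/runner.py above

# Arguments mirror the CLI's retained wiring: data_dir, cache_dir, throttle
runner = Runner(Path("./data"), Path("./cache"), 2)
metadata_path = runner.scrape_meta("ca_san_diego_pd")
print(f"Metadata JSON written to {metadata_path}")
```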
7 changes: 0 additions & 7 deletions docs/contributing.md
@@ -122,12 +122,6 @@ class Site:
# 2. Generate a metadata JSON file and store in the cache
# 3. Return the path to the metadata JSON
pass

def scrape(self, throttle: int = 0, filter: str = "") -> List[Path]:
# 1. Use the metadata JSON generated by `scrape_meta` to download available files
# to the cache/assets directory (once again, check out Cache.download).
# 2. Return a list of paths to downloaded files
pass
```

When creating a scraper, there are a few rules of thumb.
@@ -265,7 +259,6 @@ Options:

Commands:
list List all available agencies and their slugs.
scrape Command-line interface for downloading CLEAN files.
scrape-meta Command-line interface for generating metadata CSV about...
```
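For scraper authors, the contract in docs/contributing.md shrinks accordingly: a new agency module now only needs a `Site` class with `scrape_meta`. A hedged skeleton of that interface, with the constructor signature assumed from the `Site(self.data_dir, self.cache_dir)` call removed from runner.py above:

```python
from pathlib import Path


class Site:
    """Sketch of the interface a new scraper implements after this change."""

    def __init__(self, data_dir: Path, cache_dir: Path):
        self.data_dir = data_dir
        self.cache_dir = cache_dir

    def scrape_meta(self, throttle: int = 0) -> Path:
        # 1. Collect metadata on the agency's downloadable files
        # 2. Write a metadata JSON file to the cache
        # 3. Return the path to the metadata JSON
        raise NotImplementedError
```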

7 changes: 0 additions & 7 deletions tests/test_runner.py
@@ -16,10 +16,3 @@ def test_scrape_meta(runner):
runner.scrape_meta("ca_san_diego_pd")
# Assert that the scrape_meta method was called
mock_scrape_meta.assert_called_once_with(throttle=0)


def test_scrape(runner):
with patch("clean.ca.san_diego_pd.Site.scrape") as mock_scrape:
runner.scrape("ca_san_diego_pd")
# Assert that the scrape method was called
mock_scrape.assert_called_once_with(throttle=0, filter="")
