refactor: deprecate scrape method (#82)
* refactor: deprecate scrape method

* fix: case_number >> case_id

---------

Co-authored-by: Gerald Rich <[email protected]>
newsroomdev committed Aug 19, 2024
1 parent ec8bd12 commit 9de55ae
Showing 10 changed files with 8 additions and 226 deletions.
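The per-agency `scrape` methods removed in this commit all followed the same pattern: read the metadata JSON written by `scrape_meta`, then download each `asset_url` with a throttle. Callers who still need local copies can do the equivalent against that JSON directly. The snippet below is a minimal sketch, not part of this commit: it assumes the metadata keeps the fields shown in the diffs (`asset_url`, `name`, `case_id`) and uses plain `requests` where the project uses its own `Cache.download` helper.

```python
import json
import time
from pathlib import Path

import requests  # stand-in for the project's Cache.download helper


def download_assets(
    metadata_json: Path, out_dir: Path, throttle: int = 4, filter: str = ""
) -> list[Path]:
    """Rough, unofficial replacement for the deprecated Site.scrape methods."""
    assets = json.loads(metadata_json.read_text())
    downloaded = []
    for asset in assets:
        url = asset["asset_url"]
        # Same filter semantics the removed methods used: skip non-matching URLs
        if filter and filter not in url:
            continue
        # Mirror the case_id/name folder layout used by _make_download_path
        dl_path = out_dir / asset["case_id"] / asset["name"].replace(" ", "_")
        dl_path.parent.mkdir(parents=True, exist_ok=True)
        dl_path.write_bytes(requests.get(url, timeout=60).content)
        downloaded.append(dl_path)
        time.sleep(throttle)
    return downloaded
```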
21 changes: 4 additions & 17 deletions clean/ca/humboldt_pd.py
@@ -1,6 +1,5 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup, Tag

@@ -93,7 +92,7 @@ def _get_asset_links(self, pages, parent_page) -> list:
name = link.string
payload = {
"title": title,
"case_number": name,
"case_id": name,
"parent_page": str(parent_page),
"asset_url": f"{'https://humboldtgov.org'}{href}",
"name": name,
@@ -108,12 +107,12 @@ def _get_asset_links(self, pages, parent_page) -> list:
if soup.title and isinstance(soup.title.string, str)
else None
)
case_number = page["page_name"].split("/")[-1].split("_")[0]
case_id = page["page_name"].split("/")[-1].split("_")[0]
header = soup.find("h1")
name = header.get_text(strip=True) if header else None
payload = {
"title": title,
"case_number": case_number,
"case_id": case_id,
"parent_page": str(parent_page),
"download_page": str(page["page_name"]),
"asset_url": f"https://humboldtgov.nextrequest.com{link['href']}",
@@ -122,20 +121,8 @@ def _get_asset_links(self, pages, parent_page) -> list:
metadata.append(payload)
return metadata

def scrape(self, throttle: int = 4, filter: str = "") -> List[Path]:
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
dl_assets = []
for asset in metadata:
url = asset["asset_url"]
dl_path = self._make_download_path(asset)
time.sleep(throttle)
dl_assets.append(self.cache.download(str(dl_path), url))
return dl_assets

def _make_download_path(self, asset):
folder_name = asset["case_number"]
folder_name = asset["case_id"]
name = asset["name"]
# If name has no extension, mark it as pdf since it's a document format per metadata
if len(name.split(".")) == 1:
20 changes: 0 additions & 20 deletions clean/ca/orange_county_sheriff.py
@@ -1,6 +1,4 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup

@@ -32,24 +30,6 @@ def scrape_meta(self, throttle: int = 0) -> Path:
downloadable_files = self._create_json()
return downloadable_files

def scrape(self, throttle: int = 0, filter: str = "") -> List[Path]:
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
downloaded_assets = []
for asset in metadata:
url = asset["asset_url"]
if filter and filter not in url:
continue
index_dir = (
asset["parent_page"].split(f"{self.agency_slug}/")[-1].rstrip(".html")
)
asset_name = asset["name"].replace(" ", "_")
download_path = Path(self.agency_slug, "assets", index_dir, asset_name)
time.sleep(throttle)
downloaded_assets.append(self.cache.download(str(download_path), url))
return downloaded_assets

def _create_json(self) -> Path:
metadata = []
file_stem = self.disclosure_url.split("/")[-1]
2 changes: 1 addition & 1 deletion clean/ca/sacramento_pd.py
@@ -19,7 +19,7 @@

class Site:
"""
Scrape file metadata and download files for the Sacramento Police Department for SB16/SB1421/AB748 data.
Scrape file metadata and asset_urls for the Sacramento Police Department for SB16/SB1421/AB748 data.
Attributes:
name (str): The official name of the agency
30 changes: 0 additions & 30 deletions clean/ca/san_diego_pd.py
@@ -71,36 +71,6 @@ def scrape_meta(self, throttle: int = 0) -> Path:
downloadable_files = self._get_asset_links()
return downloadable_files

def scrape(self, throttle: int = 0, filter: str = "") -> List[Path]:
"""Download file assets from agency.
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 0.
filter (str): Only download URLs that match the filter. Defaults to None.
Returns:
List[Path]: List of local paths to downloaded files
"""
# Get metadata on downloadable files
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
downloaded_assets = []
for asset in metadata:
url = asset["asset_url"]
# Skip non-matching files if filter applied
if filter and filter not in url:
continue
# Get relative path to parent index_page directory
index_dir = asset["case_id"]
asset_name = asset["name"].replace(" ", "_")
download_path = Path(self.agency_slug, "assets", index_dir, asset_name)
# Download the file to agency directory/assets/index_page_dir/case_name/file_name
# Example: 'ca_san_diego_pd/assets/sb16-sb1421-ab748/11-21-2022_IA_2022-013/November_21,_2022_IA_#2022-013_Audio_Interview_Complainant_Redacted_KM.wav'
time.sleep(throttle)
downloaded_assets.append(self.cache.download(str(download_path), url))
return downloaded_assets

# Helper functions
def _get_asset_links(self) -> Path:
"""Extract link to files and videos from child pages."""
13 changes: 0 additions & 13 deletions clean/ca/santa_rosa.py
@@ -1,6 +1,5 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup

@@ -69,18 +68,6 @@ def scrape_meta(self, throttle=0):
self.cache.write_json(outfile, metadata)
return outfile

def scrape(self, throttle: int = 4, filter: str = "") -> List[Path]:
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
dl_assets = []
for asset in metadata:
url = asset["asset_url"]
dl_path = self._make_download_path(asset)
time.sleep(throttle)
dl_assets.append(self.cache.download(str(dl_path), url))
return dl_assets

def _make_download_path(self, asset):
url = asset["asset_url"]
# If name ends in `pdf?dl=1`, handle one way
14 changes: 0 additions & 14 deletions clean/ca/sonoma_county_sheriff.py
@@ -1,6 +1,4 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup

@@ -64,18 +62,6 @@ def scrape_meta(self, throttle=0):
self.cache.write_json(outfile, metadata)
return outfile

def scrape(self, throttle: int = 4, filter: str = "") -> List[Path]:
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
dl_assets = []
for asset in metadata:
url = asset["asset_url"]
dl_path = self._make_download_path(asset)
time.sleep(throttle)
dl_assets.append(self.cache.download(str(dl_path), url))
return dl_assets

def _make_download_path(self, asset):
# TODO: Update the logic to gracefully handle PDFs in addition to zip files
url = asset["asset_url"]
91 changes: 3 additions & 88 deletions clean/cli.py
@@ -16,17 +16,16 @@ def cli():
def list_agencies():
"""List all available agencies and their slugs.
Agency slugs can then be used with the scrape-meta and scrape subcommands
Agency slugs can then be used with the scrape-meta subcommand
"""
for state, agencies in utils.get_all_scrapers().items():
click.echo(f"{state.upper()}:")
for record in sorted(agencies, key=lambda x: x["slug"]):
click.echo(f" - {record['slug']} ({record['agency']})")
message = (
"\nTo scrape an agency's file metadata or download files, pass an "
"agency slug (e.g. ca_san_diego_pd) as the argument to the scrape-meta or scrape subcommands: \n\n"
"\nTo scrape an agency's file metadata, pass an "
"agency slug (e.g. ca_san_diego_pd) as the argument to the scrape-meta subcommand: \n\n"
"\tclean-scraper scrape-meta ca_san_diego_pd\n"
"\tclean-scraper scrape ca_san_diego_pd\n"
)
click.echo(message)

@@ -108,92 +107,8 @@ def scrape_meta(
runner.scrape_meta(agency)


@click.command()
@click.argument("agency")
@click.option(
"--data-dir",
default=utils.CLEAN_DATA_DIR,
type=click.Path(),
help="The Path were generated data/intermediate files will be saved",
)
@click.option(
"--cache-dir",
default=utils.CLEAN_CACHE_DIR,
type=click.Path(),
help="The Path where results can be cached",
)
@click.option(
"--filter",
"-f",
default="",
type=str,
help="Only download files that match a filter str",
)
@click.option(
"--delete/--no-delete",
default=False,
help="Delete generated files from the cache",
)
@click.option(
"--log-level",
"-l",
default="INFO",
type=click.Choice(
("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"), case_sensitive=False
),
help="Set the logging level",
)
@click.option(
"--throttle",
"-t",
default=0,
help="Set throttle on scraping in seconds. Default is no delay on file downloads.",
)
def scrape(
agency: str,
data_dir: Path,
cache_dir: Path,
filter: str,
delete: bool,
log_level: str,
throttle: int,
):
"""
Command-line interface for downloading CLEAN files.
AGENCY -- An agency slug (e.g. ca_san_diego_pd) to scrape.
Use the 'list' command to see available agencies and their slugs.
clean-scraper list
The 'scrape-meta' command must be run first to generate a JSON file containing metadata on downloadable files.
"""
# Set higher log-level on third-party libs that use DEBUG logging,
# In order to limit debug logging to our library
logging.getLogger("urllib3").setLevel(logging.ERROR)

# Local logging config
logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(message)s")
logger = logging.getLogger(__name__)

# Runner config
data_dir = Path(data_dir)
cache_dir = Path(cache_dir)
runner = Runner(data_dir, cache_dir, throttle)

# Delete files, if asked
if delete:
logger.info("Deleting files generated from previous scraper run.")
runner.delete()

# Try running the scraper
runner.scrape(agency, filter=filter)


cli.add_command(list_agencies)
cli.add_command(scrape_meta)
cli.add_command(scrape)

if __name__ == "__main__":
cli()
29 changes: 0 additions & 29 deletions clean/runner.py
@@ -65,35 +65,6 @@ def scrape_meta(self, agency_slug: str) -> Path:
logger.info(f"Generated {data_path}")
return data_path

def scrape(self, agency_slug: str, filter: str = "") -> Path:
"""Run the scraper for the provided agency.
This method will operate on the metadata JSON file generated by the scrape_meta method.
Args:
agency_slug (str): Unique scraper slug composed of two-letter state postal code and agency slug: e.g. ca_san_diego_pd
filter (str): Filter to limit which files are downloaded. Defaults to None.
Returns: a Path object leading to directory where downloaded files are stored.
"""
# Get the module
if agency_slug[2] != "_":
message = "Scraper slugs must be prefixed with the state postal code and an underscore. "
message += "Example: clean-scraper scrape ca_san_diego_pd. "
message += f"Your supplied agency, {agency_slug}, has no state prefix."
logger.critical(message)

state = agency_slug[:2].strip().lower()
slug = agency_slug[3:].strip().lower()
state_mod = import_module(f"clean.{state}.{slug}")
# Run the scrape method
logger.info(f"Download files for {agency_slug}")
site = state_mod.Site(self.data_dir, self.cache_dir)
data_path = site.scrape(throttle=self.throttle, filter=filter)
# Run the path to the data file
logger.info(f"Generated {data_path}")
return data_path

def delete(self):
"""Delete the files in the output directories."""
logger.debug(f"Deleting files in {self.data_dir}")
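With `Runner.scrape` removed, the runner's public surface is effectively `scrape_meta` plus `delete`. A minimal sketch of driving it programmatically, mirroring the wiring the retained `scrape-meta` CLI command uses; the module path and positional constructor arguments are assumptions read off this diff, not documented API:

```python
from pathlib import Path

from clean.runner import Runner  # module path assumed from clean/runner.py above

# Arguments mirror the CLI's retained wiring: data_dir, cache_dir, throttle
runner = Runner(Path("./data"), Path("./cache"), 2)
metadata_path = runner.scrape_meta("ca_san_diego_pd")
print(f"Metadata JSON written to {metadata_path}")
```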
7 changes: 0 additions & 7 deletions docs/contributing.md
@@ -122,12 +122,6 @@ class Site:
# 2. Generate a metadata JSON file and store in the cache
# 3. Return the path to the metadata JSON
pass

def scrape(self, throttle: int = 0, filter: str = "") -> List[Path]:
# 1. Use the metadata JSON generated by `scrape_meta` to download available files
# to the cache/assets directory (once again, check out Cache.download).
# 2. Return a list of paths to downloaded files
pass
```

When creating a scraper, there are a few rules of thumb.
@@ -265,7 +259,6 @@ Options:

Commands:
list List all available agencies and their slugs.
scrape Command-line interface for downloading CLEAN files.
scrape-meta Command-line interface for generating metadata CSV about...
```
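For scraper authors, the contract in docs/contributing.md shrinks accordingly: a new agency module now only needs a `Site` class with `scrape_meta`. A hedged skeleton of that interface, with the constructor signature assumed from the `Site(self.data_dir, self.cache_dir)` call removed from runner.py above:

```python
from pathlib import Path


class Site:
    """Sketch of the interface a new scraper implements after this change."""

    def __init__(self, data_dir: Path, cache_dir: Path):
        self.data_dir = data_dir
        self.cache_dir = cache_dir

    def scrape_meta(self, throttle: int = 0) -> Path:
        # 1. Collect metadata on the agency's downloadable files
        # 2. Write a metadata JSON file to the cache
        # 3. Return the path to the metadata JSON
        raise NotImplementedError
```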

7 changes: 0 additions & 7 deletions tests/test_runner.py
@@ -16,10 +16,3 @@ def test_scrape_meta(runner):
runner.scrape_meta("ca_san_diego_pd")
# Assert that the scrape_meta method was called
mock_scrape_meta.assert_called_once_with(throttle=0)


def test_scrape(runner):
with patch("clean.ca.san_diego_pd.Site.scrape") as mock_scrape:
runner.scrape("ca_san_diego_pd")
# Assert that the scrape method was called
mock_scrape.assert_called_once_with(throttle=0, filter="")
