refactor: deprecate scrape method #82

Merged 2 commits on Aug 19, 2024
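This PR removes the per-agency `scrape` methods, the `scrape` CLI subcommand, and `Runner.scrape`; only `scrape_meta`, which writes a per-agency metadata JSON file, remains. For anyone migrating off the deprecated path, here is a minimal sketch of how the removed download loop could be reproduced from that JSON, assuming the record layout shown in the diffs below (`asset_url`, `case_id`, `name`); the function name, paths, and the stdlib downloader (a stand-in for the project's `Cache.download` helper) are illustrative only.

```python
# Sketch only: re-creates the removed scrape() loop from a scrape_meta JSON
# file. The project itself used Cache.download; urllib is a stand-in so the
# example stays self-contained.
import json
import time
import urllib.request
from pathlib import Path


def download_assets(
    metadata_json: Path,
    assets_dir: Path,
    throttle: int = 4,
    url_filter: str = "",
) -> list[Path]:
    downloaded = []
    metadata = json.loads(metadata_json.read_text())
    for asset in metadata:
        url = asset["asset_url"]
        # Skip non-matching URLs when a filter is supplied, as the removed
        # scrape() methods did.
        if url_filter and url_filter not in url:
            continue
        dl_path = assets_dir / asset["case_id"] / asset["name"].replace(" ", "_")
        dl_path.parent.mkdir(parents=True, exist_ok=True)
        time.sleep(throttle)
        urllib.request.urlretrieve(url, dl_path)
        downloaded.append(dl_path)
    return downloaded
```

Called as `download_assets(Path("ca_san_diego_pd.json"), Path("assets"))`, this mirrors the throttled, per-asset downloads the deleted methods performed.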
21 changes: 4 additions & 17 deletions clean/ca/humboldt_pd.py
@@ -1,6 +1,5 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup, Tag

@@ -93,7 +92,7 @@ def _get_asset_links(self, pages, parent_page) -> list:
name = link.string
payload = {
"title": title,
"case_number": name,
"case_id": name,
"parent_page": str(parent_page),
"asset_url": f"{'https://humboldtgov.org'}{href}",
"name": name,
@@ -108,12 +107,12 @@ def _get_asset_links(self, pages, parent_page) -> list:
if soup.title and isinstance(soup.title.string, str)
else None
)
case_number = page["page_name"].split("/")[-1].split("_")[0]
case_id = page["page_name"].split("/")[-1].split("_")[0]
header = soup.find("h1")
name = header.get_text(strip=True) if header else None
payload = {
"title": title,
"case_number": case_number,
"case_id": case_id,
"parent_page": str(parent_page),
"download_page": str(page["page_name"]),
"asset_url": f"https://humboldtgov.nextrequest.com{link['href']}",
@@ -122,20 +121,8 @@ def _get_asset_links(self, pages, parent_page) -> list:
metadata.append(payload)
return metadata

def scrape(self, throttle: int = 4, filter: str = "") -> List[Path]:
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
dl_assets = []
for asset in metadata:
url = asset["asset_url"]
dl_path = self._make_download_path(asset)
time.sleep(throttle)
dl_assets.append(self.cache.download(str(dl_path), url))
return dl_assets

def _make_download_path(self, asset):
folder_name = asset["case_number"]
folder_name = asset["case_id"]
name = asset["name"]
# If name has no extension, mark it as pdf as it's a document format per metadata
if len(name.split(".")) == 1:
20 changes: 0 additions & 20 deletions clean/ca/orange_county_sheriff.py
@@ -1,6 +1,4 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup

@@ -32,24 +30,6 @@ def scrape_meta(self, throttle: int = 0) -> Path:
downloadable_files = self._create_json()
return downloadable_files

def scrape(self, throttle: int = 0, filter: str = "") -> List[Path]:
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
downloaded_assets = []
for asset in metadata:
url = asset["asset_url"]
if filter and filter not in url:
continue
index_dir = (
asset["parent_page"].split(f"{self.agency_slug}/")[-1].rstrip(".html")
)
asset_name = asset["name"].replace(" ", "_")
download_path = Path(self.agency_slug, "assets", index_dir, asset_name)
time.sleep(throttle)
downloaded_assets.append(self.cache.download(str(download_path), url))
return downloaded_assets

def _create_json(self) -> Path:
metadata = []
file_stem = self.disclosure_url.split("/")[-1]
2 changes: 1 addition & 1 deletion clean/ca/sacramento_pd.py
@@ -19,7 +19,7 @@

class Site:
"""
Scrape file metadata and download files for the Sacramento Police Department for SB16/SB1421/AB748 data.
Scrape file metadata and asset_urls for the Sacramento Police Department for SB16/SB1421/AB748 data.

Attributes:
name (str): The official name of the agency
30 changes: 0 additions & 30 deletions clean/ca/san_diego_pd.py
@@ -71,36 +71,6 @@ def scrape_meta(self, throttle: int = 0) -> Path:
downloadable_files = self._get_asset_links()
return downloadable_files

def scrape(self, throttle: int = 0, filter: str = "") -> List[Path]:
"""Download file assets from agency.

Args:
throttle (int): Number of seconds to wait between requests. Defaults to 0.
filter (str): Only download URLs that match the filter. Defaults to None.

Returns:
List[Path]: List of local paths to downloaded files
"""
# Get metadata on downloadable files
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
downloaded_assets = []
for asset in metadata:
url = asset["asset_url"]
# Skip non-matching files if filter applied
if filter and filter not in url:
continue
# Get relative path to parent index_page directory
index_dir = asset["case_id"]
asset_name = asset["name"].replace(" ", "_")
download_path = Path(self.agency_slug, "assets", index_dir, asset_name)
# Download the file to agency directory/assets/index_page_dir/case_name/file_name
# Example: 'ca_san_diego_pd/assets/sb16-sb1421-ab748/11-21-2022_IA_2022-013/November_21,_2022_IA_#2022-013_Audio_Interview_Complainant_Redacted_KM.wav'
time.sleep(throttle)
downloaded_assets.append(self.cache.download(str(download_path), url))
return downloaded_assets

# Helper functions
def _get_asset_links(self) -> Path:
"""Extract link to files and videos from child pages."""
13 changes: 0 additions & 13 deletions clean/ca/santa_rosa.py
@@ -1,6 +1,5 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup

@@ -69,18 +68,6 @@ def scrape_meta(self, throttle=0):
self.cache.write_json(outfile, metadata)
return outfile

def scrape(self, throttle: int = 4, filter: str = "") -> List[Path]:
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
dl_assets = []
for asset in metadata:
url = asset["asset_url"]
dl_path = self._make_download_path(asset)
time.sleep(throttle)
dl_assets.append(self.cache.download(str(dl_path), url))
return dl_assets

def _make_download_path(self, asset):
url = asset["asset_url"]
# If name ends in `pdf?dl=1`, handle one way
14 changes: 0 additions & 14 deletions clean/ca/sonoma_county_sheriff.py
@@ -1,6 +1,4 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup

@@ -64,18 +62,6 @@ def scrape_meta(self, throttle=0):
self.cache.write_json(outfile, metadata)
return outfile

def scrape(self, throttle: int = 4, filter: str = "") -> List[Path]:
metadata = self.cache.read_json(
self.data_dir.joinpath(f"{self.agency_slug}.json")
)
dl_assets = []
for asset in metadata:
url = asset["asset_url"]
dl_path = self._make_download_path(asset)
time.sleep(throttle)
dl_assets.append(self.cache.download(str(dl_path), url))
return dl_assets

def _make_download_path(self, asset):
# TODO: Update the logic to gracefully handle PDFs in addition to zip files
url = asset["asset_url"]
91 changes: 3 additions & 88 deletions clean/cli.py
@@ -16,17 +16,16 @@ def cli():
def list_agencies():
"""List all available agencies and their slugs.

Agency slugs can then be used with the scrape-meta and scrape subcommands
Agency slugs can then be used with the scrape-meta subcommand
"""
for state, agencies in utils.get_all_scrapers().items():
click.echo(f"{state.upper()}:")
for record in sorted(agencies, key=lambda x: x["slug"]):
click.echo(f" - {record['slug']} ({record['agency']})")
message = (
"\nTo scrape an agency's file metadata or download files, pass an "
"agency slug (e.g. ca_san_diego_pd) as the argument to the scrape-meta or scrape subcommands: \n\n"
"\nTo scrape an agency's file metadata, pass an "
"agency slug (e.g. ca_san_diego_pd) as the argument to the scrape-meta subcommand: \n\n"
"\tclean-scraper scrape-meta ca_san_diego_pd\n"
"\tclean-scraper scrape ca_san_diego_pd\n"
)
click.echo(message)

@@ -108,92 +107,8 @@ def scrape_meta(
runner.scrape_meta(agency)


@click.command()
@click.argument("agency")
@click.option(
"--data-dir",
default=utils.CLEAN_DATA_DIR,
type=click.Path(),
help="The Path were generated data/intermediate files will be saved",
)
@click.option(
"--cache-dir",
default=utils.CLEAN_CACHE_DIR,
type=click.Path(),
help="The Path where results can be cached",
)
@click.option(
"--filter",
"-f",
default="",
type=str,
help="Only download files that match a filter str",
)
@click.option(
"--delete/--no-delete",
default=False,
help="Delete generated files from the cache",
)
@click.option(
"--log-level",
"-l",
default="INFO",
type=click.Choice(
("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"), case_sensitive=False
),
help="Set the logging level",
)
@click.option(
"--throttle",
"-t",
default=0,
help="Set throttle on scraping in seconds. Default is no delay on file downloads.",
)
def scrape(
agency: str,
data_dir: Path,
cache_dir: Path,
filter: str,
delete: bool,
log_level: str,
throttle: int,
):
"""
Command-line interface for downloading CLEAN files.

AGENCY -- An agency slug (e.g. ca_san_diego_pd) to scrape.

Use the 'list' command to see available agencies and their slugs.

clean-scraper list

The 'scrape-meta' command must be run first to generate a JSON file containing metadata on downloadable files.
"""
# Set higher log-level on third-party libs that use DEBUG logging,
# In order to limit debug logging to our library
logging.getLogger("urllib3").setLevel(logging.ERROR)

# Local logging config
logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(message)s")
logger = logging.getLogger(__name__)

# Runner config
data_dir = Path(data_dir)
cache_dir = Path(cache_dir)
runner = Runner(data_dir, cache_dir, throttle)

# Delete files, if asked
if delete:
logger.info("Deleting files generated from previous scraper run.")
runner.delete()

# Try running the scraper
runner.scrape(agency, filter=filter)


cli.add_command(list_agencies)
cli.add_command(scrape_meta)
cli.add_command(scrape)

if __name__ == "__main__":
cli()
29 changes: 0 additions & 29 deletions clean/runner.py
@@ -65,35 +65,6 @@ def scrape_meta(self, agency_slug: str) -> Path:
logger.info(f"Generated {data_path}")
return data_path

def scrape(self, agency_slug: str, filter: str = "") -> Path:
"""Run the scraper for the provided agency.

This method will operate on the metadata JSON file generated by the scrape_meta method.

Args:
agency_slug (str): Unique scraper slug composed of two-letter state postal code and agency slug: e.g. ca_san_diego_pd
filter (str): Filter to limit which files are downloaded. Defaults to None.

Returns: a Path object leading to directory where downloaded files are stored.
"""
# Get the module
if agency_slug[2] != "_":
message = "Scraper slugs must be prefixed with the state postal code and an underscore. "
message += "Example: clean-scraper scrape ca_san_diego_pd. "
message += f"Your supplied agency, {agency_slug}, has no state prefix."
logger.critical(message)

state = agency_slug[:2].strip().lower()
slug = agency_slug[3:].strip().lower()
state_mod = import_module(f"clean.{state}.{slug}")
# Run the scrape method
logger.info(f"Download files for {agency_slug}")
site = state_mod.Site(self.data_dir, self.cache_dir)
data_path = site.scrape(throttle=self.throttle, filter=filter)
# Return the path to the data file
logger.info(f"Generated {data_path}")
return data_path

def delete(self):
"""Delete the files in the output directories."""
logger.debug(f"Deleting files in {self.data_dir}")
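With `Runner.scrape` gone, `Runner.scrape_meta` is the remaining programmatic entry point. A minimal usage sketch, assuming `Runner` is importable from `clean.runner` (the module changed above) and takes the `data_dir`, `cache_dir`, and `throttle` arguments shown in the removed CLI wiring; the directories are illustrative.

```python
# Usage sketch (assumptions: Runner is exposed by clean.runner and accepts
# data_dir, cache_dir and throttle positionally, as in the removed CLI code).
from pathlib import Path

from clean.runner import Runner

runner = Runner(Path("./data"), Path("./cache"), 2)  # 2-second throttle
metadata_path = runner.scrape_meta("ca_san_diego_pd")
print(f"Metadata written to {metadata_path}")
```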
7 changes: 0 additions & 7 deletions docs/contributing.md
@@ -122,12 +122,6 @@ class Site:
# 2. Generate a metadata JSON file and store in the cache
# 3. Return the path to the metadata JSON
pass

def scrape(self, throttle: int = 0, filter: str = "") -> List[Path]:
# 1. Use the metadata JSON generated by `scrape_meta` to download available files
# to the cache/assets directory (once again, check out Cache.download).
# 2. Return a list of paths to downloaded files
pass
```

When creating a scraper, there are a few rules of thumb.
@@ -265,7 +259,6 @@ Options:

Commands:
list List all available agencies and their slugs.
scrape Command-line interface for downloading CLEAN files.
scrape-meta Command-line interface for generating metadata CSV about...
```

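With `scrape` dropped from the contributing guide's `Site` example, `scrape_meta` is the scraping method new scrapers need to implement. Below is a minimal, self-contained sketch of that contract, with stdlib JSON writing standing in for the project's `Cache.write_json` helper; the class name, agency slug, and record fields are illustrative.

```python
# Sketch of the scrape_meta contract: gather metadata, persist it as JSON,
# return the path. Real scrapers use the Cache helper rather than the stdlib
# calls shown here; slug and record values below are placeholders.
import json
from pathlib import Path


class ExampleSite:
    def __init__(self, data_dir: Path, cache_dir: Path):
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.agency_slug = "ca_example_pd"  # hypothetical slug

    def scrape_meta(self, throttle: int = 0) -> Path:
        # 1. Crawl the agency's disclosure pages (omitted; throttle would
        #    space out requests in a real scraper).
        metadata = [
            {
                "title": "Example case",
                "case_id": "2024-001",
                "asset_url": "https://example.com/files/report.pdf",
                "name": "report.pdf",
            }
        ]
        # 2. Write the metadata JSON file into the data directory.
        self.data_dir.mkdir(parents=True, exist_ok=True)
        outfile = self.data_dir / f"{self.agency_slug}.json"
        outfile.write_text(json.dumps(metadata, indent=2))
        # 3. Return the path to the metadata JSON.
        return outfile
```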
7 changes: 0 additions & 7 deletions tests/test_runner.py
@@ -16,10 +16,3 @@ def test_scrape_meta(runner):
runner.scrape_meta("ca_san_diego_pd")
# Assert that the scrape_meta method was called
mock_scrape_meta.assert_called_once_with(throttle=0)


def test_scrape(runner):
with patch("clean.ca.san_diego_pd.Site.scrape") as mock_scrape:
runner.scrape("ca_san_diego_pd")
# Assert that the scrape method was called
mock_scrape.assert_called_once_with(throttle=0, filter="")