
docs: add download_agency notes #141

Open
wants to merge 1 commit into dev
Conversation

newsroomdev
Member

Description

To facilitate one-off runs on Berkeley's servers and confirm files are downloading as expected, I've added a runner and CLI method to the library that reads each record's asset_url and saves the file locally. Rather than creating separate scrape methods per agency, this centralizes the download logic in runner.py. Note: this logic satisfies basic one-off scraper needs; additional platform-specific download logic may still be required, cf. Laserfiche #50
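For context, a one-off run would look roughly like the sketch below. The import path and the Runner constructor arguments are inferred from the CLI excerpt further down, so treat the exact signatures as assumptions rather than the final API.

from pathlib import Path

from clean.runner import Runner  # assumed import path; see the CLI excerpt below

# Directory layout mirrors the CLI defaults; hard-coded here purely for illustration
data_dir = Path.home() / ".clean-scraper/data"
cache_dir = Path.home() / ".clean-scraper/cache"
assets_dir = Path.home() / ".clean-scraper/assets"

# Assumes a prior scrape has written ~/.clean-scraper/exports/ca_san_diego_pd.json
runner = Runner(data_dir, cache_dir, assets_dir, throttle=0)
download_dir = runner.download_agency("ca_san_diego_pd")
print(f"Assets saved under {download_dir}")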

def download_agency(self, agency_slug: str) -> Path:
    """Download files for the provided agency.

    Args:
        agency_slug (str): Unique scraper slug composed of two-letter state postal code and agency slug, e.g. ca_san_diego_pd

    Returns:
        Path: the directory the assets were downloaded to.
    """
    state, slug = self._validate_agency_slug(agency_slug)
    # Define the path to the JSON metadata file produced by a prior scrape
    json_path = Path.home() / f".clean-scraper/exports/{agency_slug}.json"
    # Load the JSON file
    with open(json_path) as f:
        data = json.load(f)
    # Create the download directory if it doesn't exist
    download_dir = self.assets_dir / f"case_files/{slug}"
    download_dir.mkdir(parents=True, exist_ok=True)
    # Download each asset
    for index, item in enumerate(data):
        asset_url = item.get("asset_url")
        if asset_url:
            current_date = datetime.now().strftime("%Y%m%d")
            local_filepath = (
                download_dir
                / f"{current_date}/assets/{item.get('case_id')}/{item.get('name')}.pdf"
            )
            local_filepath.parent.mkdir(parents=True, exist_ok=True)
            try:
                response = requests.get(
                    asset_url,
                    headers={"User-Agent": "Big Local News (biglocalnews.org)"},
                )
                response.raise_for_status()  # Check for request errors
                with open(local_filepath, "wb") as file:
                    file.write(response.content)
                logger.info(f"Downloaded {asset_url} to {local_filepath}")
                logger.info(f"asset_url {index} / {len(data)}")
            except Exception as e:
                logger.error(f"Failed to download asset {asset_url}: {e}")
    return download_dir
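The method assumes an exports file shaped like the sketch below: a JSON list of asset records, each carrying at least the keys the loop reads (asset_url, case_id, name). The schema and the example values here are inferred from the excerpt above, not from a published spec.

import json
from pathlib import Path

# Minimal example of the metadata file download_agency reads, based on the keys used above
records = [
    {
        "asset_url": "https://example.com/files/report_0001.pdf",  # hypothetical URL
        "case_id": "2023-0001",
        "name": "report_0001",
    },
]

export_path = Path.home() / ".clean-scraper/exports/ca_san_diego_pd.json"
export_path.parent.mkdir(parents=True, exist_ok=True)
with open(export_path, "w") as f:
    json.dump(records, f, indent=2)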

clean-scraper/clean/cli.py, lines 110 to 169 at 4f9211d:

@click.command()
@click.argument("agency")
@click.option(
    "--data-dir",
    default=utils.CLEAN_DATA_DIR,
    type=click.Path(),
    help="The Path where the results will be saved",
)
@click.option(
    "--cache-dir",
    default=utils.CLEAN_CACHE_DIR,
    type=click.Path(),
    help="The Path where results can be cached",
)
@click.option(
    "--assets-dir",
    default=utils.CLEAN_ASSETS_DIR,
    type=click.Path(),
    help="The Path where assets will be saved",
)
@click.option(
    "--throttle",
    "-t",
    default=0,
    # Note: this option is assumed here to back the `throttle` parameter below;
    # it was not visible in the original excerpt
    help="Seconds to wait between requests",
)
@click.option(
    "--log-level",
    "-l",
    default="INFO",
    type=click.Choice(
        ("DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"), case_sensitive=False
    ),
    help="Set the logging level",
)
def download_agency(
    agency: str,
    data_dir: Path,
    cache_dir: Path,
    assets_dir: Path,
    log_level: str,
    throttle: int,
):
    """
    Command-line interface for downloading files from a CLEAN agency.

    AGENCY -- An agency slug (e.g. ca_san_diego_pd)

    Use the 'list' command to see available agencies and their slugs.

        clean-scraper list
    """
    # Set higher log-level on third-party libs that use DEBUG logging,
    # in order to limit debug logging to our library
    logging.getLogger("urllib3").setLevel(logging.ERROR)
    # Local logging config
    logging.basicConfig(level=log_level, format="%(asctime)s - %(name)s - %(message)s")
    # Runner config
    data_dir = Path(data_dir)
    cache_dir = Path(cache_dir)
    runner = Runner(data_dir, cache_dir, assets_dir, throttle)
    # Try running the scraper
    runner.download_agency(agency)
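To sanity-check the command without shelling out, click's test runner can invoke it in-process. The sketch below assumes the command is importable from clean.cli and that the option names match the decorators above; the agency slug and paths are just examples.

from click.testing import CliRunner

from clean.cli import download_agency  # assumed module path for the command above

runner = CliRunner()
# Invoke with an agency slug and an explicit assets directory; options mirror the decorators above
result = runner.invoke(
    download_agency,
    ["ca_san_diego_pd", "--assets-dir", "/tmp/clean-assets", "--log-level", "DEBUG"],
)
print(result.exit_code)
print(result.output)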

Summary of Changes

  • Updates usage.md
  • Updates contributing.md

Related Issues

This should reduce the number of steps required for pull request approval. Users can SSH into Berkeley's server and initiate downloads.

How to Review

Please review the documentation. A separate PR can contain additional code changes to the download logic. This PR is meant to ensure the documentation is up to date and contributors are aware of the new pattern.

cc @naumansharifwork
