NextRequest #84 ; BART #96 ; LAPD #18 (#105)
Co-authored-by: Gerald Rich <[email protected]>
stucka and newsroomdev authored Oct 23, 2024
1 parent effda8e commit d603207
Showing 3 changed files with 333 additions and 4 deletions.
73 changes: 73 additions & 0 deletions clean/ca/bay_area_rapid_transit_pd.py
@@ -0,0 +1,73 @@
import logging
from pathlib import Path
from typing import Dict, List

from .. import utils
from ..cache import Cache
from ..platforms.nextrequest import process_nextrequest

# from ..utils import MetadataDict

logger = logging.getLogger(__name__)


class Site:
"""Scrape file metadata for the Bay Area Rapid Transit Police Department -- BART PD.
Attributes:
name (str): The official name of the agency
"""

name = "Bay Area Rapid Transit Police Department"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_bay_area_rapid_transit_pd"
self.base_url = "https://bart.nextrequest.com"
# The initial disclosure page (i.e., where the agency starts complying with the law) contains a list of "detail"/child pages
# linking to the SB 16/SB 1421/AB 748 videos and files, along with additional index pages
self.disclosure_url = "https://bart.nextrequest.com/requests/21-107"
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.cache = Cache(cache_dir)
for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

def scrape_meta(self, throttle: int = 2) -> Path:
"""Gather metadata on downloadable files (videos, etc.).
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 2.
Returns:
Path: Local path of JSON file containing metadata on downloadable files
"""
to_be_scraped: Dict = {
"https://bart.nextrequest.com/requests/21-107": True,
}

metadata: List = []

subpages_dir = self.subpages_dir

for start_url in to_be_scraped:
force = to_be_scraped[start_url]
local_metadata = process_nextrequest(
subpages_dir, start_url, force, throttle
)
metadata.extend(local_metadata)

json_filename = self.data_dir / (self.site_slug + ".json")
self.cache.write_json(json_filename, metadata)

return json_filename
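For orientation, here is a minimal driver sketch for the BART scraper above. The import path and the standalone invocation are assumptions for illustration (the project may normally run these Site classes through its own runner), not part of this commit:

# Illustrative only; assumes the package layout shown in this diff.
from clean.ca.bay_area_rapid_transit_pd import Site

site = Site()  # defaults to utils.CLEAN_DATA_DIR and utils.CLEAN_CACHE_DIR
json_path = site.scrape_meta(throttle=2)  # writes ca_bay_area_rapid_transit_pd.json
print(json_path)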
257 changes: 257 additions & 0 deletions clean/ca/los_angeles_pd.py
@@ -0,0 +1,257 @@
import logging
from pathlib import Path
from time import sleep
from typing import Dict, List, Set
from urllib.parse import unquote, urlparse

from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
from ..platforms.nextrequest import process_nextrequest

logger = logging.getLogger(__name__)


"""
To-do:
Not doing, as there's no persistence:
-- Track which subpage files have been read through the indexes, but also check whether any
subpage files were NOT indexed, and read those as well
"""


class Site:
"""Scrape file metadata for the Los Angeles Police Department -- LAPD.
Attributes:
name (str): The official name of the agency
"""

name = "Los Angeles Police Department"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.
Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_los_angeles_pd"
self.first_url = (
"https://www.lapdonline.org/senate-bill-1421-senate-bill-16-sb-16/"
)
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.indexes_dir = cache_dir / self.site_slug
self.cache = Cache(cache_dir)
self.rescrape_all_case_files = False # Do we need to rescrape all the subpages?

for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

self.detail_urls = self.indexes_dir / "url_details.json"
self.indexes_scraped = self.indexes_dir / "indexes-scraped.json"

# Build a list of URLs that should not be scraped
# FIXME: Remove in favor of Cosmos DB's existing list & its notification system
self.broken_urls = [
"https://lacity.nextrequest.com/documents?folder_filter=F009-01",
"https://lacity.nextrequest.com/documents?folder_filter=F050-20",
"https://lacity.nextrequest.com/documents?folder_filter=F025-15",
]

# Build a dict of URLs that need to be patched up
self.url_fixes = {
"https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/risk-management-division__trashed/sustained-complaints-of-unlawful-arrest-unlawful-search/": "https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/sustained-complaints-of-unlawful-arrest-unlawful-search/",
"F118-04 November 22, 2004": "https://lacity.nextrequest.com/documents?folder_filter=F118-04",
" https://lacity.nextrequest.com/documents?folder_filter=CF01-3445": "https://lacity.nextrequest.com/documents?folder_filter=CF01-3445",
}

def scrape_meta(self, throttle: int = 2) -> Path:
"""Gather metadata on downloadable files (videos, etc.).
Args:
throttle (int): Number of seconds to wait between requests. Defaults to 2.
Returns:
Path: Local path of JSON file containing metadata on downloadable files
"""
lookup = self.fetch_indexes(throttle)
json_filename, metadata = self.fetch_subpages(throttle)

logger.debug("Adding origin details to metadata")
for i, entry in enumerate(metadata):
if entry["case_id"] in lookup:
metadata[i]["details"]["bln_source"] = lookup[entry["case_id"]]
self.cache.write_json(json_filename, metadata)

return json_filename

def url_to_filename(self, url: str):
"""Turn a URL into a proposed filename."""
# We really really really need a slugify thing
path = urlparse(url).path
if path.startswith("/"):
path = path[1:]
if path.endswith("/"):
path = path[:-1]
path = path.replace("/", "_")
path += ".html"
return path
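A worked example of the filename slugging above (illustrative: the URL is the index page referenced in this file, the output is derived from the code rather than from a recorded run, and the import path is an assumption based on the file layout in this diff):

from clean.ca.los_angeles_pd import Site  # assumed import path

Site().url_to_filename("https://www.lapdonline.org/senate-bill-1421-senate-bill-16-sb-16/")
# -> "senate-bill-1421-senate-bill-16-sb-16.html"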

def clean_url(self, page_url, local_url):
"""Correct bad URLs.
Args:
page_url: The URL of the page that got us the link
local_url: The proposed URL we're trying to clean up
Returns:
Cleaned URL, with full domain and scheme as needed.
The URL is checked against the replacements in self.url_fixes (set in __init__).
"""
if local_url in self.url_fixes:
local_url = self.url_fixes[local_url]
if urlparse(local_url).netloc == "":
local_url = urlparse(page_url).netloc + local_url
if urlparse(local_url).scheme == "":
local_url = "https://" + local_url  # prepend a scheme so the result is an absolute URL
return local_url
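And a sketch of how clean_url repairs a relative link lifted from an index page, assuming the "https://" prefix fix above (values are illustrative, continuing the assumed import from the previous sketch):

Site().clean_url(
    "https://www.lapdonline.org/senate-bill-1421-senate-bill-16-sb-16/",
    "/office-of-the-chief-of-police/constitutional-policing/",
)
# -> "https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/"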

def fetch_indexes(self, throttle: int = 2):
"""Recursively download LAPD index pages to find subpage URLs.
Args:
throttle (int): Time to wait between requests
Returns:
lookup (dict): Supplemental data to add to metadata details
Writes:
url_details.json
indexes-scraped.json
"""
scraping_complete = False

detail_urls: Dict = {}
indexes_scraped: Dict = {}
indexes_todo: Set = set()
index_passes = 0

indexes_todo.add(self.first_url)

# Need to add sleep between calls

while not scraping_complete:
index_passes += 1
for page_url in list(
indexes_todo
): # work with a copy so we're not thrashing the original
filename = self.url_to_filename(page_url)
filename = self.indexes_dir / filename
indexes_scraped[page_url] = {
"subindexes": [],
"details": 0,
}
cleaned_page_url = self.clean_url(page_url, page_url)
logger.debug(f"Trying {cleaned_page_url}")
r = utils.get_url(cleaned_page_url)

self.cache.write_binary(filename, r.content)

sleep(throttle)

# Parse the page we just fetched and cached
soup = BeautifulSoup(r.content, features="html.parser")

page_title = soup.title
if page_title:
page_title = unquote(page_title.text.strip()) # type: ignore

content_divs = soup.findAll("div", {"class": "grid-content"})
content_divs.extend(soup.findAll("div", {"class": "link-box"}))
for content_div in content_divs:
links = content_div.findAll("a")
for link in links:
original_href = link["href"]
href = self.clean_url(page_url, original_href)
if urlparse(href).netloc.endswith(".nextrequest.com"):
if original_href in self.broken_urls:
logger.debug(f"Not scraping broken URL {original_href}")
else:
if href not in detail_urls:
detail_urls[href] = []
detail_urls[href].append(
{"page_title": page_title, "page_url": page_url}
)
indexes_scraped[page_url]["details"] += 1
else:
if original_href not in indexes_scraped:
indexes_todo.add(original_href)
indexes_scraped[page_url]["subindexes"].append(
original_href
)

for url in indexes_scraped:
if url in indexes_todo:
indexes_todo.remove(url)
if len(indexes_todo) == 0:
logger.debug(
f"Index scraping complete, after {len(indexes_scraped):,} indexes reviewed."
)
logger.debug(f"{len(detail_urls):,} case URLs found.")
scraping_complete = True
else:
logger.debug(
f"Index scraping pass {index_passes:,}: {len(indexes_scraped):,} indexes scraped, {len(detail_urls):,} case URLs found"
)

self.cache.write_json(self.detail_urls, detail_urls)

self.cache.write_json(self.indexes_scraped, indexes_scraped)

lookup: Dict = {}
for entry in detail_urls:
lookup[entry.split("=")[-1]] = detail_urls[entry]

return lookup
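The lookup returned here keys each NextRequest folder ID (the value after folder_filter= in the case URL) to the index pages that linked to it; roughly, with illustrative values:

# e.g. "https://lacity.nextrequest.com/documents?folder_filter=F118-04" yields the key "F118-04"
lookup = {
    "F118-04": [
        {"page_title": "Example LAPD index page", "page_url": "https://www.lapdonline.org/example-index/"},
    ],
}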

def fetch_subpages(self, throttle):
"""Download all subpage URLs as needed; parse all pages.
Args:
throttle: Time to wait between requests
Notes:
self.rescrape_all_case_files decides whether already cached subpage files should be re-downloaded
Returns:
Filename of JSON metadata
Metadata
"""
# Determine whether everything needs to be rescraped
force = self.rescrape_all_case_files

detail_urls = self.cache.read_json(self.detail_urls)

# Let's not do anything but reads to detail_urls
to_be_scraped: Dict = {}
for detail_url in detail_urls.keys():
to_be_scraped[detail_url] = force

metadata: List = []

subpages_dir = self.subpages_dir

for start_url in to_be_scraped:
force = to_be_scraped[start_url]
local_metadata = process_nextrequest(
subpages_dir, start_url, force, throttle
)
metadata.extend(local_metadata)

json_filename = self.data_dir / (self.site_slug + ".json")
self.cache.write_json(json_filename, metadata)
return json_filename, metadata
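Putting the two passes together: fetch_indexes records which LAPD index page pointed at each NextRequest folder, fetch_subpages pulls the per-file metadata via process_nextrequest, and scrape_meta stitches the provenance into details["bln_source"]. A rough sketch of one resulting entry is below; apart from case_id, details, and bln_source, the field names are assumptions, since the NextRequest metadata schema isn't shown in this diff:

example_entry = {
    "case_id": "F118-04",  # folder ID parsed from the NextRequest URL
    "asset_url": "https://lacity.nextrequest.com/...",  # assumed field name, placeholder URL
    "details": {
        "bln_source": [
            {"page_title": "Example LAPD index page", "page_url": "https://www.lapdonline.org/example-index/"},
        ],
    },
}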
7 changes: 3 additions & 4 deletions clean/platforms/nextrequest.py
@@ -277,19 +277,18 @@ def fingerprint_nextrequest(start_url: str):
"""
line = None
parsed_url = urlparse(start_url)
folder_id = parse_qs(parsed_url.query)["folder_filter"][0]
if parsed_url.path == "/documents": # LAPDish type
base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
line = {
"site_type": "lapdish", # LAPDish type
"base_url": base_url,
"folder_id": folder_id,
"folder_id": parse_qs(parsed_url.query)["folder_filter"][0],
"page_size": 50,
"doc_limit": 9950, # Max number of accessible docs in a folder
"tally_field": "total_count",
"bln_page_url": "bln_page_url",
"bln_total_documents": "bln_total_documents",
"json_url": f"{base_url}/client/documents?sort_field=count&sort_order=desc&page_size=50&folder_filter={folder_id}&page_number=", # type: ignore
"json_url": f"{base_url}/client/documents?sort_field=count&sort_order=desc&page_size=50&folder_filter={line['folder_id']}&page_number=", # type: ignore
"details": {
"document_path": "document_path",
"description": "description",
@@ -354,4 +353,4 @@ def fingerprint_nextrequest(start_url: str):


def find_max_pages(item_count: int, page_size: int):
return ceil(item_count / page_size) # type: ignore
return ceil(item_count, page_size) # type: ignore
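For reference, the paging arithmetic this helper is meant to perform: with the lapdish fingerprint's page_size of 50 and doc_limit of 9,950, the page count works out as below (a minimal sketch assuming the single-argument math.ceil form, i.e. ceil(item_count / page_size)):

from math import ceil

# 9,950 documents at 50 per page -> 199 pages; 101 documents -> 3 pages
assert ceil(9950 / 50) == 199
assert ceil(101 / 50) == 3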
