NextRequest #84 ; BART #96 ; LAPD #18 #105

Merged
merged 47 commits
Oct 23, 2024
Changes from all commits
Commits
47 commits
30ade29
Initial commit
stucka Aug 23, 2024
d6fc85a
f mypy
stucka Aug 27, 2024
93a807c
...
stucka Aug 27, 2024
56090b2
Prove the concept
stucka Aug 27, 2024
159dca3
mypy tweak
stucka Aug 27, 2024
06aeb3c
Stub out fingerprinter
stucka Aug 27, 2024
6698827
Start fingerprinting
stucka Aug 27, 2024
3b06e75
Draft fingerprinting
stucka Aug 28, 2024
147537e
rework fetch to use new schema; draft plan for pagination
stucka Aug 28, 2024
a650f82
Don't remember
stucka Aug 30, 2024
b3ad241
Build out max page calculator
stucka Sep 3, 2024
9cc7080
Draft out pagination
stucka Sep 3, 2024
c8eef48
Scraper on lapdish still works; need much more fingerprinting-parser …
stucka Sep 4, 2024
e3832e8
Modularize toward multiple versions
stucka Sep 5, 2024
ece8231
...
stucka Sep 5, 2024
0c21132
Fix some refs
stucka Sep 5, 2024
873b03d
Bad range
stucka Sep 5, 2024
dd4009d
Sort out subkey scraping problem
stucka Sep 5, 2024
ac16bb0
asset_url is not an asset_url
stucka Sep 6, 2024
26548d6
Monkeypatch NextRequest ID check
stucka Sep 6, 2024
84692e5
Working implementations
stucka Sep 6, 2024
7dbcd1c
Update to-do list
stucka Sep 6, 2024
63e4014
Improve output, update to-do list
stucka Sep 6, 2024
d494d13
Scraping subsequent pages works better if you scrape subsequent pages
stucka Sep 6, 2024
e5bbe1f
Find relative path; add in some BLN variables
stucka Sep 6, 2024
3867f2c
Grab allllll the metadata from our examples
stucka Sep 6, 2024
e43f35d
Initial commit
stucka Sep 9, 2024
9e69ba7
Nix old staging file
stucka Sep 9, 2024
fde9594
Use utils vs. requests
stucka Sep 9, 2024
ad2cc11
Don't rememb
stucka Sep 9, 2024
9886ff1
Incremental work
stucka Sep 9, 2024
b598f75
Initial commit
stucka Sep 13, 2024
e3a0420
Gettin' classy
stucka Sep 13, 2024
85edb6d
Build out subpage scraper
stucka Sep 13, 2024
125939a
Throw warning at 200-page mark
stucka Sep 13, 2024
b1f6623
Buncha fixes
stucka Sep 13, 2024
cab8912
Remove prototypes
stucka Sep 13, 2024
04f0e73
...
stucka Sep 13, 2024
aff5920
Add doc limit to to-do list
stucka Sep 19, 2024
dc22a23
Draft out oversized folder error
stucka Sep 19, 2024
6766556
Clean up mypy BS. Force checks for files that may not exist
stucka Sep 20, 2024
a2779ea
Documentation and cleanup
stucka Sep 20, 2024
13e18f3
Patch weird security flag
stucka Sep 20, 2024
b86531a
Update to-do list
stucka Sep 20, 2024
dc9562c
Streamline fingerprinting; use math.ceil
stucka Sep 23, 2024
6945996
Merge branch 'dev' into platform-84
newsroomdev Oct 23, 2024
fd6fc7d
Update clean/ca/los_angeles_pd.py
newsroomdev Oct 23, 2024
73 changes: 73 additions & 0 deletions clean/ca/bay_area_rapid_transit_pd.py
@@ -0,0 +1,73 @@
import logging
from pathlib import Path
from typing import Dict, List

from .. import utils
from ..cache import Cache
from ..platforms.nextrequest import process_nextrequest

# from ..utils import MetadataDict

logger = logging.getLogger(__name__)


class Site:
"""Scrape file metadata for the Bay Area Rapid Transit Police Department -- BART PD.

Attributes:
name (str): The official name of the agency
"""

name = "Bay Area Rapid Transit Police Department"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.

Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_bay_area_rapid_transit_pd"
self.base_url = "https://bart.nextrequest.com"
# Initial disclosure page (aka where they start complying with law) contains list of "detail"/child pages with links to the SB16/SB1421/AB748 videos and files
# along with additional index pages
self.disclosure_url = "https://bart.nextrequest.com/requests/21-107"
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.cache = Cache(cache_dir)
for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

    def scrape_meta(self, throttle: int = 2) -> Path:
        """Gather metadata on downloadable files (videos, etc.).

        Args:
            throttle (int): Number of seconds to wait between requests. Defaults to 2.

        Returns:
            Path: Local path of JSON file containing metadata on downloadable files
        """
        to_be_scraped: Dict = {
            "https://bart.nextrequest.com/requests/21-107": True,
        }

        metadata: List = []

        subpages_dir = self.subpages_dir

        for start_url in to_be_scraped:
            force = to_be_scraped[start_url]
            local_metadata = process_nextrequest(
                subpages_dir, start_url, force, throttle
            )
            metadata.extend(local_metadata)

        json_filename = self.data_dir / (self.site_slug + ".json")
        self.cache.write_json(json_filename, metadata)

        return json_filename
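For context, a minimal usage sketch of the new BART scraper (not part of the PR), assuming the top-level package is named clean as the relative imports above suggest, and using the default data/cache directories:

# Hypothetical example -- not in the PR. Runs the BART metadata scrape end to end.
from clean.ca.bay_area_rapid_transit_pd import Site

site = Site()  # uses utils.CLEAN_DATA_DIR and utils.CLEAN_CACHE_DIR by default
json_path = site.scrape_meta(throttle=2)  # waits ~2 seconds between requests
print(json_path)  # .../ca_bay_area_rapid_transit_pd.json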
257 changes: 257 additions & 0 deletions clean/ca/los_angeles_pd.py
@@ -0,0 +1,257 @@
import logging
from pathlib import Path
from time import sleep
from typing import Dict, List, Set
from urllib.parse import unquote, urlparse

from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
from ..platforms.nextrequest import process_nextrequest

logger = logging.getLogger(__name__)


"""
To-do:

Not doing -- as there's no persistence:
-- Track which subpage files have been read through the indexes, but let's also check to see if any
subpage files were NOT indexed and read them
"""


class Site:
"""Scrape file metadata for the Los Angeles Police Department -- LAPD.

Attributes:
name (str): The official name of the agency
"""

name = "Los Angeles Police Department"

def __init__(
self,
data_dir: Path = utils.CLEAN_DATA_DIR,
cache_dir: Path = utils.CLEAN_CACHE_DIR,
):
"""Initialize a new instance.

Args:
data_dir (Path): The directory where downstream processed files/data will be saved
cache_dir (Path): The directory where files will be cached
"""
self.site_slug = "ca_los_angeles_pd"
self.first_url = (
"https://www.lapdonline.org/senate-bill-1421-senate-bill-16-sb-16/"
)
self.data_dir = data_dir
self.cache_dir = cache_dir
self.subpages_dir = cache_dir / (self.site_slug + "/subpages")
self.indexes_dir = cache_dir / self.site_slug
self.cache = Cache(cache_dir)
self.rescrape_all_case_files = False # Do we need to rescrape all the subpages?

for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
utils.create_directory(localdir)

self.detail_urls = self.indexes_dir / "url_details.json"
self.indexes_scraped = self.indexes_dir / "indexes-scraped.json"

# Build a list of URLs that should not be scraped
        # FIXME: Remove in favor of Cosmos DB's existing list & its notification system
        self.broken_urls = [
            "https://lacity.nextrequest.com/documents?folder_filter=F009-01",
            "https://lacity.nextrequest.com/documents?folder_filter=F050-20",
            "https://lacity.nextrequest.com/documents?folder_filter=F025-15",
        ]

        # Build a dict of URLs that need to be patched up
        self.url_fixes = {
            "https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/risk-management-division__trashed/sustained-complaints-of-unlawful-arrest-unlawful-search/": "https://www.lapdonline.org/office-of-the-chief-of-police/constitutional-policing/sustained-complaints-of-unlawful-arrest-unlawful-search/",
            "F118-04 November 22, 2004": "https://lacity.nextrequest.com/documents?folder_filter=F118-04",
            " https://lacity.nextrequest.com/documents?folder_filter=CF01-3445": "https://lacity.nextrequest.com/documents?folder_filter=CF01-3445",
        }

    def scrape_meta(self, throttle: int = 2) -> Path:
        """Gather metadata on downloadable files (videos, etc.).

        Args:
            throttle (int): Number of seconds to wait between requests. Defaults to 2.

        Returns:
            Path: Local path of JSON file containing metadata on downloadable files
        """
        lookup = self.fetch_indexes(throttle)
        json_filename, metadata = self.fetch_subpages(throttle)

        logger.debug("Adding origin details to metadata")
        for i, entry in enumerate(metadata):
            if entry["case_id"] in lookup:
                metadata[i]["details"]["bln_source"] = lookup[entry["case_id"]]
        self.cache.write_json(json_filename, metadata)

        return json_filename

    def url_to_filename(self, url: str):
        """Turn a URL into a proposed filename."""
        # We really really really need a slugify thing
        path = urlparse(url).path
        if path.startswith("/"):
            path = path[1:]
        if path.endswith("/"):
            path = path[:-1]
        path = path.replace("/", "_")
        path += ".html"
        return path

    def clean_url(self, page_url, local_url):
        """Correct bad URLs.

        Args:
            page_url: The URL of the page that got us the link
            local_url: The proposed URL we're trying to clean up
        Returns:
            Cleaned URL, with full domain and scheme as needed.
            The URL is checked against data defined in __init__ for replacement.
        """
        if local_url in self.url_fixes:
            local_url = self.url_fixes[local_url]
        if urlparse(local_url).netloc == "":
            local_url = urlparse(page_url).netloc + local_url
        if urlparse(local_url).scheme == "":
            local_url = "https://" + local_url
        return local_url

    def fetch_indexes(self, throttle: int = 2):
        """Recursively download LAPD index pages to find subpage URLs.

        Args:
            throttle (int): Time to wait between requests
        Returns:
            lookup (dict): Supplemental data to add to metadata details
        Writes:
            url_details.json
            indexes-scraped.json
        """
        scraping_complete = False

        detail_urls: Dict = {}
        indexes_scraped: Dict = {}
        indexes_todo: Set = set()
        index_passes = 0

        indexes_todo.add(self.first_url)

        # Need to add sleep between calls

        while not scraping_complete:
            index_passes += 1
            for page_url in list(
                indexes_todo
            ):  # work with a copy so we're not thrashing the original
                filename = self.url_to_filename(page_url)
                filename = self.indexes_dir / filename
                indexes_scraped[page_url] = {
                    "subindexes": [],
                    "details": 0,
                }
                cleaned_page_url = self.clean_url(page_url, page_url)
                logger.debug(f"Trying {cleaned_page_url}")
                r = utils.get_url(cleaned_page_url)

                self.cache.write_binary(filename, r.content)

                sleep(throttle)

                # Need to write the page
                soup = BeautifulSoup(r.content, features="html.parser")

                page_title = soup.title
                if page_title:
                    page_title = unquote(page_title.text.strip())  # type: ignore

                content_divs = soup.findAll("div", {"class": "grid-content"})
                content_divs.extend(soup.findAll("div", {"class": "link-box"}))
                for content_div in content_divs:
                    links = content_div.findAll("a")
                    for link in links:
                        original_href = link["href"]
                        href = self.clean_url(page_url, original_href)
                        if urlparse(href).netloc.endswith(".nextrequest.com"):
                            if original_href in self.broken_urls:
                                logger.debug(f"Not scraping broken URL {original_href}")
                            else:
                                if href not in detail_urls:
                                    detail_urls[href] = []
                                detail_urls[href].append(
                                    {"page_title": page_title, "page_url": page_url}
                                )
                                indexes_scraped[page_url]["details"] += 1
                        else:
                            if original_href not in indexes_scraped:
                                indexes_todo.add(original_href)
                                indexes_scraped[page_url]["subindexes"].append(
                                    original_href
                                )

            for url in indexes_scraped:
                if url in indexes_todo:
                    indexes_todo.remove(url)
            if len(indexes_todo) == 0:
                logger.debug(
                    f"Index scraping complete, after {len(indexes_scraped):,} indexes reviewed."
                )
                logger.debug(f"{len(detail_urls):,} case URLs found.")
                scraping_complete = True
            else:
                logger.debug(
                    f"Index scraping pass {index_passes:,}: {len(indexes_scraped):,} indexes scraped, {len(detail_urls):,} case URLs found"
                )

        self.cache.write_json(self.detail_urls, detail_urls)

        self.cache.write_json(self.indexes_scraped, indexes_scraped)

        lookup: Dict = {}
        for entry in detail_urls:
            lookup[entry.split("=")[-1]] = detail_urls[entry]

        return lookup

    def fetch_subpages(self, throttle):
        """Download all subpage URLs as needed; parse all pages.

        Args:
            throttle: Time to wait between requests
        Notes:
            self.rescrape_all_case_files decides whether already cached files should be re-downloaded
        Returns:
            Filename of JSON metadata
            Metadata
        """
        # Determine whether everything needs to be rescraped
        force = self.rescrape_all_case_files

        detail_urls = self.cache.read_json(self.detail_urls)

        # Let's not do anything but reads to detail_urls
        to_be_scraped: Dict = {}
        for detail_url in detail_urls.keys():
            to_be_scraped[detail_url] = force

        metadata: List = []

        subpages_dir = self.subpages_dir

        for start_url in to_be_scraped:
            force = to_be_scraped[start_url]
            local_metadata = process_nextrequest(
                subpages_dir, start_url, force, throttle
            )
            metadata.extend(local_metadata)

        json_filename = self.data_dir / (self.site_slug + ".json")
        self.cache.write_json(json_filename, metadata)
        return json_filename, metadata
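As an illustration of the clean_url() repair rules above, a hypothetical sketch (not part of the PR). The first input comes from the url_fixes table defined in __init__; the second, "/contact-us/", is an invented relative link used only to show the domain/scheme fill-in:

# Hypothetical example -- not in the PR.
from clean.ca.los_angeles_pd import Site

site = Site()
page = "https://www.lapdonline.org/senate-bill-1421-senate-bill-16-sb-16/"

site.clean_url(page, "F118-04 November 22, 2004")
# -> "https://lacity.nextrequest.com/documents?folder_filter=F118-04" (swapped via url_fixes)

site.clean_url(page, "/contact-us/")
# -> "https://www.lapdonline.org/contact-us/" (missing domain and scheme filled in)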
7 changes: 3 additions & 4 deletions clean/platforms/nextrequest.py
@@ -277,19 +277,18 @@ def fingerprint_nextrequest(start_url: str):
"""
line = None
parsed_url = urlparse(start_url)
folder_id = parse_qs(parsed_url.query)["folder_filter"][0]
if parsed_url.path == "/documents": # LAPDish type
        base_url = f"{parsed_url.scheme}://{parsed_url.netloc}"
        line = {
            "site_type": "lapdish",  # LAPDish type
            "base_url": base_url,
-            "folder_id": folder_id,
+            "folder_id": parse_qs(parsed_url.query)["folder_filter"][0],
            "page_size": 50,
            "doc_limit": 9950,  # Max number of accessible docs in a folder
            "tally_field": "total_count",
            "bln_page_url": "bln_page_url",
            "bln_total_documents": "bln_total_documents",
-            "json_url": f"{base_url}/client/documents?sort_field=count&sort_order=desc&page_size=50&folder_filter={folder_id}&page_number=",  # type: ignore
+            "json_url": f"{base_url}/client/documents?sort_field=count&sort_order=desc&page_size=50&folder_filter={line['folder_id']}&page_number=",  # type: ignore
            "details": {
                "document_path": "document_path",
                "description": "description",
@@ -354,4 +353,4 @@ def fingerprint_nextrequest(start_url: str):


def find_max_pages(item_count: int, page_size: int):
-    return ceil(item_count / page_size)  # type: ignore
+    return ceil(item_count, page_size)  # type: ignore
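For reference, a small worked sketch (not part of the PR) of the pagination math behind the lapdish fingerprint: math.ceil takes a single value, so the item count is divided by the page size before rounding up, and each resulting page number is appended to the json_url prefix. The helper name, folder filter, and starting page number below are illustrative assumptions, not taken from the module:

# Hypothetical example -- not in the PR.
from math import ceil

def max_pages(item_count: int, page_size: int) -> int:
    # e.g. 9,951 documents at 50 per page -> ceil(199.02) -> 200 pages
    return ceil(item_count / page_size)

json_url = "https://lacity.nextrequest.com/client/documents?sort_field=count&sort_order=desc&page_size=50&folder_filter=F118-04&page_number="
page_urls = [f"{json_url}{page}" for page in range(1, max_pages(9951, 50) + 1)]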