
Commit

Merge branch 'dev' into ca_ventura_sheriff
stucka authored Aug 19, 2024
2 parents 9499a09 + 9de55ae commit bd704ef
Showing 17 changed files with 326 additions and 228 deletions.
94 changes: 94 additions & 0 deletions .github/workflows/codeql.yml
@@ -0,0 +1,94 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"

on:
  push:
    branches: [ "dev" ]
  pull_request:
    branches: [ "dev" ]
  schedule:
    - cron: '31 14 * * 5'
  merge_group:

jobs:
  analyze:
    name: Analyze (${{ matrix.language }})
    # Runner size impacts CodeQL analysis time. To learn more, please see:
    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
    #   - https://gh.io/supported-runners-and-hardware-resources
    #   - https://gh.io/using-larger-runners (GitHub.com only)
    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
    timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
    permissions:
      # required for all workflows
      security-events: write

      # required to fetch internal or private CodeQL packs
      packages: read

      # only required for workflows in private repositories
      actions: read
      contents: read

    strategy:
      fail-fast: false
      matrix:
        include:
        - language: python
          build-mode: none
        # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
        # Use `c-cpp` to analyze code written in C, C++ or both
        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
      uses: github/codeql-action/init@v3
      with:
        languages: ${{ matrix.language }}
        build-mode: ${{ matrix.build-mode }}
        # If you wish to specify custom queries, you can do so here or in a config file.
        # By default, queries listed here will override any specified in a config file.
        # Prefix the list here with "+" to use these queries and those in the config file.

        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
        # queries: security-extended,security-and-quality

    # If the analyze step fails for one of the languages you are analyzing with
    # "We were unable to automatically build your code", modify the matrix above
    # to set the build mode to "manual" for that language. Then modify this step
    # to build your code.
    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
    - if: matrix.build-mode == 'manual'
      shell: bash
      run: |
        echo 'If you are using a "manual" build mode for one or more of the' \
          'languages you are analyzing, replace this with the commands to build' \
          'your code, for example:'
        echo '  make bootstrap'
        echo '  make release'
        exit 1

    - name: Perform CodeQL Analysis
      uses: github/codeql-action/analyze@v3
      with:
        category: "/language:${{matrix.language}}"
1 change: 1 addition & 0 deletions .github/workflows/continuous-deployment.yml
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@ on:
      - dev
      - main
  pull_request:
  merge_group:
  workflow_dispatch:

jobs:
21 changes: 4 additions & 17 deletions clean/ca/humboldt_pd.py
@@ -1,6 +1,5 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup, Tag

@@ -93,7 +92,7 @@ def _get_asset_links(self, pages, parent_page) -> list:
                name = link.string
                payload = {
                    "title": title,
                    "case_number": name,
                    "case_id": name,
                    "parent_page": str(parent_page),
                    "asset_url": f"{'https://humboldtgov.org'}{href}",
                    "name": name,
@@ -108,12 +107,12 @@ def _get_asset_links(self, pages, parent_page) -> list:
                if soup.title and isinstance(soup.title.string, str)
                else None
            )
            case_number = page["page_name"].split("/")[-1].split("_")[0]
            case_id = page["page_name"].split("/")[-1].split("_")[0]
            header = soup.find("h1")
            name = header.get_text(strip=True) if header else None
            payload = {
                "title": title,
                "case_number": case_number,
                "case_id": case_id,
                "parent_page": str(parent_page),
                "download_page": str(page["page_name"]),
                "asset_url": f"https://humboldtgov.nextrequest.com{link['href']}",
@@ -122,20 +121,8 @@ def _get_asset_links(self, pages, parent_page) -> list:
            metadata.append(payload)
        return metadata

    def scrape(self, throttle: int = 4, filter: str = "") -> List[Path]:
        metadata = self.cache.read_json(
            self.data_dir.joinpath(f"{self.agency_slug}.json")
        )
        dl_assets = []
        for asset in metadata:
            url = asset["asset_url"]
            dl_path = self._make_download_path(asset)
            time.sleep(throttle)
            dl_assets.append(self.cache.download(str(dl_path), url))
        return dl_assets

    def _make_download_path(self, asset):
        folder_name = asset["case_number"]
        folder_name = asset["case_id"]
        name = asset["name"]
        # If the name has no extension, mark it as a pdf since the metadata identifies it as a document format
        if len(name.split(".")) == 1:
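The per-site scrape() method deleted above read the cached metadata JSON and downloaded each asset; presumably a shared implementation now handles that step. A minimal sketch of the equivalent loop, reusing the Cache.read_json and Cache.download calls visible in this diff (the download_assets helper name and its arguments are illustrative, not part of this commit):

import time
from pathlib import Path

def download_assets(cache, data_dir: Path, agency_slug: str, throttle: int = 4) -> list:
    # Read the metadata JSON produced by scrape_meta()
    metadata = cache.read_json(data_dir.joinpath(f"{agency_slug}.json"))
    downloaded = []
    for asset in metadata:
        # Mirror the removed per-site logic: one folder per case_id, file named after the asset
        download_path = Path(agency_slug, "assets", asset["case_id"], asset["name"])
        time.sleep(throttle)  # throttle requests to the remote server
        downloaded.append(cache.download(str(download_path), asset["asset_url"]))
    return downloaded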
93 changes: 93 additions & 0 deletions clean/ca/monterey_county_district_attorney.py
@@ -0,0 +1,93 @@
import re
import time
from pathlib import Path

from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
from ..config.monterey_county_district_attorney import index_request_headers


class Site:
    """Scrape file metadata and download files for the Monterey County District Attorney.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Monterey County District Attorney"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.base_url = "https://www.countyofmonterey.gov/government/departments-a-h/district-attorney/press-releases/officer-involved-shootings"
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_monterey_county_district_attorney
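For reference, a quick sketch of what that slug construction evaluates to; the literal path below stands in for __file__:

from pathlib import Path

mod = Path("clean/ca/monterey_county_district_attorney.py")  # stand-in for __file__
state_postal = mod.parent.stem
print(f"{state_postal}_{mod.stem}")  # ca_monterey_county_district_attorney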

    def scrape_meta(self, throttle=0):
        # construct a local filename relative to the cache directory - agency slug + page url (ca_monterey_county_district_attorney/officer-involved-shootings.html)
        # download the page (if not already cached)
        # save the index page url to cache (sensible name)

        date_pattern = re.compile(r"(\w+\s\d{1,2},\s?\d{4})")
        name_pattern = re.compile(r"\(([^)]+)\)")
        case_pattern = re.compile(r"Case:\s*(\w+)")
        year_pattern = re.compile(r"\d{4}")
        base_name = f"{self.base_url.split('/')[-1]}.html"
        filename = f"{self.agency_slug}/{base_name}"

        self.cache.download(filename, self.base_url, headers=index_request_headers)
        metadata = []
        html = self.cache.read(filename)
        soup = BeautifulSoup(html, "html.parser")
        body = soup.find("table", id="oisTable")
        links = body.find_all("a")
        for link in links:
            td_tag = link.find_parent("td")
            title = td_tag.get_text(strip=True)
            td_text = td_tag.get_text(separator=" ").strip()
            # Extract date
            date_match = date_pattern.search(td_text)
            date = date_match.group(1) if date_match else None
            # Extract year from date
            if date:
                year_from_date = year_pattern.search(date).group()
            else:
                year_from_date = None
            # Extract name
            name_match = name_pattern.search(td_text)
            name = name_match.group(1) if name_match else None
            # Extract case number
            case_match = case_pattern.search(td_text)
            case_number = case_match.group(1) if case_match else title
            payload = {
                "asset_url": link["href"],
                "case_id": case_number,
                "name": name,
                "title": title,
                "parent_page": str(filename),
                "details": {"date": date, "year": year_from_date},
            }
            metadata.append(payload)
            time.sleep(throttle)
        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile
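The regular expressions above split each table cell into its date, subject name, and case number. A small, self-contained illustration with a made-up cell string of the shape the patterns expect ("Month D, YYYY (Name) Case: NUMBER"):

import re

date_pattern = re.compile(r"(\w+\s\d{1,2},\s?\d{4})")
name_pattern = re.compile(r"\(([^)]+)\)")
case_pattern = re.compile(r"Case:\s*(\w+)")

td_text = "January 5, 2021 (John Doe) Case: MA123456"  # hypothetical cell text
print(date_pattern.search(td_text).group(1))  # January 5, 2021
print(name_pattern.search(td_text).group(1))  # John Doe
print(case_pattern.search(td_text).group(1))  # MA123456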
20 changes: 0 additions & 20 deletions clean/ca/orange_county_sheriff.py
@@ -1,6 +1,4 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup

@@ -32,24 +30,6 @@ def scrape_meta(self, throttle: int = 0) -> Path:
        downloadable_files = self._create_json()
        return downloadable_files

    def scrape(self, throttle: int = 0, filter: str = "") -> List[Path]:
        metadata = self.cache.read_json(
            self.data_dir.joinpath(f"{self.agency_slug}.json")
        )
        downloaded_assets = []
        for asset in metadata:
            url = asset["asset_url"]
            if filter and filter not in url:
                continue
            index_dir = (
                asset["parent_page"].split(f"{self.agency_slug}/")[-1].rstrip(".html")
            )
            asset_name = asset["name"].replace(" ", "_")
            download_path = Path(self.agency_slug, "assets", index_dir, asset_name)
            time.sleep(throttle)
            downloaded_assets.append(self.cache.download(str(download_path), url))
        return downloaded_assets

    def _create_json(self) -> Path:
        metadata = []
        file_stem = self.disclosure_url.split("/")[-1]
89 changes: 89 additions & 0 deletions clean/ca/riverside_pd.py
@@ -0,0 +1,89 @@
import time
import urllib.parse
from pathlib import Path

from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache


class Site:
    """Scrape file metadata and download files for the City of Riverside Police Department.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Riverside Police Department"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.sec_website_url = "https://riversideca.gov"
        self.un_sec_website_url = "http://riversideca.gov"
        self.base_url = "https://www.riversideca.gov/cityclerk/boards-commissions/community-police-review-commission/officer-involved-deaths-oid/officer-involved"
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_riverside_pd

    def scrape_meta(self, throttle=0):
        # construct a local filename relative to the cache directory - agency slug + page url (ca_riverside_pd/officer-involved-deaths-oid.html)
        # download the page (if not already cached)
        # save the index page url to cache (sensible name)
        base_name = f"{self.base_url.split('/')[-2]}.html"
        filename = f"{self.agency_slug}/{base_name}"
        self.cache.download(filename, self.base_url)
        metadata = []
        html = self.cache.read(filename)
        soup = BeautifulSoup(html, "html.parser")
        body = soup.find("section", class_="col-sm-9")
        sections = body.select("div#accordion>div.panel.panel-default")
        for section in sections:
            section_text = section.select_one("h4.panel-title>a")
            title = section_text.find("strong").get_text(strip=True)
            date = section_text.find("span", class_="pull-right").get_text(strip=True)
            case_id = section_text.get_text(strip=True)
            case_id = case_id.replace(title, "").replace(date, "").strip()
            links = section.find_all("a")
            for link in links:
                link_href = link.get("href", None)
                if link_href:
                    if "#" not in link_href:
                        link_href = link_href.rstrip('"')
                        if (
                            self.sec_website_url not in link_href
                            and self.un_sec_website_url not in link_href
                        ):
                            link_href = f"{self.sec_website_url}{link_href}"
                        name = link_href.split("/")[-1]
                        name = urllib.parse.unquote(name)
                        payload = {
                            "asset_url": link_href,
                            "case_id": case_id,
                            "name": name,
                            "title": title,
                            "parent_page": str(filename),
                            "details": {"date": date},
                        }
                        metadata.append(payload)
            time.sleep(throttle)
        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile
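The link handling above strips stray quotes, prefixes relative hrefs with the site root, and URL-decodes the file name. Roughly, for a single href (the example value is invented, and only the https check is shown; the scraper also tests the http:// variant):

import urllib.parse

sec_website_url = "https://riversideca.gov"
link_href = '/sites/default/files/CPRC/Report%20of%20Findings.pdf"'  # hypothetical href with a trailing quote
link_href = link_href.rstrip('"')  # drop stray trailing quotes
if sec_website_url not in link_href:
    link_href = f"{sec_website_url}{link_href}"  # make relative links absolute
name = urllib.parse.unquote(link_href.split("/")[-1])  # human-readable file name
print(link_href)  # https://riversideca.gov/sites/default/files/CPRC/Report%20of%20Findings.pdf
print(name)  # Report of Findings.pdf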
2 changes: 1 addition & 1 deletion clean/ca/sacramento_pd.py
@@ -19,7 +19,7 @@

class Site:
    """
    Scrape file metadata and download files for the Sacramento Police Department for SB16/SB1421/AB748 data.
    Scrape file metadata and asset_urls for the Sacramento Police Department for SB16/SB1421/AB748 data.

    Attributes:
        name (str): The official name of the agency

