
Commit

Merge branch 'dev' into ca_ventura_sheriff
stucka authored Aug 19, 2024
2 parents 9499a09 + 9de55ae commit bd704ef
Showing 17 changed files with 326 additions and 228 deletions.
94 changes: 94 additions & 0 deletions .github/workflows/codeql.yml
@@ -0,0 +1,94 @@
# For most projects, this workflow file will not need changing; you simply need
# to commit it to your repository.
#
# You may wish to alter this file to override the set of languages analyzed,
# or to provide custom queries or build logic.
#
# ******** NOTE ********
# We have attempted to detect the languages in your repository. Please check
# the `language` matrix defined below to confirm you have the correct set of
# supported CodeQL languages.
#
name: "CodeQL"

on:
  push:
    branches: [ "dev" ]
  pull_request:
    branches: [ "dev" ]
  schedule:
    - cron: '31 14 * * 5'
  merge_group:

jobs:
  analyze:
    name: Analyze (${{ matrix.language }})
    # Runner size impacts CodeQL analysis time. To learn more, please see:
    #   - https://gh.io/recommended-hardware-resources-for-running-codeql
    #   - https://gh.io/supported-runners-and-hardware-resources
    #   - https://gh.io/using-larger-runners (GitHub.com only)
    # Consider using larger runners or machines with greater resources for possible analysis time improvements.
    runs-on: ${{ (matrix.language == 'swift' && 'macos-latest') || 'ubuntu-latest' }}
    timeout-minutes: ${{ (matrix.language == 'swift' && 120) || 360 }}
    permissions:
      # required for all workflows
      security-events: write

      # required to fetch internal or private CodeQL packs
      packages: read

      # only required for workflows in private repositories
      actions: read
      contents: read

    strategy:
      fail-fast: false
      matrix:
        include:
        - language: python
          build-mode: none
        # CodeQL supports the following values keywords for 'language': 'c-cpp', 'csharp', 'go', 'java-kotlin', 'javascript-typescript', 'python', 'ruby', 'swift'
        # Use `c-cpp` to analyze code written in C, C++ or both
        # Use 'java-kotlin' to analyze code written in Java, Kotlin or both
        # Use 'javascript-typescript' to analyze code written in JavaScript, TypeScript or both
        # To learn more about changing the languages that are analyzed or customizing the build mode for your analysis,
        # see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/customizing-your-advanced-setup-for-code-scanning.
        # If you are analyzing a compiled language, you can modify the 'build-mode' for that language to customize how
        # your codebase is analyzed, see https://docs.github.com/en/code-security/code-scanning/creating-an-advanced-setup-for-code-scanning/codeql-code-scanning-for-compiled-languages
    steps:
    - name: Checkout repository
      uses: actions/checkout@v4

    # Initializes the CodeQL tools for scanning.
    - name: Initialize CodeQL
      uses: github/codeql-action/init@v3
      with:
        languages: ${{ matrix.language }}
        build-mode: ${{ matrix.build-mode }}
        # If you wish to specify custom queries, you can do so here or in a config file.
        # By default, queries listed here will override any specified in a config file.
        # Prefix the list here with "+" to use these queries and those in the config file.

        # For more details on CodeQL's query packs, refer to: https://docs.github.com/en/code-security/code-scanning/automatically-scanning-your-code-for-vulnerabilities-and-errors/configuring-code-scanning#using-queries-in-ql-packs
        # queries: security-extended,security-and-quality

    # If the analyze step fails for one of the languages you are analyzing with
    # "We were unable to automatically build your code", modify the matrix above
    # to set the build mode to "manual" for that language. Then modify this step
    # to build your code.
    # ℹ️ Command-line programs to run using the OS shell.
    # 📚 See https://docs.github.com/en/actions/using-workflows/workflow-syntax-for-github-actions#jobsjob_idstepsrun
    - if: matrix.build-mode == 'manual'
      shell: bash
      run: |
        echo 'If you are using a "manual" build mode for one or more of the' \
          'languages you are analyzing, replace this with the commands to build' \
          'your code, for example:'
        echo '  make bootstrap'
        echo '  make release'
        exit 1

    - name: Perform CodeQL Analysis
      uses: github/codeql-action/analyze@v3
      with:
        category: "/language:${{matrix.language}}"
1 change: 1 addition & 0 deletions .github/workflows/continuous-deployment.yml
Original file line number Diff line number Diff line change
@@ -5,6 +5,7 @@ on:
      - dev
      - main
  pull_request:
  merge_group:
  workflow_dispatch:

jobs:
21 changes: 4 additions & 17 deletions clean/ca/humboldt_pd.py
@@ -1,6 +1,5 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup, Tag

@@ -93,7 +92,7 @@ def _get_asset_links(self, pages, parent_page) -> list:
                name = link.string
                payload = {
                    "title": title,
                    "case_number": name,
                    "case_id": name,
                    "parent_page": str(parent_page),
                    "asset_url": f"{'https://humboldtgov.org'}{href}",
                    "name": name,
@@ -108,12 +107,12 @@ def _get_asset_links(self, pages, parent_page) -> list:
                if soup.title and isinstance(soup.title.string, str)
                else None
            )
            case_number = page["page_name"].split("/")[-1].split("_")[0]
            case_id = page["page_name"].split("/")[-1].split("_")[0]
            header = soup.find("h1")
            name = header.get_text(strip=True) if header else None
            payload = {
                "title": title,
                "case_number": case_number,
                "case_id": case_id,
                "parent_page": str(parent_page),
                "download_page": str(page["page_name"]),
                "asset_url": f"https://humboldtgov.nextrequest.com{link['href']}",
@@ -122,20 +121,8 @@ def _get_asset_links(self, pages, parent_page) -> list:
            metadata.append(payload)
        return metadata

    def scrape(self, throttle: int = 4, filter: str = "") -> List[Path]:
        metadata = self.cache.read_json(
            self.data_dir.joinpath(f"{self.agency_slug}.json")
        )
        dl_assets = []
        for asset in metadata:
            url = asset["asset_url"]
            dl_path = self._make_download_path(asset)
            time.sleep(throttle)
            dl_assets.append(self.cache.download(str(dl_path), url))
        return dl_assets

    def _make_download_path(self, asset):
        folder_name = asset["case_number"]
        folder_name = asset["case_id"]
        name = asset["name"]
        # If the name has no extension, mark it as a pdf since the metadata identifies it as a document format
        if len(name.split(".")) == 1:
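The per-site scrape() method deleted above read the cached metadata JSON and downloaded each asset; presumably a shared implementation now handles that step. A minimal sketch of the equivalent loop, reusing the Cache.read_json and Cache.download calls visible in this diff (the download_assets helper name and its arguments are illustrative, not part of this commit):

import time
from pathlib import Path

def download_assets(cache, data_dir: Path, agency_slug: str, throttle: int = 4) -> list:
    # Read the metadata JSON produced by scrape_meta()
    metadata = cache.read_json(data_dir.joinpath(f"{agency_slug}.json"))
    downloaded = []
    for asset in metadata:
        # Mirror the removed per-site logic: one folder per case_id, file named after the asset
        download_path = Path(agency_slug, "assets", asset["case_id"], asset["name"])
        time.sleep(throttle)  # throttle requests to the remote server
        downloaded.append(cache.download(str(download_path), asset["asset_url"]))
    return downloaded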
93 changes: 93 additions & 0 deletions clean/ca/monterey_county_district_attorney.py
@@ -0,0 +1,93 @@
import re
import time
from pathlib import Path

from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
from ..config.monterey_county_district_attorney import index_request_headers


class Site:
    """Scrape file metadata and download files for the Monterey County District Attorney.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Monterey County District Attorney"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.base_url = "https://www.countyofmonterey.gov/government/departments-a-h/district-attorney/press-releases/officer-involved-shootings"
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_monterey_county_district_attorney
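For reference, a quick sketch of what that slug construction evaluates to; the literal path below stands in for __file__:

from pathlib import Path

mod = Path("clean/ca/monterey_county_district_attorney.py")  # stand-in for __file__
state_postal = mod.parent.stem
print(f"{state_postal}_{mod.stem}")  # ca_monterey_county_district_attorney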

    def scrape_meta(self, throttle=0):
        # construct a local filename relative to the cache directory - agency slug + page url (ca_monterey_county_district_attorney/officer-involved-shootings.html)
        # download the page (if not already cached)
        # save the index page url to cache (sensible name)

        date_pattern = re.compile(r"(\w+\s\d{1,2},\s?\d{4})")
        name_pattern = re.compile(r"\(([^)]+)\)")
        case_pattern = re.compile(r"Case:\s*(\w+)")
        year_pattern = re.compile(r"\d{4}")
        base_name = f"{self.base_url.split('/')[-1]}.html"
        filename = f"{self.agency_slug}/{base_name}"

        self.cache.download(filename, self.base_url, headers=index_request_headers)
        metadata = []
        html = self.cache.read(filename)
        soup = BeautifulSoup(html, "html.parser")
        body = soup.find("table", id="oisTable")
        links = body.find_all("a")
        for link in links:
            td_tag = link.find_parent("td")
            title = td_tag.get_text(strip=True)
            td_text = td_tag.get_text(separator=" ").strip()
            # Extract date
            date_match = date_pattern.search(td_text)
            date = date_match.group(1) if date_match else None
            # Extract year from date
            if date:
                year_from_date = year_pattern.search(date).group()
            else:
                year_from_date = None
            # Extract name
            name_match = name_pattern.search(td_text)
            name = name_match.group(1) if name_match else None
            # Extract case number
            case_match = case_pattern.search(td_text)
            case_number = case_match.group(1) if case_match else title
            payload = {
                "asset_url": link["href"],
                "case_id": case_number,
                "name": name,
                "title": title,
                "parent_page": str(filename),
                "details": {"date": date, "year": year_from_date},
            }
            metadata.append(payload)
            time.sleep(throttle)
        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile
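The regular expressions above split each table cell into its date, subject name, and case number. A small, self-contained illustration with a made-up cell string of the shape the patterns expect ("Month D, YYYY (Name) Case: NUMBER"):

import re

date_pattern = re.compile(r"(\w+\s\d{1,2},\s?\d{4})")
name_pattern = re.compile(r"\(([^)]+)\)")
case_pattern = re.compile(r"Case:\s*(\w+)")

td_text = "January 5, 2021 (John Doe) Case: MA123456"  # hypothetical cell text
print(date_pattern.search(td_text).group(1))  # January 5, 2021
print(name_pattern.search(td_text).group(1))  # John Doe
print(case_pattern.search(td_text).group(1))  # MA123456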
20 changes: 0 additions & 20 deletions clean/ca/orange_county_sheriff.py
@@ -1,6 +1,4 @@
import time
from pathlib import Path
from typing import List

from bs4 import BeautifulSoup

@@ -32,24 +30,6 @@ def scrape_meta(self, throttle: int = 0) -> Path:
        downloadable_files = self._create_json()
        return downloadable_files

    def scrape(self, throttle: int = 0, filter: str = "") -> List[Path]:
        metadata = self.cache.read_json(
            self.data_dir.joinpath(f"{self.agency_slug}.json")
        )
        downloaded_assets = []
        for asset in metadata:
            url = asset["asset_url"]
            if filter and filter not in url:
                continue
            index_dir = (
                asset["parent_page"].split(f"{self.agency_slug}/")[-1].rstrip(".html")
            )
            asset_name = asset["name"].replace(" ", "_")
            download_path = Path(self.agency_slug, "assets", index_dir, asset_name)
            time.sleep(throttle)
            downloaded_assets.append(self.cache.download(str(download_path), url))
        return downloaded_assets

    def _create_json(self) -> Path:
        metadata = []
        file_stem = self.disclosure_url.split("/")[-1]
89 changes: 89 additions & 0 deletions clean/ca/riverside_pd.py
@@ -0,0 +1,89 @@
import time
import urllib.parse
from pathlib import Path

from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache


class Site:
    """Scrape file metadata and download files for the City of Riverside Police Department.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Riverside Police Department"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.sec_website_url = "https://riversideca.gov"
        self.un_sec_website_url = "http://riversideca.gov"
        self.base_url = "https://www.riversideca.gov/cityclerk/boards-commissions/community-police-review-commission/officer-involved-deaths-oid/officer-involved"
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_riverside_pd

    def scrape_meta(self, throttle=0):
        # construct a local filename relative to the cache directory - agency slug + page url (ca_riverside_pd/officer-involved-deaths-oid.html)
        # download the page (if not already cached)
        # save the index page url to cache (sensible name)
        base_name = f"{self.base_url.split('/')[-2]}.html"
        filename = f"{self.agency_slug}/{base_name}"
        self.cache.download(filename, self.base_url)
        metadata = []
        html = self.cache.read(filename)
        soup = BeautifulSoup(html, "html.parser")
        body = soup.find("section", class_="col-sm-9")
        sections = body.select("div#accordion>div.panel.panel-default")
        for section in sections:
            section_text = section.select_one("h4.panel-title>a")
            title = section_text.find("strong").get_text(strip=True)
            date = section_text.find("span", class_="pull-right").get_text(strip=True)
            case_id = section_text.get_text(strip=True)
            case_id = case_id.replace(title, "").replace(date, "").strip()
            links = section.find_all("a")
            for link in links:
                link_href = link.get("href", None)
                if link_href:
                    if "#" not in link_href:
                        link_href = link_href.rstrip('"')
                        if (
                            self.sec_website_url not in link_href
                            and self.un_sec_website_url not in link_href
                        ):
                            link_href = f"{self.sec_website_url}{link_href}"
                        name = link_href.split("/")[-1]
                        name = urllib.parse.unquote(name)
                        payload = {
                            "asset_url": link_href,
                            "case_id": case_id,
                            "name": name,
                            "title": title,
                            "parent_page": str(filename),
                            "details": {"date": date},
                        }
                        metadata.append(payload)
            time.sleep(throttle)
        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile
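The link handling above strips stray quotes, prefixes relative hrefs with the site root, and URL-decodes the file name. Roughly, for a single href (the example value is invented, and only the https check is shown; the scraper also tests the http:// variant):

import urllib.parse

sec_website_url = "https://riversideca.gov"
link_href = '/sites/default/files/CPRC/Report%20of%20Findings.pdf"'  # hypothetical href with a trailing quote
link_href = link_href.rstrip('"')  # drop stray trailing quotes
if sec_website_url not in link_href:
    link_href = f"{sec_website_url}{link_href}"  # make relative links absolute
name = urllib.parse.unquote(link_href.split("/")[-1])  # human-readable file name
print(link_href)  # https://riversideca.gov/sites/default/files/CPRC/Report%20of%20Findings.pdf
print(name)  # Report of Findings.pdf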
2 changes: 1 addition & 1 deletion clean/ca/sacramento_pd.py
@@ -19,7 +19,7 @@

class Site:
    """
    Scrape file metadata and download files for the Sacramento Police Department for SB16/SB1421/AB748 data.
    Scrape file metadata and asset_urls for the Sacramento Police Department for SB16/SB1421/AB748 data.

    Attributes:
        name (str): The official name of the agency

