From b139e7c18e28a3fc46524389558698e740c60917 Mon Sep 17 00:00:00 2001
From: naumansharifwork <157045300+naumansharifwork@users.noreply.github.com>
Date: Tue, 3 Sep 2024 18:19:04 +0500
Subject: [PATCH 1/3] added scrape meta for chula_vista_pd #94 (#95)

* added scrape meta for chula_vista_pd #94

* removed user-agent

* changes done

* Rework URL handling; clean up a little more text

* Linted. Oops.

---------

Co-authored-by: Mike Stucka <stucka@whitedoggies.com>
---
 clean/ca/chula_vista_pd.py        | 113 ++++++++++++++++++++++++++++++
 clean/ca/config/chula_vista_pd.py |  14 ++++
 2 files changed, 127 insertions(+)
 create mode 100644 clean/ca/chula_vista_pd.py
 create mode 100644 clean/ca/config/chula_vista_pd.py

diff --git a/clean/ca/chula_vista_pd.py b/clean/ca/chula_vista_pd.py
new file mode 100644
index 0000000..e987d29
--- /dev/null
+++ b/clean/ca/chula_vista_pd.py
@@ -0,0 +1,113 @@
+import logging
+import time
+import urllib.parse
+from pathlib import Path
+
+from bs4 import BeautifulSoup
+
+from .. import utils
+from ..cache import Cache
+from .config.chula_vista_pd import index_request_headers
+
+logger = logging.getLogger(__name__)
+
+
+class Site:
+    """Scrape file metadata and download files for the City of Chula Vista Police Department.
+
+    Attributes:
+        name (str): The official name of the agency
+    """
+
+    name = "Chula Vista Police Department"
+
+    def __init__(
+        self,
+        data_dir: Path = utils.CLEAN_DATA_DIR,
+        cache_dir: Path = utils.CLEAN_CACHE_DIR,
+    ):
+        """Initialize a new instance.
+
+        Args:
+            data_dir (Path): The directory where downstream processed files/data will be saved
+            cache_dir (Path): The directory where files will be cached
+        """
+        self.base_url = "https://www.chulavistaca.gov/departments/police-department/senate-bill-1421"
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.cache = Cache(cache_dir)
+
+    @property
+    def agency_slug(self) -> str:
+        """Construct the agency slug."""
+        # Use module path to construct agency slug, which we'll use downstream
+        mod = Path(__file__)
+        state_postal = mod.parent.stem
+        return f"{state_postal}_{mod.stem}"  # ca_chula_vista_pd
+
+    def scrape_meta(self, throttle=0):
+        """Gather metadata on downloadable files from the agency's index page.
+
+        Args:
+            throttle (int): seconds to sleep after handling each link (default: 0)
+
+        Returns:
+            Path: the JSON metadata file written inside data_dir
+        """
+        filename = f"{self.agency_slug}/{self.base_url.split('/')[-1]}.html"
+        self.cache.download(
+            filename, self.base_url, force=True, headers=index_request_headers
+        )
+        metadata = []
+        cleanup = str.maketrans({"\u00a0": " ", "\u2014": "--"})
+        redirect_start = "/?splash="
+        redirect_end = "&____isexternal=true"
+        html = self.cache.read(filename)
+        soup = BeautifulSoup(html, "html.parser")
+        desired_element = None
+        for content_area in soup.find_all("div", class_="content_area clearfix"):
+            previous_h2 = content_area.find_previous("h2")
+            if previous_h2 and previous_h2.text == "Documents":
+                desired_element = content_area
+                break
+
+        if desired_element:
+            for section in desired_element.find_all("div", class_="accordion-item"):
+                case_type = section.find("div", class_="title").get_text(strip=True)
+                for link in section.find_all("a"):
+                    link_href = link.get("href", None)
+                    case_id = link.find_previous("p").text.translate(cleanup)
+                    if link_href:
+                        # .string is None when the anchor has several child nodes
+                        title = link.get_text().translate(cleanup)
+
+                        # Clean up links. Check to see if it's a redirect:
+                        if redirect_start in link_href:
+                            link_href = link_href.replace(redirect_start, "")
+                            link_href = link_href.replace(redirect_end, "")
+                            link_href = urllib.parse.unquote(link_href)
+                            name = title
+                        else:
+                            name = link_href.split("/")[-1]
+
+                        # See if it's a relative link
+                        if urllib.parse.urlparse(link_href).netloc == "":
+                            link_href = f"https://www.chulavistaca.gov{link_href}"
+
+                        payload = {
+                            "asset_url": link_href,
+                            "case_id": case_id,
+                            "name": name,
+                            "title": title,
+                            "parent_page": str(filename),
+                            "details": {"case_type": case_type},
+                        }
+                        metadata.append(payload)
+
+                    time.sleep(throttle)
+        else:
+            logger.error("Could not find the Documents section in %s", filename)
+
+        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
+        self.cache.write_json(outfile, metadata)
+        return outfile
diff --git a/clean/ca/config/chula_vista_pd.py b/clean/ca/config/chula_vista_pd.py
new file mode 100644
index 0000000..29ba6ce
--- /dev/null
+++ b/clean/ca/config/chula_vista_pd.py
@@ -0,0 +1,14 @@
+index_request_headers = {  # browser-like headers sent when fetching the index page
+    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
+    "accept-language": "en-US,en;q=0.9,nl;q=0.8,ur;q=0.7,ru;q=0.6",
+    "cache-control": "max-age=0",
+    "priority": "u=0, i",
+    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": '"Windows"',
+    "sec-fetch-dest": "document",
+    "sec-fetch-mode": "navigate",
+    "sec-fetch-site": "none",
+    "sec-fetch-user": "?1",
+    "upgrade-insecure-requests": "1",
+}  # NOTE(review): no User-Agent key; presumably added by the request helper -- confirm

From b6ae8910ef99a044844161aaf945462de9bf56d1 Mon Sep 17 00:00:00 2001
From: Gerald Rich <1578563+newsroomdev@users.noreply.github.com>
Date: Wed, 11 Sep 2024 10:25:48 -0700
Subject: [PATCH 2/3] Add issue templates

---
 .github/ISSUE_TEMPLATE/bug_report.md      | 32 +++++++++++++++++++++++
 .github/ISSUE_TEMPLATE/feature_request.md | 14 ++++++++++
 2 files changed, 46 insertions(+)
 create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md
 create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md

diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md
new file mode 100644
index 0000000..8c232b4
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/bug_report.md
@@ -0,0 +1,32 @@
+---
+name: Bug report
+about: Report an issue
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+**Environment:**
+ - OS: [e.g. macOS, Windows]
+ - Shell: [e.g. bash, zsh]
+ - Python version: [e.g. 3.12]
+
+**Expected behavior**
+A clear and concise description of what you expected to happen.
+
+**Describe the bug**
+A clear and concise description of what the bug is.
+
+**To Reproduce**
+Steps to reproduce the behavior:
+1. Go to '...'
+2. Run command '....'
+3. Check logs '....'
+4. See error
+
+**Screenshots**
+If applicable, add screenshots to help explain your problem.
+
+**Additional context**
+Add any other context or links about the problem here.
diff --git a/.github/ISSUE_TEMPLATE/feature_request.md b/.github/ISSUE_TEMPLATE/feature_request.md
new file mode 100644
index 0000000..c9905fb
--- /dev/null
+++ b/.github/ISSUE_TEMPLATE/feature_request.md
@@ -0,0 +1,14 @@
+---
+name: Feature request
+about: Suggest an addition
+title: ''
+labels: ''
+assignees: ''
+
+---
+
+## Additional details
+
+## Related pull request(s)
+
+- #00

From 42a0d85e7f46c8ae39480ce5b1a045ba2b7395a5 Mon Sep 17 00:00:00 2001
From: naumansharifwork <157045300+naumansharifwork@users.noreply.github.com>
Date: Mon, 16 Sep 2024 00:03:09 +0500
Subject: [PATCH 3/3] updated utils.py to include post url and get_cookies
 functions (#113)

---
 clean/utils.py | 58 ++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 58 insertions(+)

diff --git a/clean/utils.py b/clean/utils.py
index dda7195..b1b8689 100644
--- a/clean/utils.py
+++ b/clean/utils.py
@@ -287,3 +287,61 @@ def get_repeated_asset_url(self, objects: List[MetadataDict]):
         else:
             seen_urls.add(asset_url)
     return repeated_urls
+
+
+@retry(tries=3, delay=15, backoff=2)
+def post_url(
+    url, user_agent="Big Local News (biglocalnews.org)", session=None, **kwargs
+):
+    """POST to the provided URL and return the response object.
+
+    Args:
+        url (str): the url to be requested
+        user_agent (str): the user-agent header passed with the request (default: biglocalnews.org)
+        session: a session object to use when making the request. optional
+        **kwargs: passed through to the underlying ``post`` call (e.g. data, headers)
+
+    Raises:
+        AssertionError: if the response status is not OK
+    """
+    logger.debug(f"Requesting {url}")
+
+    # Build the headers on a copy so a dict passed in by the caller (e.g. a
+    # shared module-level config) is never mutated; headers=None is tolerated
+    kwargs["headers"] = {**(kwargs.get("headers") or {}), "User-Agent": user_agent}
+
+    if session is not None:
+        logger.debug(f"Requesting with session {session}")
+        response = session.post(url, **kwargs)
+    else:
+        response = requests.post(url, **kwargs)
+    logger.debug(f"Response code: {response.status_code}")
+
+    if not response.ok:  # explicit check: a bare assert is stripped by python -O
+        raise AssertionError(f"POST {url} returned status {response.status_code}")
+    return response
+
+
+@retry(tries=3, delay=15, backoff=2)
+def get_cookies(url, user_agent="Big Local News (biglocalnews.org)", **kwargs):
+    """Request the provided URL and return its cookies as a dict.
+
+    Args:
+        url (str): the url to be requested
+        user_agent (str): the user-agent header passed with the request (default: biglocalnews.org)
+        **kwargs: passed through to ``requests.get`` (e.g. headers, params)
+
+    Raises:
+        AssertionError: if the response status is not OK
+    """
+    logger.debug(f"Requesting {url}")
+
+    # Build the headers on a copy so a dict passed in by the caller (e.g. a
+    # shared module-level config) is never mutated; headers=None is tolerated
+    kwargs["headers"] = {**(kwargs.get("headers") or {}), "User-Agent": user_agent}
+    response = requests.get(url, **kwargs)
+
+    if not response.ok:  # explicit check: a bare assert is stripped by python -O
+        raise AssertionError(f"GET {url} returned status {response.status_code}")
+
+    return response.cookies.get_dict()