# File: clean/ca/chula_vista_pd.py
import logging
import time
import urllib.parse
from pathlib import Path

from bs4 import BeautifulSoup

from .. import utils
from ..cache import Cache
from .config.chula_vista_pd import index_request_headers

logger = logging.getLogger(__name__)


class Site:
    """Scrape file metadata and download files for the City of Chula Vista Police Department.

    Attributes:
        name (str): The official name of the agency
    """

    name = "Chula Vista Police Department"

    # Markers the site wraps around outbound links: "/?splash=<quoted url>&____isexternal=true"
    _REDIRECT_START = "/?splash="
    _REDIRECT_END = "&____isexternal=true"

    def __init__(
        self,
        data_dir: Path = utils.CLEAN_DATA_DIR,
        cache_dir: Path = utils.CLEAN_CACHE_DIR,
    ):
        """Initialize a new instance.

        Args:
            data_dir (Path): The directory where downstream processed files/data will be saved
            cache_dir (Path): The directory where files will be cached
        """
        self.base_url = "https://www.chulavistaca.gov/departments/police-department/senate-bill-1421"
        self.data_dir = data_dir
        self.cache_dir = cache_dir
        self.cache = Cache(cache_dir)

    @property
    def agency_slug(self) -> str:
        """Construct the agency slug."""
        # Use module path to construct agency slug, which we'll use downstream
        mod = Path(__file__)
        state_postal = mod.parent.stem
        return f"{state_postal}_{mod.stem}"  # ca_chula_vista_pd

    def scrape_meta(self, throttle=0):
        """Download the index page and write metadata for every linked file.

        Args:
            throttle (int): Seconds to sleep once the index page has been parsed (default: 0)

        Returns:
            Path: Location of the JSON metadata file, ``<data_dir>/<agency_slug>.json``
        """
        # Local filename relative to the cache directory: agency slug + last
        # URL segment, e.g. ca_chula_vista_pd/senate-bill-1421.html
        base_name = f"{self.base_url.split('/')[-1]}.html"
        filename = f"{self.agency_slug}/{base_name}"
        self.cache.download(
            filename, self.base_url, force=True, headers=index_request_headers
        )
        html = self.cache.read(filename)
        soup = BeautifulSoup(html, "html.parser")

        metadata = []
        documents = self._find_documents_area(soup)
        if documents is None:
            # Still write an (empty) metadata file so the return value is valid.
            logger.error(
                'Could not find the "Documents" content area in %s', self.base_url
            )
        else:
            for section in documents.find_all("div", class_="accordion-item"):
                metadata.extend(self._parse_section(section, str(filename)))
            time.sleep(throttle)

        outfile = self.data_dir.joinpath(f"{self.agency_slug}.json")
        self.cache.write_json(outfile, metadata)
        return outfile

    @staticmethod
    def _find_documents_area(soup):
        """Return the first content area preceded by a "Documents" heading, or None."""
        for content_area in soup.find_all("div", class_="content_area clearfix"):
            heading = content_area.find_previous("h2")
            if heading is not None and heading.get_text(strip=True) == "Documents":
                return content_area
        return None

    @staticmethod
    def _clean_text(text):
        """Swap non-breaking spaces and em dashes for plain-ASCII equivalents."""
        return text.replace("\u00a0", " ").replace("\u2014", "--")

    def _parse_section(self, section, parent_page):
        """Build one metadata payload per usable link inside an accordion section."""
        title_div = section.find("div", class_="title")
        if title_div is None:
            logger.warning("Accordion section without a title; case_type left blank")
            case_type = ""
        else:
            case_type = title_div.get_text(strip=True)

        payloads = []
        for link in section.find_all("a"):
            link_href = link.get("href")
            if not link_href:
                # Anchors without a target point at no file.
                continue

            paragraph = link.find_previous("p")
            case_id = "" if paragraph is None else self._clean_text(paragraph.get_text())
            # get_text() rather than .string: .string is None when the anchor
            # wraps nested markup, which would break the text clean-up.
            title = self._clean_text(link.get_text())

            # Clean up links. Check to see if it's a redirect:
            if self._REDIRECT_START in link_href:
                link_href = urllib.parse.unquote(
                    link_href.replace(self._REDIRECT_START, "").replace(
                        self._REDIRECT_END, ""
                    )
                )
                name = title
            else:
                name = link_href.split("/")[-1]

            # Resolve relative links against the index page URL.
            if urllib.parse.urlparse(link_href).netloc == "":
                link_href = urllib.parse.urljoin(self.base_url, link_href)

            payloads.append(
                {
                    "asset_url": link_href,
                    "case_id": case_id,
                    "name": name,
                    "title": title,
                    "parent_page": parent_page,
                    "details": {"case_type": case_type},
                }
            )
        return payloads


# File: clean/ca/config/chula_vista_pd.py
# Browser-like request headers sent when downloading the index page.
index_request_headers = {
    "accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.7",
    "accept-language": "en-US,en;q=0.9,nl;q=0.8,ur;q=0.7,ru;q=0.6",
    "cache-control": "max-age=0",
    "priority": "u=0, i",
    "sec-ch-ua": '"Not)A;Brand";v="99", "Google Chrome";v="127", "Chromium";v="127"',
    "sec-ch-ua-mobile": "?0",
    "sec-ch-ua-platform": '"Windows"',
    "sec-fetch-dest": "document",
    "sec-fetch-mode": "navigate",
    "sec-fetch-site": "none",
    "sec-fetch-user": "?1",
    "upgrade-insecure-requests": "1",
}
files changed, 46 insertions(+) create mode 100644 .github/ISSUE_TEMPLATE/bug_report.md create mode 100644 .github/ISSUE_TEMPLATE/feature_request.md diff --git a/.github/ISSUE_TEMPLATE/bug_report.md b/.github/ISSUE_TEMPLATE/bug_report.md new file mode 100644 index 0000000..8c232b4 --- /dev/null +++ b/.github/ISSUE_TEMPLATE/bug_report.md @@ -0,0 +1,32 @@ +--- +name: Bug report +about: Report an issue +title: '' +labels: '' +assignees: '' + +--- + +**Environment:** + - OS: [e.g. OS X, Windows] + - Shell [e.g. bash, zsh] + - Version [e.g. Python 3.XX] + +**Expected behavior** +A clear and concise description of what you expected to happen. + +**Describe the bug** +A clear and concise description of what the bug is. + +**To Reproduce** +Steps to reproduce the behavior: +1. Go to '...' +2. Run command '....' +3. Check logs '....' +4. See error + +**Screenshots** +If applicable, add screenshots to help explain your problem. + +**Additional context** +Add any other context or links about the problem here. 
def _with_user_agent(kwargs, user_agent):
    """Return a copy of ``kwargs`` whose headers carry the given User-Agent.

    The headers mapping is copied, so a dict supplied by the caller (for
    example a shared module-level headers constant) is never mutated. A
    missing or ``None`` ``headers`` entry is treated as empty.
    """
    headers = dict(kwargs.get("headers") or {})
    headers["User-Agent"] = user_agent
    return {**kwargs, "headers": headers}


def _require_ok(response, url):
    """Raise AssertionError unless ``response.ok`` is true.

    An explicit raise is used instead of an ``assert`` statement because
    asserts are removed when Python runs with ``-O``; the exception type is
    kept so existing callers see the same error.
    """
    if not response.ok:
        raise AssertionError(
            f"Request to {url} failed with status code {response.status_code}"
        )


@retry(tries=3, delay=15, backoff=2)
def post_url(
    url, user_agent="Big Local News (biglocalnews.org)", session=None, **kwargs
):
    """POST to the provided URL and return a response object.

    Args:
        url (str): the url to be requested
        user_agent (str): the user-agent header passed with the request (default: biglocalnews.org)
        session: a session object to use when making the request. optional
        **kwargs: passed through to ``session.post`` / ``requests.post``

    Returns:
        The response object of the successful request.

    Raises:
        AssertionError: if ``response.ok`` is false
    """
    logger.debug(f"Requesting {url}")

    # Set the headers on a copy so the caller's dict is left untouched
    request_kwargs = _with_user_agent(kwargs, user_agent)

    # Go get it
    if session is not None:
        logger.debug(f"Requesting with session {session}")
        response = session.post(url, **request_kwargs)
    else:
        response = requests.post(url, **request_kwargs)
    logger.debug(f"Response code: {response.status_code}")

    # Verify that the request succeeded
    _require_ok(response, url)

    # Return the response
    return response


@retry(tries=3, delay=15, backoff=2)
def get_cookies(url, user_agent="Big Local News (biglocalnews.org)", **kwargs):
    """GET the provided URL and return the cookies set by the response.

    Args:
        url (str): the url to be requested
        user_agent (str): the user-agent header passed with the request (default: biglocalnews.org)
        **kwargs: passed through to ``requests.get``

    Returns:
        dict: the response cookies, as produced by ``response.cookies.get_dict()``

    Raises:
        AssertionError: if ``response.ok`` is false
    """
    logger.debug(f"Requesting {url}")

    # Set the headers on a copy so the caller's dict is left untouched
    request_kwargs = _with_user_agent(kwargs, user_agent)
    response = requests.get(url, **request_kwargs)

    # Verify that the request succeeded
    _require_ok(response, url)

    # Return the cookies
    return response.cookies.get_dict()