From f12b238c4ceff3a13f6dfe87b5ea8c115e03f2e6 Mon Sep 17 00:00:00 2001 From: Mike Stucka Date: Fri, 30 Aug 2024 14:03:26 -0400 Subject: [PATCH] Los Angeles Sheriff's Department for #51 (#54) * Proof of concept, missing Class * Export out usable case index * Incremental work consolidating things * First attempt at class * closer * Closer * Implement logging * Polish * Proof of concept, missing Class * Export out usable case index * Incremental work consolidating things * First attempt at class * Kill notebook version brought back by rebase * Move ugly details to config file * Rename ugly detail file * Linting * Fix linting * Proof of concept, missing Class * Export out usable case index * Incremental work consolidating things * First attempt at class * Kill notebook version brought back by rebase * Proof of concept, missing Class * Export out usable case index * Incremental work consolidating things * First attempt at class * Proof of concept, missing Class * Export out usable case index * Incremental work consolidating things * First attempt at class * Proof of concept, missing Class * Proof of concept, missing Class * Export out usable case index * Incremental work consolidating things * First attempt at class * Move ugly details to config file * Rename ugly detail file * Linting * Fix linting * Build against #69 flag #70 * ... 
* Apply suggestions * Clean up notes * Proof of concept, missing Class * Export out usable case index * Incremental work consolidating things * First attempt at class * closer * Closer * Implement logging * Polish * Proof of concept, missing Class * Export out usable case index * Incremental work consolidating things * First attempt at class * Kill notebook version brought back by rebase * Move ugly details to config file * Rename ugly detail file * Linting * Fix linting * Proof of concept, missing Class * Export out usable case index * Incremental work consolidating things * First attempt at class * Kill notebook version brought back by rebase * Proof of concept, missing Class * Export out usable case index * Incremental work consolidating things * First attempt at class * Proof of concept, missing Class * Export out usable case index * Incremental work consolidating things * First attempt at class * Proof of concept, missing Class * Proof of concept, missing Class * Export out usable case index * Incremental work consolidating things * First attempt at class * Move ugly details to config file * Rename ugly detail file * Linting * Fix linting * Build against #69 flag #70 * ... 
* Apply suggestions
* Clean up notes
* Copypaste around rebase problems

---------

Co-authored-by: Gerald Rich <1578563+newsroomdev@users.noreply.github.com>
---
 clean/ca/config/los_angeles_sheriff.py |  53 +++++
 clean/ca/los_angeles_sheriff.py        | 290 +++++++++++++++++++++++++
 2 files changed, 343 insertions(+)
 create mode 100644 clean/ca/config/los_angeles_sheriff.py
 create mode 100644 clean/ca/los_angeles_sheriff.py

diff --git a/clean/ca/config/los_angeles_sheriff.py b/clean/ca/config/los_angeles_sheriff.py
new file mode 100644
index 00000000..00ca3382
--- /dev/null
+++ b/clean/ca/config/los_angeles_sheriff.py
@@ -0,0 +1,53 @@
+# Request headers and payloads for the Los Angeles Sheriff's Department scraper.
+#
+# NOTE(review): the cookies and __RequestVerificationToken values below look like
+# they were captured from live browser sessions; presumably they can expire and
+# would then need to be captured again. Confirm before relying on them long-term.
+
+index_request_headers = {
+    "__requestverificationtoken": "kV60zFyBJ_k-mjeiu_6NIKgUlvNWfcwZ9_D29bWM84LeQ5-hNWPjAvr1VVehyAmYc2Cyp9edrQaHD-AKr4duQQPWGxPKvb0mCDZIXIY68NM1",
+    "accept": "application/json, text/javascript, */*; q=0.01",
+    "accept-language": "en-US,en;q=0.9",
+    "content-type": "application/json; charset=UTF-8",
+    "cookie": "Dynamics365PortalAnalytics=WfAhGy4JV13-E0dhKke0kztJdjYVyjtsY_vFGiSDZAN-KN83-o4lKIwHCj6Rgfuge-xA4zygTbU6OSjgoo1yp5Kw_JU9nd9NHo4FJPYh3DgEYMm16_293HSPMmfYEaGcT7Cw0h4zw3dIqO8J0A3xPw2; ASP.NET_SessionId=djn0vjtl3u2sagzyduk23cab; ARRAffinity=254b55dea5200c22439ddc2bd303a9f6d5189518bb2c795f872095b53e417c82; ARRAffinitySameSite=254b55dea5200c22439ddc2bd303a9f6d5189518bb2c795f872095b53e417c82; timezoneoffset=240; isDSTSupport=true; isDSTObserved=true; ContextLanguageCode=en-US; __RequestVerificationToken=PXUpizhW17-bet0Sh6T6F_W58jnEZDYJXOqylnNVXsykXoWqoLgcYYn2BWOhWpmBhbHqNJJbPujincEmcn0ZBHak6MOK0CifmoNBtxE5ofY1; timeZoneCode=35",
+    "origin": "https://lasdsb1421.powerappsportals.us",
+    "priority": "u=1, i",
+    "referer": "https://lasdsb1421.powerappsportals.us/dis/",
+    "request-id": "|5c4f7bc1c8ca42d9901887a721e67944.46457b1429534d7b",
+    "sec-ch-ua": '"Not/A)Brand";v="8", "Chromium";v="126", "Google Chrome";v="126"',
+    "sec-ch-ua-mobile": "?0",
+    "sec-ch-ua-platform": '"Windows"',
+    "sec-fetch-dest": "empty",
+    "sec-fetch-mode": "cors",
+    "sec-fetch-site": "same-origin",
+    "traceparent": "00-5c4f7bc1c8ca42d9901887a721e67944-46457b1429534d7b-01",
+    "user-agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/126.0.0.0 Safari/537.36",
+    "x-requested-with": "XMLHttpRequest",
+}
+
+index_payload = '{"base64SecureConfiguration":"","sortExpression":"sb1421_eventdate ASC","search":"","page":1,"pageSize":9999,"pagingCookie":"","filter":null,"metaFilter":null,"timezoneOffset":240,"customParameters":[]}'
+
+detail_request_headers = {
+    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:128.0) Gecko/20100101 Firefox/128.0",
+    "Accept": "application/json, text/javascript, */*; q=0.01",
+    "Accept-Language": "en-US,en;q=0.5",
+    "Accept-Encoding": "gzip, deflate, br, zstd",
+    "Content-Type": "application/json",
+    "__RequestVerificationToken": "eHcTqQbCi1LqT2xhe50AZS-IY_4JPB6S-WOyeZ_43BorhlZfHO77Q69jKWO3bctuMtKNHSjY_SxQmKCmC0G2N8vhr-3KKu8cOa4GJ15NgOE1",
+    "X-Requested-With": "XMLHttpRequest",
+    # NOTE(review): the leading "^" looks like a Windows-shell escape left over from
+    # a copied cURL command (compare "request-id" above); confirm before changing.
+    "Request-Id": "^|180fc898383b4cdea9562818e9ccb2f0.6971a699587e4f02",
+    "traceparent": "00-180fc898383b4cdea9562818e9ccb2f0-6971a699587e4f02-01",
+    "Origin": "https://lasdsb1421.powerappsportals.us",
+    "Connection": "keep-alive",
+    "Referer": "https://lasdsb1421.powerappsportals.us/disfiles/?id=13434aab-ab8b-ed11-81ad-001dd830a125",
+    "Cookie": "Dynamics365PortalAnalytics=I96I2Tvt4N-gPaURejqoFAgdfpCOkV7mfdXsXEgZZq8CooQCFX8ewO5C6tTxgHKGjV8Nqh30acufK6AFfDtdV_SivR7HLAZg5f476jxkzB394E5aPLo8PDI_xXsBmLWgXb5Sf28dZJ2CxuI4re7ZEA2; ASP.NET_SessionId=2k2vrqpb53tklzcqz0ftqqyy; ARRAffinity=254b55dea5200c22439ddc2bd303a9f6d5189518bb2c795f872095b53e417c82; ARRAffinitySameSite=254b55dea5200c22439ddc2bd303a9f6d5189518bb2c795f872095b53e417c82; timezoneoffset=240; isDSTSupport=true; isDSTObserved=true; ContextLanguageCode=en-US; timeZoneCode=35; __RequestVerificationToken=Y4mVGr7Dq1OfgQav9ztK4nDJNNtdU450gGRn6puub7-qbXeiwIiFBzyn-ZFIiwLgFTh13dMhEtTlTXdIUiXIlVaAKO9XENzlm-qMbNC5Egg1",
+    "Sec-Fetch-Dest": "empty",
+    "Sec-Fetch-Mode": "cors",
+    "Sec-Fetch-Site": "same-origin",
+    "TE": "trailers",
+}
+
+# "IDGOESHERE" is a placeholder that the scraper replaces with the record ID.
+detail_payload = '{"regarding":{"Id":"IDGOESHERE","LogicalName":"sb1421_sb1421responsiverecords","Name":null,"KeyAttributes":[],"RowVersion":null},"sortExpression":"FileLeafRef ASC","page":1,"pageSize":9990,"folderPath":""}'
diff --git a/clean/ca/los_angeles_sheriff.py b/clean/ca/los_angeles_sheriff.py
new file mode 100644
index 00000000..734cae2d
--- /dev/null
+++ b/clean/ca/los_angeles_sheriff.py
@@ -0,0 +1,290 @@
+import logging
+import time
+from pathlib import Path
+
+import requests
+
+from .. import utils
+from ..cache import Cache
+from .config.los_angeles_sheriff import (
+    detail_payload,
+    detail_request_headers,
+    index_payload,
+    index_request_headers,
+)
+
+logger = logging.getLogger(__name__)
+
+
+class Site:
+    """Scrapes California's Los Angeles Sheriff's Department.
+
+    Notes:
+        Several things in this scraper may break with library updates or standardization efforts.
+        cache.write_json and cache.read_json are using absolute paths.
+        There is no standardized POST function yet.
+        BLN request headers are not used, though those might break the scraper.
+    """
+
+    name = "Los Angeles Sheriff's Department"
+
+    def __init__(self, data_dir=utils.CLEAN_DATA_DIR, cache_dir=utils.CLEAN_CACHE_DIR):
+        """Set up site constants and create the working directories.
+
+        Args:
+            data_dir: Directory the final asset list is written to.
+            cache_dir: Directory holding the cached index and detail JSON.
+        """
+        self.siteslug = "ca_los_angeles_sheriff"
+        self.rooturl = "https://lasdsb1421.powerappsportals.us"
+        self.filestoignore = [
+            "index",
+            "timestamplog",
+            self.siteslug,
+            "caseindex",
+        ]  # What cached JSON files aren't page-level JSONs?
+        self.base_url = "https://lasd.org/"
+        self.disclosure_url = "https://lasdsb1421.powerappsportals.us/"
+        self.data_dir = data_dir
+        self.cache_dir = cache_dir
+        self.cache = Cache(cache_dir)
+        self.subpages_dir = cache_dir / (self.siteslug + "/subpages")
+        for localdir in [self.cache_dir, self.data_dir, self.subpages_dir]:
+            utils.create_directory(localdir)
+
+    def scrape_meta(self, throttle: int = 0) -> Path:
+        """Refresh the cached index and detail pages, then write the asset list.
+
+        Args:
+            throttle: Seconds to sleep after each detail-page download.
+
+        Returns:
+            Path of the saved asset-list JSON file.
+        """
+        rawindex = self._fetch_index()
+        oldtimestamps = self._fetch_old_timestamps()
+        indextimes = self._build_timestamps(rawindex)
+        detailtodo = self._build_detail_todo(indextimes, oldtimestamps)
+        failed = self._fetch_detail_pages(detailtodo, throttle)
+        # Leave failed downloads out of the log so the next run retries them
+        # instead of treating a stale cached copy as current.
+        for recordid in failed:
+            indextimes.pop(recordid, None)
+        self._save_timestamps(indextimes)
+        caseindex = self._build_caseindex(rawindex)
+        assetlist = self._build_assetlist(caseindex)
+        assetlist_filename = self._save_assetlist(assetlist)
+        return assetlist_filename
+
+    def _fetch_index(self):
+        """Download the full case index, cache it, and return it parsed.
+
+        Returns:
+            The index JSON as a dict.
+
+        Raises:
+            requests.HTTPError: If the portal answers with an error status.
+        """
+        indexjsonurl = "https://lasdsb1421.powerappsportals.us/_services/entity-grid-data.json/f46b70cc-580b-4f1a-87c3-41deb48eb90d"
+        r = requests.post(
+            indexjsonurl,
+            headers=index_request_headers,
+            data=index_payload,
+        )
+        # Fail before caching so an error page never overwrites a good index.
+        r.raise_for_status()
+        targetfilename = f"{self.siteslug}/index.json"
+        self.cache.write_binary(targetfilename, r.content)
+        # FIXME:
+        # with open(self.cache_dir / (self.siteslug + "/index.json"), "wb") as outfile:
+        #     outfile.write(r.content)
+        rawindex = self.cache.read_json(self.cache_dir / targetfilename)
+        # TODO: #70 implementation affects above
+        if rawindex["MoreRecords"] or len(rawindex["Records"]) != rawindex["ItemCount"]:
+            logger.error("Index JSON is incomplete or broken.")
+        else:
+            logger.debug(f"{rawindex['ItemCount']:,} records found.")
+        return rawindex
+
+    def _build_timestamps(self, rawindex: dict):
+        """Map each record ID to a fingerprint of its attributes' ModifiedOn values.
+
+        The fingerprints are only ever compared for equality between runs.
+        """
+        indextimes = {}
+        for record in rawindex["Records"]:
+            indextimes[record["Id"]] = "".join(
+                entry["AttributeMetadata"]["ModifiedOn"]
+                for entry in record["Attributes"]
+            )
+        return indextimes
+
+    def _fetch_old_timestamps(self):
+        """Return the timestamp log saved by a previous run, or {} if none exists."""
+        partfilename = self.siteslug + "/timestamplog.json"
+        if not self.cache.exists(partfilename):
+            return {}
+        return self.cache.read_json(self.cache_dir / partfilename)
+
+    def _save_timestamps(self, indextimestamps):
+        """Write the timestamp log that the next run compares against."""
+        targetfilename = self.siteslug + "/timestamplog.json"
+        self.cache.write_json(self.cache_dir / targetfilename, indextimestamps)
+
+    def _get_detail_json(self, recordid: str) -> bool:
+        """Download one record's file listing into the subpages cache.
+
+        Args:
+            recordid: Record ID taken from the index.
+
+        Returns:
+            True if the response was OK and cached, False otherwise.
+        """
+        referer = "https://lasdsb1421.powerappsportals.us/disfiles/?id=" + recordid
+        # Build a per-request copy; assigning into the imported dict would
+        # mutate the headers shared by the whole module.
+        local_request_headers = {**detail_request_headers, "Referer": referer}
+        local_payload = detail_payload.replace("IDGOESHERE", recordid)
+        targeturl = (
+            "https://lasdsb1421.powerappsportals.us/_services/sharepoint-data.json/"
+            + recordid
+        )
+        targetfilename = f"{self.siteslug}/subpages/{recordid}.json"
+        r = requests.post(
+            targeturl,
+            headers=local_request_headers,
+            data=local_payload,
+        )
+        if not r.ok:
+            logger.warning(f"Problem downloading detail JSON for {recordid}")
+            return False
+        self.cache.write_binary(targetfilename, r.content)
+        return True
+
+    def _build_detail_file_list(self):
+        """Return the set of record IDs that already have a cached detail file."""
+        cachefiles = self.cache.files(subdir=self.siteslug + "/subpages")
+        recordsdownloaded = set()
+        for cachefile in cachefiles:
+            corefilename = (
+                cachefile.replace("\\", "/").split("/")[-1].replace(".json", "")
+            )
+            if corefilename not in self.filestoignore:
+                recordsdownloaded.add(corefilename)
+        return recordsdownloaded
+
+    def _build_detail_todo(self, indextimes, oldtimestamps):
+        """Pick the record IDs whose detail pages need downloading.
+
+        A record is selected when it has no cached detail file, has no entry in
+        the old timestamp log, or its fingerprint differs from the logged one.
+        """
+        recordsdownloaded = self._build_detail_file_list()
+        todo = {
+            recordid
+            for recordid, timestamp in indextimes.items()
+            if recordid not in recordsdownloaded
+            or oldtimestamps.get(recordid) != timestamp
+        }
+        logger.debug(f"{len(todo):,} subpages to download")
+        return todo
+
+    def _fetch_detail_pages(self, detailtodo, throttle):
+        """Download each listed detail page, sleeping `throttle` seconds after each.
+
+        Returns:
+            Set of record IDs whose download failed.
+        """
+        failed = set()
+        for recordid in detailtodo:
+            if not self._get_detail_json(recordid):
+                failed.add(recordid)
+            time.sleep(throttle)
+        return failed
+
+    def _build_caseindex(self, rawindex):
+        """Flatten the raw index into one summary dict per record, keyed by record ID."""
+        caseindex = {}
+        sectiontypes = [
+            "case_number",
+            "recordid",
+            "case_type",
+            "suspectvictim",
+            "event_date_epoch",
+            "event_date_human",
+            "release_date_epoch",
+            "release_date_human",
+        ]
+        for record in rawindex["Records"]:
+            # Start every field at None so attributes absent from a record stay None.
+            line = dict.fromkeys(sectiontypes)
+            line["recordid"] = record["Id"]
+            for a in record["Attributes"]:
+                if a["Name"] == "sb1421_name":
+                    line["case_number"] = a["Value"]
+                elif a["Name"] == "sb1421_caseorincidenttype":
+                    line["case_type"] = a["DisplayValue"]
+                elif a["Name"] == "sb1421_suspectvictim":
+                    line["suspectvictim"] = a["Value"]
+                elif a["Name"] == "sb1421_publicreleasedate":
+                    line["release_date_human"] = a["DisplayValue"]
+                    # Keep only the integer between the parentheses of the raw value.
+                    line["release_date_epoch"] = int(
+                        a["Value"].split("(")[1].split(")")[0]
+                    )
+                elif a["Name"] == "sb1421_eventdate":
+                    line["event_date_human"] = a["DisplayValue"]
+                    line["event_date_epoch"] = int(
+                        a["Value"].split("(")[1].split(")")[0]
+                    )
+            caseindex[line["recordid"]] = line
+        return caseindex
+
+    def _build_assetlist(self, caseindex):
+        """Build one asset entry per file listed in the cached detail pages.
+
+        Cached detail pages whose record is missing from `caseindex` are skipped
+        with a warning.
+        """
+        assetlist = []
+        # Sorted so the output order is the same from run to run.
+        for recordid in sorted(self._build_detail_file_list()):
+            if recordid not in caseindex:
+                logger.warning(f"No index entry for cached detail page {recordid}")
+                continue
+            case = caseindex[recordid]
+            sourcefile = self.cache_dir / f"{self.siteslug}/subpages/{recordid}.json"
+            localjson = self.cache.read_json(sourcefile)
+            for asset in localjson["SharePointItems"]:
+                line = {}
+                line["asset_url"] = self.rooturl + asset["Url"]
+                line["name"] = asset["Name"]
+                line["parent_page"] = str(sourcefile).replace("\\", "/").split("/")[-1]
+                line["title"] = asset["Name"]
+                line["case_id"] = case["case_number"]
+                line["details"] = {}
+                line["details"]["filesize"] = asset["FileSize"]
+                line["details"]["date_modified"] = asset["ModifiedOnDisplay"]
+                line["details"]["date_created"] = asset["CreatedOnDisplay"]
+                for item in [
+                    "case_type",
+                    "suspectvictim",
+                    "event_date_epoch",
+                    "event_date_human",
+                    "release_date_epoch",
+                    "release_date_human",
+                ]:
+                    line["details"][("case_" + item).replace("case_case_", "case_")] = (
+                        case[item]
+                    )
+                assetlist.append(line)
+        return assetlist
+
+    def _save_assetlist(self, assetlist):
+        """Write the asset list to <data_dir>/<siteslug>.json and return that path."""
+        targetfilename = self.data_dir / (self.siteslug + ".json")
+        logger.debug(f"Saving asset list to {targetfilename}")
+        # targetfilename already includes data_dir, so it is not joined onto
+        # cache_dir a second time.
+        self.cache.write_json(targetfilename, assetlist)
+        return targetfilename