diff --git a/juriscraper/AbstractSite.py b/juriscraper/AbstractSite.py index a3a1b0b77..7abcd3312 100644 --- a/juriscraper/AbstractSite.py +++ b/juriscraper/AbstractSite.py @@ -1,6 +1,6 @@ import hashlib import json -from datetime import date, datetime +from datetime import date, datetime, timedelta from typing import Dict, List, Tuple import certifi @@ -257,6 +257,7 @@ def _check_sanity(self): prior_case_name = name i += 1 + future_date_count = 0 for index, case_date in enumerate(self.case_dates): if not isinstance(case_date, date): raise InsanityException( @@ -266,24 +267,30 @@ def _check_sanity(self): ) # Sanitize case date, fix typo of current year if present fixed_date = fix_future_year_typo(case_date) + case_name = self.case_names[index] if fixed_date != case_date: logger.info( "Date year typo detected. Converting %s to %s " - "for case '%s' in %s" - % ( - case_date, - fixed_date, - self.case_names[index], - self.court_id, - ) + "for case '%s' in %s", + case_date, + fixed_date, + case_name, + self.court_id, ) case_date = fixed_date self.case_dates[index] = fixed_date - if case_date.year > 2025: - raise InsanityException( - "%s: member of case_dates list is from way in the future, " - "with value %s" % (self.court_id, case_date.year) - ) + + # dates should not be in the future. Tolerate a week + if case_date > (date.today() + timedelta(days=7)): + future_date_count += 1 + error = f"{self.court_id}: {case_date} date is in the future. Case '{case_name}'" + logger.error(error) + + # Interrupt data ingestion if more than 1 record has a bad date + if future_date_count > 1: + raise InsanityException( + f"More than 1 case has a date in the future. Last case: {error}" + ) # Is cookies a dict? if type(self.cookies) != dict: diff --git a/juriscraper/OpinionSite.py b/juriscraper/OpinionSite.py index 1cf7ce10a..975dcbc01 100644 --- a/juriscraper/OpinionSite.py +++ b/juriscraper/OpinionSite.py @@ -38,6 +38,7 @@ def __init__(self, *args, **kwargs): "per_curiam", "types", "other_dates", + "attorneys", ] self._req_attrs = [ "case_dates", @@ -134,6 +135,9 @@ def _get_per_curiam(self): def _get_other_dates(self): return None + def _get_attorneys(self): + return None + def extract_from_text(self, scraped_text): """Pass scraped text into function and return data as a dictionary diff --git a/juriscraper/OpinionSiteLinear.py b/juriscraper/OpinionSiteLinear.py index 15c74c7c8..c556feab9 100644 --- a/juriscraper/OpinionSiteLinear.py +++ b/juriscraper/OpinionSiteLinear.py @@ -40,6 +40,7 @@ class OpinionSiteLinear(OpinionSite): "type", "joined_by", "other_date", + "attorney", } def __init__(self, *args, **kwargs): @@ -153,6 +154,10 @@ def _get_other_dates(self): """Goes into OpinionCluster.other_dates, type: string""" return self._get_optional_field_by_id("other_date") + def _get_attorneys(self): + """Goes into OpinionCluster.attorneys, type: string""" + return self._get_optional_field_by_id("attorney") + def _check_sanity(self): super()._check_sanity() # Check that all returned keys have the proper name to be used diff --git a/juriscraper/lib/date_utils.py b/juriscraper/lib/date_utils.py index 713cbdf76..7e36c0ad3 100644 --- a/juriscraper/lib/date_utils.py +++ b/juriscraper/lib/date_utils.py @@ -4,6 +4,7 @@ from datetime import date from itertools import zip_longest from math import ceil +from typing import Union from dateutil.parser import parser, parserinfo from dateutil.rrule import DAILY, rrule @@ -150,3 +151,30 @@ def make_date_range_tuples(start, end, gap): for d in rrule(DAILY, interval=gap, dtstart=end_start, until=end) ] return list(zip_longest(start_dates, end_dates, fillvalue=end)) + + +def unique_year_month( + date_list: list[Union[date, datetime.datetime, tuple[date]]], +) -> list[Union[date, datetime.datetime]]: + """Takes a list of dates or date tuples, and reduces it + to date objects with unique year-months pairs + + :param date_list: a list containing dates or tuples of dates + default make_backscrape_iterable returns date tuples + :return: a list with date objects of unique year-month pairs + """ + unique_list = [] + seen_year_months = set() + + for obj in date_list: + if isinstance(obj, date) or isinstance(obj, datetime.datetime): + obj = [obj] + + for date_obj in obj: + ym = date_obj.strftime("%Y%m") + if ym in seen_year_months: + continue + seen_year_months.add(ym) + unique_list.append(date_obj) + + return unique_list diff --git a/juriscraper/opinions/united_states/state/__init__.py b/juriscraper/opinions/united_states/state/__init__.py index 933cb01ba..efda973a4 100644 --- a/juriscraper/opinions/united_states/state/__init__.py +++ b/juriscraper/opinions/united_states/state/__init__.py @@ -61,6 +61,7 @@ "kyctapp", "la", "lactapp_1", + "lactapp_5", "mass", "massappct", "massappct_u", diff --git a/juriscraper/opinions/united_states/state/la.py b/juriscraper/opinions/united_states/state/la.py index d454bf57e..88a1aaa17 100644 --- a/juriscraper/opinions/united_states/state/la.py +++ b/juriscraper/opinions/united_states/state/la.py @@ -7,6 +7,7 @@ # rgunn@lasc.org from datetime import date +from urllib.parse import urljoin from juriscraper.lib.html_utils import get_html_parsed_text from juriscraper.lib.string_utils import titlecase @@ -36,29 +37,31 @@ def _download(self, request_dict={}): return [self._get_subpage_html_by_url(url) for url in urls] def _process_html(self): - path = ( - "//a[" - "contains(., 'v.') or " - "contains(., 'IN RE') or " - "contains(., 'IN THE') or " - "contains(., 'vs.') or " - "contains(., 'VS.')" - "]" - ) + xpath = "//a[contains(@href, 'opinions') and contains(@href, 'pdf')]" for html in self.html: - for anchor in html.xpath(path): + for anchor in html.xpath(xpath): date_string = self._get_date_for_opinions(html) text = anchor.text_content() parts = text.split(None, 1) summary_lines = anchor.getparent().xpath("./text()") + + judge = self._get_judge_above_anchor(anchor) + per_curiam = False + if "per curiam" in judge.lower(): + per_curiam = True + judge = "" + self.cases.append( { "date": date_string, "docket": parts[0], - "judge": self._get_judge_above_anchor(anchor), + "judge": judge, + "per_curiam": per_curiam, "name": titlecase(parts[1]), "summary": " ".join(summary_lines).replace(text, ""), - "url": f"http://www.lasc.org{anchor.get('href')}", + "url": urljoin( + "http://www.lasc.org", anchor.get("href") + ), } ) diff --git a/juriscraper/opinions/united_states/state/lactapp_5.py b/juriscraper/opinions/united_states/state/lactapp_5.py new file mode 100644 index 000000000..a8c3318c0 --- /dev/null +++ b/juriscraper/opinions/united_states/state/lactapp_5.py @@ -0,0 +1,133 @@ +import re +from datetime import date, datetime + +from juriscraper.AbstractSite import logger +from juriscraper.lib.date_utils import unique_year_month +from juriscraper.lib.string_utils import titlecase +from juriscraper.OpinionSiteLinear import OpinionSiteLinear + + +class Site(OpinionSiteLinear): + id_to_case_mapper = { + "lblCaseTitle": "name", + "lblCaseNum": "docket", + "lblRulingJudge": "judge", + "lblDistrictCourtNo": "lower_court_number", + "lblLowerCourt": "lower_court", + "lblAttorney": "attorney", + } + first_opinion_date = datetime(1992, 1, 1) + days_interval = 28 # ensure a tick for each month + date_regex = re.compile(r"\d{2}/\d{2}/\d{4}") + + def __init__(self, *args, **kwargs): + super().__init__(*args, **kwargs) + self.court_id = self.__module__ + self.url = "https://www.fifthcircuit.org/searchopinions.aspx" + self.search_is_configured = False + self.parameters = { + "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlSearchOptions": "2", + } + self.target_date = datetime.today() + self.make_backscrape_iterable(kwargs) + self.status = "Unknown" + + def _process_html(self): + # We need to do a plain GET to get hidden inputs + # Then we can do our filtered request + if not self.test_mode_enabled(): + self.method = "POST" + + # We need to set the proper search filter the first time + if not self.search_is_configured: + self.update_hidden_inputs() + self.parameters["__EVENTTARGET"] = ( + "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlSearchOptions" + ) + self.html = self._download() + self.search_is_configured = True + + # Set the proper filters to get the actual data we want + self.update_date_filters() + self.update_hidden_inputs() + self.html = self._download() + + count_xpath = "//*[@id='cntBody_ctlOpinionSearch_Toggle_lblRecordCnt']" + logger.info(self.html.xpath(count_xpath)[0].text_content().strip()) + + for row in self.html.xpath("//tr[.//a[contains(@id, 'HyperLink_')]]"): + fixed_values = {} + for id_part, key in self.id_to_case_mapper.items(): + element = row.xpath(f".//*[contains(@id, '{id_part}')]") + if element: + fixed_values[key] = element[0].text_content().strip() + + fixed_values["name"] = titlecase(fixed_values["name"]) + if fixed_values.get("judge"): + fixed_values["judge"] = re.sub( + r"Hon\.[\s\n]+", "", fixed_values["judge"] + ) + + # Some cases have more than 1 opinion document (check example 2) + # Some cases have no links, they will be ignored by this loop + for anchor in row.xpath(".//a"): + # The opinion date is sometimes in the disposition text + disposition = "" + case_date = f"{self.target_date.year}/07/01" + date_filed_is_approximate = True + if disp_container := anchor.xpath("following-sibling::text()"): + disposition = disp_container[0].strip() + + if date_match := self.date_regex.search(disposition): + case_date = date_match.group(0) + disposition = disposition.rsplit(" on ", 1)[0].strip( + " '" + ) + date_filed_is_approximate = False + + case = { + "url": anchor.get("href"), + "disposition": disposition, + "date": case_date, + "date_filed_is_approximate": date_filed_is_approximate, + **fixed_values, + } + + self.cases.append(case) + + def update_hidden_inputs(self) -> None: + """Parse form values characteristic of aspx sites, + and put then on self.parameters for POST use + """ + for input in self.html.xpath('//input[@type="hidden"]'): + self.parameters[input.get("name")] = input.get("value", "") + + def update_date_filters(self) -> None: + """Set year and month values from `self.target_date` + into self.parameters for POST use + """ + logger.info( + "Scraping for year: %s - month: %s", + self.target_date.year, + self.target_date.month, + ) + self.parameters = { + "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnMonth": str( + self.target_date.month + ), + "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnYear": str( + self.target_date.year + ), + "ctl00$cntBody$ctlOpinionSearch_Toggle$btnSearch": "Search", + } + + def _download_backwards(self, target_date: date) -> None: + self.target_date = target_date + self.html = self._download() + self._process_html() + + def make_backscrape_iterable(self, kwargs): + super().make_backscrape_iterable(kwargs) + self.back_scrape_iterable = unique_year_month( + self.back_scrape_iterable + ) diff --git a/juriscraper/opinions/united_states/state/sc.py b/juriscraper/opinions/united_states/state/sc.py index 4af90fb76..de78e52a0 100644 --- a/juriscraper/opinions/united_states/state/sc.py +++ b/juriscraper/opinions/united_states/state/sc.py @@ -23,6 +23,7 @@ from typing import Dict, List, Tuple from juriscraper.AbstractSite import logger +from juriscraper.lib.date_utils import unique_year_month from juriscraper.OpinionSiteLinear import OpinionSiteLinear @@ -80,16 +81,9 @@ def make_backscrape_iterable( and replace the self.back_scrape_iterable """ super().make_backscrape_iterable(kwargs) - backscrape_iterable = [] - seen_year_months = set() - for date_obj, _ in self.back_scrape_iterable: - ym = date_obj.strftime("%Y%m") - if ym in seen_year_months: - continue - seen_year_months.add(ym) - backscrape_iterable.append(date_obj) - - self.back_scrape_iterable = backscrape_iterable + self.back_scrape_iterable = unique_year_month( + self.back_scrape_iterable + ) def _download_backwards(self, date_obj: date) -> None: """Downloads an older page, and parses it diff --git a/juriscraper/oral_args/united_states/federal_appellate/cadc.py b/juriscraper/oral_args/united_states/federal_appellate/cadc.py index 69902a445..4eb9067d5 100644 --- a/juriscraper/oral_args/united_states/federal_appellate/cadc.py +++ b/juriscraper/oral_args/united_states/federal_appellate/cadc.py @@ -11,6 +11,7 @@ from urllib.parse import urljoin from juriscraper.AbstractSite import logger +from juriscraper.lib.date_utils import unique_year_month from juriscraper.OralArgumentSiteLinear import OralArgumentSiteLinear @@ -56,16 +57,8 @@ def _process_html(self): } ) - def _download_backwards(self, url: str) -> None: - logger.info("Backscraping URL '%s'", url) - self.url = url - self.html = self._download() - self._process_html() - - def make_backscrape_iterable(self, kwargs: dict) -> None: - """Use base function to generate a range, then pick - unique year-month combinations to build the backscrape - URLS, and save them to the self.back_scrape_iterable + def _download_backwards(self, target_date: date) -> None: + """Download historical data Note that this URL will work: "https://media.cadc.uscourts.gov/recordings/bydate/2007/9" @@ -74,16 +67,17 @@ def make_backscrape_iterable(self, kwargs: dict) -> None: That's why the '%-m' formatter is needed """ - super().make_backscrape_iterable(kwargs) - seen_year_months = set() - urls = [] - - for tupl in self.back_scrape_iterable: - for item in tupl: - ym = item.strftime("%Y/%-m") - if ym in seen_year_months: - continue - seen_year_months.add(ym) - urls.append(self.base_url.format(ym)) + self.url = self.base_url.format(target_date.strftime("%Y/%-m")) + logger.info("Backscraping URL '%s'", self.url) + self.html = self._download() + self._process_html() - self.back_scrape_iterable = urls + def make_backscrape_iterable(self, kwargs: dict) -> None: + """Use base function to generate a range, then pick + unique year-month combinations to build the backscrape + URLS + """ + super().make_backscrape_iterable(kwargs) + self.back_scrape_iterable = unique_year_month( + self.back_scrape_iterable + ) diff --git a/tests/examples/opinions/united_states/la_example.compare.json b/tests/examples/opinions/united_states/la_example.compare.json index 00154dbf8..11e1478d7 100644 --- a/tests/examples/opinions/united_states/la_example.compare.json +++ b/tests/examples/opinions/united_states/la_example.compare.json @@ -1,14 +1,106 @@ [ { - "case_dates": "2020-04-27", - "case_names": "James J. Donelon, Commissioner of Insurance for the State of Louisiana, in His Capacity as Rehabilitator of Louisiana Health Cooperative, Inc. v. Terry S. Shilling, George G. Cromer, Warner L. Thomas, IV, William A. Oliver, Charles D. Calvi, Patrick C. Powers, Cgi Technologies and Solutions, Inc., Group Resources Incorporated, Beam Partners, LLC, Milliman, Inc., Buck Consultants, LLC, and Travelers Casualty and Surety Company of America", - "download_urls": "http://www.lasc.org/opinions/2020/19-0514.C.OPN.pdf", + "case_dates": "2024-06-28", + "case_names": "Watson Memorial Spiritual Temple of Christ D/B/A Watson Memorial Teaching Ministries, Charlotte Brancaforte, Elio Brancaforte, Benito Brancaforte, Josephine Brown, Robert Parke, Nancy Ellis, Mark Hamrick, Robert Link, Charlotte Link, Ross McDiarmid, Laurel McDiarmid, Jerry Osborne, Jack Stolier, and William Taylor v. Ghassan Korban, in His Capacity as Executive Director of the Sewerage and Water Board of New Orleans", + "download_urls": "http://www.lasc.org/opinions/2024/24-0055.C.OPN.pdf", "precedential_statuses": "Published", "blocked_statuses": false, "date_filed_is_approximate": false, - "docket_numbers": "2019-C-00514", + "docket_numbers": "2024-C-00055", + "judges": "Genovese, J.", + "summaries": "(Parish of Orleans Civil) AFFIRMED AND REMANDED TO THE DISTRICT COURT. SEE OPINION.", + "case_name_shorts": "", + "per_curiam": false + }, + { + "case_dates": "2024-06-28", + "case_names": "People for the Ethical Treatment of Animals v. Board of Supervisors of Louisiana State University and Thomas Galligan, Individually and in His Capacity of President of Louisiana State University", + "download_urls": "http://www.lasc.org/opinions/2024/23-1396.C.OPN.pdf", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "2023-C-01396", + "judges": "Weimer, C.J.", + "summaries": "(Parish of East Baton Rouge) AFFIRMED. SEE OPINION.", + "case_name_shorts": "", + "per_curiam": false + }, + { + "case_dates": "2024-06-28", + "case_names": "James Self; Wilma Self v. Bpx Operating Company", + "download_urls": "http://www.lasc.org/opinions/2024/23-1242.CQ.OPN.pdf", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "2023-CQ-01242", + "judges": "Griffin, J.", + "summaries": "CERTIFIED QUESTION ANSWERED. SEE OPINION.", + "case_name_shorts": "", + "per_curiam": false + }, + { + "case_dates": "2024-06-28", + "case_names": "In Re: Timothy A. Meche", + "download_urls": "http://www.lasc.org/opinions/2024/24-0262.B.OPN.pdf", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "2024-B-00262", + "judges": "", + "summaries": "SUSPENSION IMPOSED. SEE PER CURIAM.", + "case_name_shorts": "", + "per_curiam": true + }, + { + "case_dates": "2024-06-28", + "case_names": "In Re: Robert William Hjortsberg", + "download_urls": "http://www.lasc.org/opinions/2024/24-0149.B.OPN.pdf", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "2024-B-00149", + "judges": "", + "summaries": "SUSPENSION IMPOSED. SEE PER CURIAM.", + "case_name_shorts": "", + "per_curiam": true + }, + { + "case_dates": "2024-06-28", + "case_names": "In Re: Adam Granville Young", + "download_urls": "http://www.lasc.org/opinions/2024/24-0248.B.OPN.pdf", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "2024-B-00248", + "judges": "", + "summaries": "SUSPENSION IMPOSED. SEE PER CURIAM.", + "case_name_shorts": "", + "per_curiam": true + }, + { + "case_dates": "2024-06-28", + "case_names": "Barber Brothers Contracting Company, LLC v. Capitol City Produce Company, LLC; Frank Cushenberry; And Xyz Insurance Company C/W Frank Cushenberry and Robin Cushenberry, Individually and on Behalf of the Minor Children, Noah Cushenberry and Khloe Cushenberry v. Johnny Scott and Barber Brothers Contracting Company, LLC", + "download_urls": "http://www.lasc.org/opinions/2024/23-0788.C.OPN.pdf", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "2023-C-00788", + "judges": "McCallum, J.", + "summaries": "(Parish of East Baton Rouge) AFFIRMED AS AMENDED. SEE OPINION.", + "case_name_shorts": "", + "per_curiam": false + }, + { + "case_dates": "2024-06-28", + "case_names": "Angela Pickard v. amazon.com, Inc.", + "download_urls": "http://www.lasc.org/opinions/2024/23-1596.CQ.OPN.pdf", + "precedential_statuses": "Published", + "blocked_statuses": false, + "date_filed_is_approximate": false, + "docket_numbers": "2023-CQ-01596", "judges": "Crain, J.", - "summaries": "(Parish of East Baton Rouge) We granted this writ to determine whether the Louisiana Commissioner of Insurance, as rehabilitator of a health insurance cooperative, in an action arising out of an agreement between the cooperative and a third-party contractor, is bound by an arbitration clause in that agreement. We find the Commissioner not bound by the arbitration clause. REVERSED AND REMANDED.", - "case_name_shorts": "" + "summaries": "CERTIFIED QUESTIONS ANSWERED. SEE OPINION.", + "case_name_shorts": "", + "per_curiam": false } ] \ No newline at end of file diff --git a/tests/examples/opinions/united_states/la_example.html b/tests/examples/opinions/united_states/la_example.html index 9b9123c27..6301a84d8 100644 --- a/tests/examples/opinions/united_states/la_example.html +++ b/tests/examples/opinions/united_states/la_example.html @@ -1,28 +1,39 @@ + - - April 27, 2020 - Opinions - Louisiana Supreme Court + June 28, 2024 - Opinions - Louisiana Supreme Court + + - - - - - - - - + + + + + + + + + + + + + + + + + - - + + - - + + @@ -37,15 +48,15 @@
-
LASC
+
+ 400 Royal Street, New Orleans, LA 70130
- 400 Royal Street, New Orleans, LA 70130
- Hon. Bernette Joshua Johnson
- Chief Justice

- John Tarlton Olivier
- Clerk Of Court

+ Hon. John L. Weimer
+ Chief Justice

+ Veronica O. Koclanes
+ Clerk Of Court

Sandra A. Vujnovich
Judicial Administrator
@@ -69,51 +80,67 @@ -
@@ -122,7 +149,7 @@ - +