Commit

Merge branch 'main' into fix_coloctapp_cleanup_content
flooie authored Oct 18, 2024
2 parents 4f767c5 + 0900555 commit 8cd0f56
Showing 15 changed files with 9,445 additions and 203 deletions.
33 changes: 20 additions & 13 deletions juriscraper/AbstractSite.py
@@ -1,6 +1,6 @@
 import hashlib
 import json
-from datetime import date, datetime
+from datetime import date, datetime, timedelta
 from typing import Dict, List, Tuple

 import certifi
@@ -257,6 +257,7 @@ def _check_sanity(self):
             prior_case_name = name
             i += 1

+        future_date_count = 0
         for index, case_date in enumerate(self.case_dates):
             if not isinstance(case_date, date):
                 raise InsanityException(
@@ -266,24 +267,30 @@ def _check_sanity(self):
                 )
             # Sanitize case date, fix typo of current year if present
             fixed_date = fix_future_year_typo(case_date)
+            case_name = self.case_names[index]
             if fixed_date != case_date:
                 logger.info(
                     "Date year typo detected. Converting %s to %s "
-                    "for case '%s' in %s"
-                    % (
-                        case_date,
-                        fixed_date,
-                        self.case_names[index],
-                        self.court_id,
-                    )
+                    "for case '%s' in %s",
+                    case_date,
+                    fixed_date,
+                    case_name,
+                    self.court_id,
                 )
                 case_date = fixed_date
+                self.case_dates[index] = fixed_date
-            if case_date.year > 2025:
-                raise InsanityException(
-                    "%s: member of case_dates list is from way in the future, "
-                    "with value %s" % (self.court_id, case_date.year)
-                )

+            # dates should not be in the future. Tolerate a week
+            if case_date > (date.today() + timedelta(days=7)):
+                future_date_count += 1
+                error = f"{self.court_id}: {case_date} date is in the future. Case '{case_name}'"
+                logger.error(error)
+
+            # Interrupt data ingestion if more than 1 record has a bad date
+            if future_date_count > 1:
+                raise InsanityException(
+                    f"More than 1 case has a date in the future. Last case: {error}"
+                )

         # Is cookies a dict?
         if type(self.cookies) != dict:
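Note on the hunk above: the hardcoded year-2025 check is replaced by a rolling rule that tolerates dates up to a week ahead and only aborts ingestion once a second far-future date appears. A minimal standalone sketch of that logic (a mirror of the diff, not the juriscraper API):

from datetime import date, timedelta

def check_future_dates(case_dates, tolerance_days=7):
    # Mirror of the new sanity check: tolerate slightly-future dates,
    # abort once a second far-future date shows up.
    cutoff = date.today() + timedelta(days=tolerance_days)
    future_date_count = 0
    for case_date in case_dates:
        if case_date > cutoff:
            future_date_count += 1
            if future_date_count > 1:
                raise ValueError(
                    f"More than 1 case has a date in the future: {case_date}"
                )

check_future_dates([date.today() + timedelta(days=30)])        # one future date: tolerated
# check_future_dates([date.today() + timedelta(days=30)] * 2)  # a second one would raise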
4 changes: 4 additions & 0 deletions juriscraper/OpinionSite.py
@@ -38,6 +38,7 @@ def __init__(self, *args, **kwargs):
             "per_curiam",
             "types",
             "other_dates",
+            "attorneys",
         ]
         self._req_attrs = [
             "case_dates",
@@ -134,6 +135,9 @@ def _get_per_curiam(self):
     def _get_other_dates(self):
         return None

+    def _get_attorneys(self):
+        return None
+
     def extract_from_text(self, scraped_text):
         """Pass scraped text into function and return data as a dictionary
5 changes: 5 additions & 0 deletions juriscraper/OpinionSiteLinear.py
@@ -40,6 +40,7 @@ class OpinionSiteLinear(OpinionSite):
         "type",
         "joined_by",
         "other_date",
+        "attorney",
     }

     def __init__(self, *args, **kwargs):
@@ -153,6 +154,10 @@ def _get_other_dates(self):
         """Goes into OpinionCluster.other_dates, type: string"""
         return self._get_optional_field_by_id("other_date")

+    def _get_attorneys(self):
+        """Goes into OpinionCluster.attorneys, type: string"""
+        return self._get_optional_field_by_id("attorney")
+
     def _check_sanity(self):
         super()._check_sanity()
         # Check that all returned keys have the proper name to be used
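Taken with the OpinionSite change above, this wires a new optional field through the linear API: a scraper puts a singular "attorney" key in each case dict, and the site exposes the values through the plural getter. A rough sketch of that convention, with assumed data and an assumed empty-string default (the actual default of _get_optional_field_by_id may differ):

cases = [
    {"name": "Smith v. Jones", "attorney": "Jane Roe"},
    {"name": "In re Doe"},  # the field is optional
]

def get_attorneys(cases):
    # Mirrors the spirit of _get_optional_field_by_id("attorney"):
    # one value per case; the "" fallback here is an assumption.
    return [case.get("attorney", "") for case in cases]

print(get_attorneys(cases))  # ['Jane Roe', '']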
28 changes: 28 additions & 0 deletions juriscraper/lib/date_utils.py
@@ -4,6 +4,7 @@
 from datetime import date
 from itertools import zip_longest
 from math import ceil
+from typing import Union

 from dateutil.parser import parser, parserinfo
 from dateutil.rrule import DAILY, rrule
@@ -150,3 +151,30 @@ def make_date_range_tuples(start, end, gap):
         for d in rrule(DAILY, interval=gap, dtstart=end_start, until=end)
     ]
     return list(zip_longest(start_dates, end_dates, fillvalue=end))
+
+
+def unique_year_month(
+    date_list: list[Union[date, datetime.datetime, tuple[date]]],
+) -> list[Union[date, datetime.datetime]]:
+    """Takes a list of dates or date tuples, and reduces it
+    to date objects with unique year-month pairs
+
+    :param date_list: a list containing dates or tuples of dates
+        default make_backscrape_iterable returns date tuples
+    :return: a list with date objects of unique year-month pairs
+    """
+    unique_list = []
+    seen_year_months = set()
+
+    for obj in date_list:
+        if isinstance(obj, date) or isinstance(obj, datetime.datetime):
+            obj = [obj]
+
+        for date_obj in obj:
+            ym = date_obj.strftime("%Y%m")
+            if ym in seen_year_months:
+                continue
+            seen_year_months.add(ym)
+            unique_list.append(date_obj)
+
+    return unique_list
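A quick usage sketch of the new helper, with made-up dates; the tuple input matches what make_backscrape_iterable produces by default:

from datetime import date

from juriscraper.lib.date_utils import unique_year_month

dates = [
    date(2024, 5, 1),
    date(2024, 5, 20),  # same year-month as the first entry: dropped
    (date(2024, 6, 3), date(2024, 6, 10)),  # tuples are flattened; June kept once
]
print(unique_year_month(dates))
# [datetime.date(2024, 5, 1), datetime.date(2024, 6, 3)]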
1 change: 1 addition & 0 deletions juriscraper/opinions/united_states/state/__init__.py
@@ -61,6 +61,7 @@
     "kyctapp",
     "la",
     "lactapp_1",
+    "lactapp_5",
     "mass",
     "massappct",
     "massappct_u",
27 changes: 15 additions & 12 deletions juriscraper/opinions/united_states/state/la.py
@@ -7,6 +7,7 @@
 # [email protected]

 from datetime import date
+from urllib.parse import urljoin

 from juriscraper.lib.html_utils import get_html_parsed_text
 from juriscraper.lib.string_utils import titlecase
@@ -36,29 +37,31 @@ def _download(self, request_dict={}):
         return [self._get_subpage_html_by_url(url) for url in urls]

     def _process_html(self):
-        path = (
-            "//a["
-            "contains(., 'v.') or "
-            "contains(., 'IN RE') or "
-            "contains(., 'IN THE') or "
-            "contains(., 'vs.') or "
-            "contains(., 'VS.')"
-            "]"
-        )
+        xpath = "//a[contains(@href, 'opinions') and contains(@href, 'pdf')]"
         for html in self.html:
-            for anchor in html.xpath(path):
+            for anchor in html.xpath(xpath):
                 date_string = self._get_date_for_opinions(html)
                 text = anchor.text_content()
                 parts = text.split(None, 1)
                 summary_lines = anchor.getparent().xpath("./text()")
+
+                judge = self._get_judge_above_anchor(anchor)
+                per_curiam = False
+                if "per curiam" in judge.lower():
+                    per_curiam = True
+                    judge = ""

                 self.cases.append(
                     {
                         "date": date_string,
                         "docket": parts[0],
-                        "judge": self._get_judge_above_anchor(anchor),
+                        "judge": judge,
+                        "per_curiam": per_curiam,
                         "name": titlecase(parts[1]),
                         "summary": " ".join(summary_lines).replace(text, ""),
-                        "url": f"http://www.lasc.org{anchor.get('href')}",
+                        "url": urljoin(
+                            "http://www.lasc.org", anchor.get("href")
+                        ),
                     }
                 )

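The switch from an f-string to urljoin makes URL building robust to whichever form the scraped href takes; for example (standard-library behavior, illustrative paths):

from urllib.parse import urljoin

base = "http://www.lasc.org"
print(urljoin(base, "/opinions/2024/doc.pdf"))
# http://www.lasc.org/opinions/2024/doc.pdf
print(urljoin(base, "http://www.lasc.org/opinions/2024/doc.pdf"))
# http://www.lasc.org/opinions/2024/doc.pdf -- absolute hrefs pass through,
# where the old f-string would have produced a doubled domain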
133 changes: 133 additions & 0 deletions juriscraper/opinions/united_states/state/lactapp_5.py
@@ -0,0 +1,133 @@
+import re
+from datetime import date, datetime
+
+from juriscraper.AbstractSite import logger
+from juriscraper.lib.date_utils import unique_year_month
+from juriscraper.lib.string_utils import titlecase
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
+
+
+class Site(OpinionSiteLinear):
+    id_to_case_mapper = {
+        "lblCaseTitle": "name",
+        "lblCaseNum": "docket",
+        "lblRulingJudge": "judge",
+        "lblDistrictCourtNo": "lower_court_number",
+        "lblLowerCourt": "lower_court",
+        "lblAttorney": "attorney",
+    }
+    first_opinion_date = datetime(1992, 1, 1)
+    days_interval = 28  # ensure a tick for each month
+    date_regex = re.compile(r"\d{2}/\d{2}/\d{4}")
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.court_id = self.__module__
+        self.url = "https://www.fifthcircuit.org/searchopinions.aspx"
+        self.search_is_configured = False
+        self.parameters = {
+            "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlSearchOptions": "2",
+        }
+        self.target_date = datetime.today()
+        self.make_backscrape_iterable(kwargs)
+        self.status = "Unknown"
+
+    def _process_html(self):
+        # We need to do a plain GET to get hidden inputs
+        # Then we can do our filtered request
+        if not self.test_mode_enabled():
+            self.method = "POST"
+
+            # We need to set the proper search filter the first time
+            if not self.search_is_configured:
+                self.update_hidden_inputs()
+                self.parameters["__EVENTTARGET"] = (
+                    "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlSearchOptions"
+                )
+                self.html = self._download()
+                self.search_is_configured = True
+
+            # Set the proper filters to get the actual data we want
+            self.update_date_filters()
+            self.update_hidden_inputs()
+            self.html = self._download()
+
+        count_xpath = "//*[@id='cntBody_ctlOpinionSearch_Toggle_lblRecordCnt']"
+        logger.info(self.html.xpath(count_xpath)[0].text_content().strip())
+
+        for row in self.html.xpath("//tr[.//a[contains(@id, 'HyperLink_')]]"):
+            fixed_values = {}
+            for id_part, key in self.id_to_case_mapper.items():
+                element = row.xpath(f".//*[contains(@id, '{id_part}')]")
+                if element:
+                    fixed_values[key] = element[0].text_content().strip()
+
+            fixed_values["name"] = titlecase(fixed_values["name"])
+            if fixed_values.get("judge"):
+                fixed_values["judge"] = re.sub(
+                    r"Hon\.[\s\n]+", "", fixed_values["judge"]
+                )
+
+            # Some cases have more than 1 opinion document (check example 2)
+            # Some cases have no links, they will be ignored by this loop
+            for anchor in row.xpath(".//a"):
+                # The opinion date is sometimes in the disposition text
+                disposition = ""
+                case_date = f"{self.target_date.year}/07/01"
+                date_filed_is_approximate = True
+                if disp_container := anchor.xpath("following-sibling::text()"):
+                    disposition = disp_container[0].strip()
+
+                    if date_match := self.date_regex.search(disposition):
+                        case_date = date_match.group(0)
+                        disposition = disposition.rsplit(" on ", 1)[0].strip(
+                            " '"
+                        )
+                        date_filed_is_approximate = False
+
+                case = {
+                    "url": anchor.get("href"),
+                    "disposition": disposition,
+                    "date": case_date,
+                    "date_filed_is_approximate": date_filed_is_approximate,
+                    **fixed_values,
+                }
+
+                self.cases.append(case)
+
+    def update_hidden_inputs(self) -> None:
+        """Parse form values characteristic of aspx sites,
+        and put them on self.parameters for POST use
+        """
+        for input in self.html.xpath('//input[@type="hidden"]'):
+            self.parameters[input.get("name")] = input.get("value", "")
+
+    def update_date_filters(self) -> None:
+        """Set year and month values from `self.target_date`
+        into self.parameters for POST use
+        """
+        logger.info(
+            "Scraping for year: %s - month: %s",
+            self.target_date.year,
+            self.target_date.month,
+        )
+        self.parameters = {
+            "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnMonth": str(
+                self.target_date.month
+            ),
+            "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnYear": str(
+                self.target_date.year
+            ),
+            "ctl00$cntBody$ctlOpinionSearch_Toggle$btnSearch": "Search",
+        }
+
+    def _download_backwards(self, target_date: date) -> None:
+        self.target_date = target_date
+        self.html = self._download()
+        self._process_html()
+
+    def make_backscrape_iterable(self, kwargs):
+        super().make_backscrape_iterable(kwargs)
+        self.back_scrape_iterable = unique_year_month(
+            self.back_scrape_iterable
+        )
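The GET-then-POST sequence in _process_html exists because ASP.NET WebForms pages reject form posts that omit server-generated hidden fields such as __VIEWSTATE and __EVENTVALIDATION, which update_hidden_inputs harvests. A rough sketch of the same round trip using plain requests and lxml (field names taken from the diff; illustrative, not juriscraper's actual downloader):

import requests
from lxml import html

URL = "https://www.fifthcircuit.org/searchopinions.aspx"
session = requests.Session()

# Plain GET to harvest the hidden inputs the server expects back
page = html.fromstring(session.get(URL).text)
params = {
    inp.get("name"): inp.get("value", "")
    for inp in page.xpath('//input[@type="hidden"]')
}

# Add the date filters and the search button, then POST
params.update({
    "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnMonth": "6",
    "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnYear": "2023",
    "ctl00$cntBody$ctlOpinionSearch_Toggle$btnSearch": "Search",
})
results = html.fromstring(session.post(URL, data=params).text)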
14 changes: 4 additions & 10 deletions juriscraper/opinions/united_states/state/sc.py
@@ -23,6 +23,7 @@
 from typing import Dict, List, Tuple

 from juriscraper.AbstractSite import logger
+from juriscraper.lib.date_utils import unique_year_month
 from juriscraper.OpinionSiteLinear import OpinionSiteLinear


@@ -80,16 +81,9 @@ def make_backscrape_iterable(
         and replace the self.back_scrape_iterable
         """
         super().make_backscrape_iterable(kwargs)
-        backscrape_iterable = []
-        seen_year_months = set()
-        for date_obj, _ in self.back_scrape_iterable:
-            ym = date_obj.strftime("%Y%m")
-            if ym in seen_year_months:
-                continue
-            seen_year_months.add(ym)
-            backscrape_iterable.append(date_obj)
-
-        self.back_scrape_iterable = backscrape_iterable
+        self.back_scrape_iterable = unique_year_month(
+            self.back_scrape_iterable
+        )

     def _download_backwards(self, date_obj: date) -> None:
         """Downloads an older page, and parses it
(7 more changed files not shown.)
