Commit

Merge branch 'main' into fix_coloctapp_cleanup_content
flooie authored Oct 18, 2024
2 parents 4f767c5 + 0900555 commit 8cd0f56
Showing 15 changed files with 9,445 additions and 203 deletions.
33 changes: 20 additions & 13 deletions juriscraper/AbstractSite.py
@@ -1,6 +1,6 @@
 import hashlib
 import json
-from datetime import date, datetime
+from datetime import date, datetime, timedelta
 from typing import Dict, List, Tuple

 import certifi
@@ -257,6 +257,7 @@ def _check_sanity(self):
             prior_case_name = name
             i += 1

+        future_date_count = 0
         for index, case_date in enumerate(self.case_dates):
             if not isinstance(case_date, date):
                 raise InsanityException(
@@ -266,24 +267,30 @@ def _check_sanity(self):
                 )
             # Sanitize case date, fix typo of current year if present
             fixed_date = fix_future_year_typo(case_date)
+            case_name = self.case_names[index]
             if fixed_date != case_date:
                 logger.info(
                     "Date year typo detected. Converting %s to %s "
-                    "for case '%s' in %s"
-                    % (
-                        case_date,
-                        fixed_date,
-                        self.case_names[index],
-                        self.court_id,
-                    )
+                    "for case '%s' in %s",
+                    case_date,
+                    fixed_date,
+                    case_name,
+                    self.court_id,
                 )
                 case_date = fixed_date
+                self.case_dates[index] = fixed_date
-            if case_date.year > 2025:
-                raise InsanityException(
-                    "%s: member of case_dates list is from way in the future, "
-                    "with value %s" % (self.court_id, case_date.year)
-                )

+            # dates should not be in the future. Tolerate a week
+            if case_date > (date.today() + timedelta(days=7)):
+                future_date_count += 1
+                error = f"{self.court_id}: {case_date} date is in the future. Case '{case_name}'"
+                logger.error(error)
+
+            # Interrupt data ingestion if more than 1 record has a bad date
+            if future_date_count > 1:
+                raise InsanityException(
+                    f"More than 1 case has a date in the future. Last case: {error}"
+                )

         # Is cookies a dict?
         if type(self.cookies) != dict:
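Note on the hunk above: the hardcoded year-2025 check is replaced by a rolling rule that tolerates dates up to a week ahead and only aborts ingestion once a second far-future date appears. A minimal standalone sketch of that logic (a mirror of the diff, not the juriscraper API):

from datetime import date, timedelta

def check_future_dates(case_dates, tolerance_days=7):
    # Mirror of the new sanity check: tolerate slightly-future dates,
    # abort once a second far-future date shows up.
    cutoff = date.today() + timedelta(days=tolerance_days)
    future_date_count = 0
    for case_date in case_dates:
        if case_date > cutoff:
            future_date_count += 1
            if future_date_count > 1:
                raise ValueError(
                    f"More than 1 case has a date in the future: {case_date}"
                )

check_future_dates([date.today() + timedelta(days=30)])        # one future date: tolerated
# check_future_dates([date.today() + timedelta(days=30)] * 2)  # a second one would raise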
4 changes: 4 additions & 0 deletions juriscraper/OpinionSite.py
@@ -38,6 +38,7 @@ def __init__(self, *args, **kwargs):
             "per_curiam",
             "types",
             "other_dates",
+            "attorneys",
         ]
         self._req_attrs = [
             "case_dates",
@@ -134,6 +135,9 @@ def _get_per_curiam(self):
     def _get_other_dates(self):
         return None

+    def _get_attorneys(self):
+        return None
+
     def extract_from_text(self, scraped_text):
         """Pass scraped text into function and return data as a dictionary
5 changes: 5 additions & 0 deletions juriscraper/OpinionSiteLinear.py
@@ -40,6 +40,7 @@ class OpinionSiteLinear(OpinionSite):
         "type",
         "joined_by",
         "other_date",
+        "attorney",
     }

     def __init__(self, *args, **kwargs):
@@ -153,6 +154,10 @@ def _get_other_dates(self):
         """Goes into OpinionCluster.other_dates, type: string"""
         return self._get_optional_field_by_id("other_date")

+    def _get_attorneys(self):
+        """Goes into OpinionCluster.attorneys, type: string"""
+        return self._get_optional_field_by_id("attorney")
+
     def _check_sanity(self):
         super()._check_sanity()
         # Check that all returned keys have the proper name to be used
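Taken with the OpinionSite change above, this wires a new optional field through the linear API: a scraper puts a singular "attorney" key in each case dict, and the site exposes the values through the plural getter. A rough sketch of that convention, with assumed data and an assumed empty-string default (the actual default of _get_optional_field_by_id may differ):

cases = [
    {"name": "Smith v. Jones", "attorney": "Jane Roe"},
    {"name": "In re Doe"},  # the field is optional
]

def get_attorneys(cases):
    # Mirrors the spirit of _get_optional_field_by_id("attorney"):
    # one value per case; the "" fallback here is an assumption.
    return [case.get("attorney", "") for case in cases]

print(get_attorneys(cases))  # ['Jane Roe', '']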
28 changes: 28 additions & 0 deletions juriscraper/lib/date_utils.py
@@ -4,6 +4,7 @@
 from datetime import date
 from itertools import zip_longest
 from math import ceil
+from typing import Union

 from dateutil.parser import parser, parserinfo
 from dateutil.rrule import DAILY, rrule
@@ -150,3 +151,30 @@ def make_date_range_tuples(start, end, gap):
         for d in rrule(DAILY, interval=gap, dtstart=end_start, until=end)
     ]
     return list(zip_longest(start_dates, end_dates, fillvalue=end))
+
+
+def unique_year_month(
+    date_list: list[Union[date, datetime.datetime, tuple[date]]],
+) -> list[Union[date, datetime.datetime]]:
+    """Takes a list of dates or date tuples, and reduces it
+    to date objects with unique year-month pairs
+
+    :param date_list: a list containing dates or tuples of dates
+        default make_backscrape_iterable returns date tuples
+    :return: a list with date objects of unique year-month pairs
+    """
+    unique_list = []
+    seen_year_months = set()
+
+    for obj in date_list:
+        if isinstance(obj, date) or isinstance(obj, datetime.datetime):
+            obj = [obj]
+
+        for date_obj in obj:
+            ym = date_obj.strftime("%Y%m")
+            if ym in seen_year_months:
+                continue
+            seen_year_months.add(ym)
+            unique_list.append(date_obj)
+
+    return unique_list
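A quick usage sketch of the new helper, with made-up dates; the tuple input matches what make_backscrape_iterable produces by default:

from datetime import date

from juriscraper.lib.date_utils import unique_year_month

dates = [
    date(2024, 5, 1),
    date(2024, 5, 20),  # same year-month as the first entry: dropped
    (date(2024, 6, 3), date(2024, 6, 10)),  # tuples are flattened; June kept once
]
print(unique_year_month(dates))
# [datetime.date(2024, 5, 1), datetime.date(2024, 6, 3)]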
1 change: 1 addition & 0 deletions juriscraper/opinions/united_states/state/__init__.py
@@ -61,6 +61,7 @@
     "kyctapp",
     "la",
     "lactapp_1",
+    "lactapp_5",
     "mass",
     "massappct",
     "massappct_u",
27 changes: 15 additions & 12 deletions juriscraper/opinions/united_states/state/la.py
@@ -7,6 +7,7 @@
 # [email protected]

 from datetime import date
+from urllib.parse import urljoin

 from juriscraper.lib.html_utils import get_html_parsed_text
 from juriscraper.lib.string_utils import titlecase
@@ -36,29 +37,31 @@ def _download(self, request_dict={}):
         return [self._get_subpage_html_by_url(url) for url in urls]

     def _process_html(self):
-        path = (
-            "//a["
-            "contains(., 'v.') or "
-            "contains(., 'IN RE') or "
-            "contains(., 'IN THE') or "
-            "contains(., 'vs.') or "
-            "contains(., 'VS.')"
-            "]"
-        )
+        xpath = "//a[contains(@href, 'opinions') and contains(@href, 'pdf')]"
         for html in self.html:
-            for anchor in html.xpath(path):
+            for anchor in html.xpath(xpath):
                 date_string = self._get_date_for_opinions(html)
                 text = anchor.text_content()
                 parts = text.split(None, 1)
                 summary_lines = anchor.getparent().xpath("./text()")
+
+                judge = self._get_judge_above_anchor(anchor)
+                per_curiam = False
+                if "per curiam" in judge.lower():
+                    per_curiam = True
+                    judge = ""

                 self.cases.append(
                     {
                         "date": date_string,
                         "docket": parts[0],
-                        "judge": self._get_judge_above_anchor(anchor),
+                        "judge": judge,
+                        "per_curiam": per_curiam,
                         "name": titlecase(parts[1]),
                         "summary": " ".join(summary_lines).replace(text, ""),
-                        "url": f"http://www.lasc.org{anchor.get('href')}",
+                        "url": urljoin(
+                            "http://www.lasc.org", anchor.get("href")
+                        ),
                     }
                 )

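The switch from an f-string to urljoin makes URL building robust to whichever form the scraped href takes; for example (standard-library behavior, illustrative paths):

from urllib.parse import urljoin

base = "http://www.lasc.org"
print(urljoin(base, "/opinions/2024/doc.pdf"))
# http://www.lasc.org/opinions/2024/doc.pdf
print(urljoin(base, "http://www.lasc.org/opinions/2024/doc.pdf"))
# http://www.lasc.org/opinions/2024/doc.pdf -- absolute hrefs pass through,
# where the old f-string would have produced a doubled domain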
133 changes: 133 additions & 0 deletions juriscraper/opinions/united_states/state/lactapp_5.py
@@ -0,0 +1,133 @@
+import re
+from datetime import date, datetime
+
+from juriscraper.AbstractSite import logger
+from juriscraper.lib.date_utils import unique_year_month
+from juriscraper.lib.string_utils import titlecase
+from juriscraper.OpinionSiteLinear import OpinionSiteLinear
+
+
+class Site(OpinionSiteLinear):
+    id_to_case_mapper = {
+        "lblCaseTitle": "name",
+        "lblCaseNum": "docket",
+        "lblRulingJudge": "judge",
+        "lblDistrictCourtNo": "lower_court_number",
+        "lblLowerCourt": "lower_court",
+        "lblAttorney": "attorney",
+    }
+    first_opinion_date = datetime(1992, 1, 1)
+    days_interval = 28  # ensure a tick for each month
+    date_regex = re.compile(r"\d{2}/\d{2}/\d{4}")
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        self.court_id = self.__module__
+        self.url = "https://www.fifthcircuit.org/searchopinions.aspx"
+        self.search_is_configured = False
+        self.parameters = {
+            "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlSearchOptions": "2",
+        }
+        self.target_date = datetime.today()
+        self.make_backscrape_iterable(kwargs)
+        self.status = "Unknown"
+
+    def _process_html(self):
+        # We need to do a plain GET to get hidden inputs
+        # Then we can do our filtered request
+        if not self.test_mode_enabled():
+            self.method = "POST"
+
+            # We need to set the proper search filter the first time
+            if not self.search_is_configured:
+                self.update_hidden_inputs()
+                self.parameters["__EVENTTARGET"] = (
+                    "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlSearchOptions"
+                )
+                self.html = self._download()
+                self.search_is_configured = True
+
+            # Set the proper filters to get the actual data we want
+            self.update_date_filters()
+            self.update_hidden_inputs()
+            self.html = self._download()
+
+        count_xpath = "//*[@id='cntBody_ctlOpinionSearch_Toggle_lblRecordCnt']"
+        logger.info(self.html.xpath(count_xpath)[0].text_content().strip())
+
+        for row in self.html.xpath("//tr[.//a[contains(@id, 'HyperLink_')]]"):
+            fixed_values = {}
+            for id_part, key in self.id_to_case_mapper.items():
+                element = row.xpath(f".//*[contains(@id, '{id_part}')]")
+                if element:
+                    fixed_values[key] = element[0].text_content().strip()
+
+            fixed_values["name"] = titlecase(fixed_values["name"])
+            if fixed_values.get("judge"):
+                fixed_values["judge"] = re.sub(
+                    r"Hon\.[\s\n]+", "", fixed_values["judge"]
+                )
+
+            # Some cases have more than 1 opinion document (check example 2)
+            # Some cases have no links, they will be ignored by this loop
+            for anchor in row.xpath(".//a"):
+                # The opinion date is sometimes in the disposition text
+                disposition = ""
+                case_date = f"{self.target_date.year}/07/01"
+                date_filed_is_approximate = True
+                if disp_container := anchor.xpath("following-sibling::text()"):
+                    disposition = disp_container[0].strip()
+
+                    if date_match := self.date_regex.search(disposition):
+                        case_date = date_match.group(0)
+                        disposition = disposition.rsplit(" on ", 1)[0].strip(
+                            " '"
+                        )
+                        date_filed_is_approximate = False
+
+                case = {
+                    "url": anchor.get("href"),
+                    "disposition": disposition,
+                    "date": case_date,
+                    "date_filed_is_approximate": date_filed_is_approximate,
+                    **fixed_values,
+                }
+
+                self.cases.append(case)
+
+    def update_hidden_inputs(self) -> None:
+        """Parse form values characteristic of aspx sites,
+        and put them on self.parameters for POST use
+        """
+        for input in self.html.xpath('//input[@type="hidden"]'):
+            self.parameters[input.get("name")] = input.get("value", "")
+
+    def update_date_filters(self) -> None:
+        """Set year and month values from `self.target_date`
+        into self.parameters for POST use
+        """
+        logger.info(
+            "Scraping for year: %s - month: %s",
+            self.target_date.year,
+            self.target_date.month,
+        )
+        self.parameters = {
+            "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnMonth": str(
+                self.target_date.month
+            ),
+            "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnYear": str(
+                self.target_date.year
+            ),
+            "ctl00$cntBody$ctlOpinionSearch_Toggle$btnSearch": "Search",
+        }
+
+    def _download_backwards(self, target_date: date) -> None:
+        self.target_date = target_date
+        self.html = self._download()
+        self._process_html()
+
+    def make_backscrape_iterable(self, kwargs):
+        super().make_backscrape_iterable(kwargs)
+        self.back_scrape_iterable = unique_year_month(
+            self.back_scrape_iterable
+        )
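The GET-then-POST sequence in _process_html exists because ASP.NET WebForms pages reject form posts that omit server-generated hidden fields such as __VIEWSTATE and __EVENTVALIDATION, which update_hidden_inputs harvests. A rough sketch of the same round trip using plain requests and lxml (field names taken from the diff; illustrative, not juriscraper's actual downloader):

import requests
from lxml import html

URL = "https://www.fifthcircuit.org/searchopinions.aspx"
session = requests.Session()

# Plain GET to harvest the hidden inputs the server expects back
page = html.fromstring(session.get(URL).text)
params = {
    inp.get("name"): inp.get("value", "")
    for inp in page.xpath('//input[@type="hidden"]')
}

# Add the date filters and the search button, then POST
params.update({
    "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnMonth": "6",
    "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnYear": "2023",
    "ctl00$cntBody$ctlOpinionSearch_Toggle$btnSearch": "Search",
})
results = html.fromstring(session.post(URL, data=params).text)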
14 changes: 4 additions & 10 deletions juriscraper/opinions/united_states/state/sc.py
@@ -23,6 +23,7 @@
 from typing import Dict, List, Tuple

 from juriscraper.AbstractSite import logger
+from juriscraper.lib.date_utils import unique_year_month
 from juriscraper.OpinionSiteLinear import OpinionSiteLinear


@@ -80,16 +81,9 @@ def make_backscrape_iterable(
         and replace the self.back_scrape_iterable
         """
         super().make_backscrape_iterable(kwargs)
-        backscrape_iterable = []
-        seen_year_months = set()
-        for date_obj, _ in self.back_scrape_iterable:
-            ym = date_obj.strftime("%Y%m")
-            if ym in seen_year_months:
-                continue
-            seen_year_months.add(ym)
-            backscrape_iterable.append(date_obj)
-
-        self.back_scrape_iterable = backscrape_iterable
+        self.back_scrape_iterable = unique_year_month(
+            self.back_scrape_iterable
+        )

     def _download_backwards(self, date_obj: date) -> None:
         """Downloads an older page, and parses it
(7 more changed files not shown.)
