Skip to content

Commit

Permalink
Merge branch 'main' into fix_la
Browse files Browse the repository at this point in the history
  • Loading branch information
flooie authored Oct 18, 2024
2 parents d17d49b + 5da736f commit ed37c5f
Show file tree
Hide file tree
Showing 30 changed files with 20,143 additions and 7,133 deletions.
16 changes: 14 additions & 2 deletions CHANGES.md
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,20 @@ Releases are also tagged in git, if that's helpful.

## Current

**2.6.30 - 2024-10-10**

Fixes:
- fix `CADC` oral arguments

## Past

**2.6.29 - 2024-10-10**

Fixes:
- fix `or` and `orctapp` scraper, scraping new endpoint
- fix cache control headers in `AbstractSite`
- fix `sc` expected content types

**2.6.28 - 2024-09-27**

Features:
Expand All @@ -27,8 +41,6 @@ Fixes:
- `cadc_u` change docket number getter
- `sc` implement new site

## Past

**2.6.27 - 2024-09-16**

Fixes:
Expand Down
4 changes: 4 additions & 0 deletions juriscraper/OpinionSite.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,6 +38,7 @@ def __init__(self, *args, **kwargs):
"per_curiam",
"types",
"other_dates",
"attorneys",
]
self._req_attrs = [
"case_dates",
Expand Down Expand Up @@ -134,6 +135,9 @@ def _get_per_curiam(self):
def _get_other_dates(self):
return None

def _get_attorneys(self):
return None

def extract_from_text(self, scraped_text):
"""Pass scraped text into function and return data as a dictionary
Expand Down
5 changes: 5 additions & 0 deletions juriscraper/OpinionSiteLinear.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ class OpinionSiteLinear(OpinionSite):
"type",
"joined_by",
"other_date",
"attorney",
}

def __init__(self, *args, **kwargs):
Expand Down Expand Up @@ -153,6 +154,10 @@ def _get_other_dates(self):
"""Goes into OpinionCluster.other_dates, type: string"""
return self._get_optional_field_by_id("other_date")

def _get_attorneys(self):
"""Goes into OpinionCluster.attorneys, type: string"""
return self._get_optional_field_by_id("attorney")

def _check_sanity(self):
super()._check_sanity()
# Check that all returned keys have the proper name to be used
Expand Down
4 changes: 4 additions & 0 deletions juriscraper/OralArgumentSite.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ def __init__(self, *args, **kwargs):
"docket_numbers",
"judges",
"case_name_shorts",
"attorneys",
]
self._req_attrs = [
"case_dates",
Expand Down Expand Up @@ -47,3 +48,6 @@ def _get_docket_numbers(self):

def _get_judges(self):
return None

def _get_attorneys(self):
return None
3 changes: 3 additions & 0 deletions juriscraper/OralArgumentSiteLinear.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,3 +39,6 @@ def _get_optional_field_by_id(self, id):

def _get_judges(self):
return self._get_optional_field_by_id("judge")

def _get_attorneys(self):
return self._get_optional_field_by_id("attorney")
28 changes: 28 additions & 0 deletions juriscraper/lib/date_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
from datetime import date
from itertools import zip_longest
from math import ceil
from typing import Union

from dateutil.parser import parser, parserinfo
from dateutil.rrule import DAILY, rrule
Expand Down Expand Up @@ -150,3 +151,30 @@ def make_date_range_tuples(start, end, gap):
for d in rrule(DAILY, interval=gap, dtstart=end_start, until=end)
]
return list(zip_longest(start_dates, end_dates, fillvalue=end))


def unique_year_month(
    date_list: list[Union[date, datetime.datetime, tuple[date, ...]]],
) -> list[Union[date, datetime.datetime]]:
    """Reduce a list of dates or date tuples to one date per year-month.

    Items are visited in input order (tuples are flattened in place) and
    the first date seen for each (year, month) pair is kept; later dates
    falling in an already seen year-month are dropped.

    :param date_list: a list containing dates or tuples of dates;
        the default make_backscrape_iterable returns date tuples
    :return: a list with date objects of unique year-month pairs
    """
    unique_list = []
    seen_year_months = set()

    for obj in date_list:
        # `datetime.datetime` is a subclass of `date`, so a single
        # isinstance check covers both; anything else is treated as an
        # iterable of dates
        candidates = (obj,) if isinstance(obj, date) else obj

        for date_obj in candidates:
            year_month = (date_obj.year, date_obj.month)
            if year_month in seen_year_months:
                continue
            seen_year_months.add(year_month)
            unique_list.append(date_obj)

    return unique_list
2 changes: 2 additions & 0 deletions juriscraper/opinions/united_states/state/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@
"kyctapp",
"la",
"lactapp_1",
"lactapp_5",
"mass",
"massappct",
"massappct_u",
Expand Down Expand Up @@ -144,6 +145,7 @@
"ohioctcl_beginningofyear",
"or",
"orctapp",
"ortc",
"okla",
"oklaag",
"oklacivapp",
Expand Down
133 changes: 133 additions & 0 deletions juriscraper/opinions/united_states/state/lactapp_5.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,133 @@
import re
from datetime import date, datetime

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import unique_year_month
from juriscraper.lib.string_utils import titlecase
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
    """Opinion scraper for the search form at
    https://www.fifthcircuit.org/searchopinions.aspx

    The page is an aspx form: hidden inputs are copied from each response
    into the next POST, and results are filtered by year and month taken
    from `self.target_date`.
    """

    # Maps a fragment of an element id found in a result row
    # to the key used in the case dictionary
    id_to_case_mapper = {
        "lblCaseTitle": "name",
        "lblCaseNum": "docket",
        "lblRulingJudge": "judge",
        "lblDistrictCourtNo": "lower_court_number",
        "lblLowerCourt": "lower_court",
        "lblAttorney": "attorney",
    }
    first_opinion_date = datetime(1992, 1, 1)
    days_interval = 28  # ensure a tick for each month
    date_regex = re.compile(r"\d{2}/\d{2}/\d{4}")

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.court_id = self.__module__
        self.url = "https://www.fifthcircuit.org/searchopinions.aspx"
        # Flipped to True once the search option POST has been sent
        self.search_is_configured = False
        self.parameters = {
            "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlSearchOptions": "2",
        }
        self.target_date = datetime.today()
        self.make_backscrape_iterable(kwargs)
        self.status = "Unknown"

    def _process_html(self):
        """Run the filtered search (skipped in test mode) and parse
        every result row of `self.html` into `self.cases`
        """
        # We need to do a plain GET to get hidden inputs
        # Then we can do our filtered request
        if not self.test_mode_enabled():
            self.method = "POST"

            # We need to set the proper search filter the first time
            if not self.search_is_configured:
                self.update_hidden_inputs()
                self.parameters["__EVENTTARGET"] = (
                    "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlSearchOptions"
                )
                self.html = self._download()
                self.search_is_configured = True

            # Set the proper filters to get the actual data we want
            self.update_date_filters()
            self.update_hidden_inputs()
            self.html = self._download()

        # The record count is only logged for information; a missing
        # label should not abort the scrape
        count_xpath = "//*[@id='cntBody_ctlOpinionSearch_Toggle_lblRecordCnt']"
        if record_count := self.html.xpath(count_xpath):
            logger.info(record_count[0].text_content().strip())

        for row in self.html.xpath("//tr[.//a[contains(@id, 'HyperLink_')]]"):
            # Values shared by every opinion document of this row
            fixed_values = {}
            for id_part, key in self.id_to_case_mapper.items():
                element = row.xpath(f".//*[contains(@id, '{id_part}')]")
                if element:
                    fixed_values[key] = element[0].text_content().strip()

            fixed_values["name"] = titlecase(fixed_values["name"])
            if fixed_values.get("judge"):
                fixed_values["judge"] = re.sub(
                    r"Hon\.[\s\n]+", "", fixed_values["judge"]
                )

            # Some cases have more than 1 opinion document (check example 2)
            # Some cases have no links, they will be ignored by this loop
            for anchor in row.xpath(".//a"):
                # The opinion date is sometimes in the disposition text;
                # when it is not, fall back to an approximate mid-year
                # date of the year being scraped
                disposition = ""
                case_date = f"{self.target_date.year}/07/01"
                date_filed_is_approximate = True
                if disp_container := anchor.xpath("following-sibling::text()"):
                    disposition = disp_container[0].strip()

                if date_match := self.date_regex.search(disposition):
                    case_date = date_match.group(0)
                    # Keep only the text before the last " on " and
                    # trim surrounding spaces and single quotes
                    disposition = disposition.rsplit(" on ", 1)[0].strip(
                        " '"
                    )
                    date_filed_is_approximate = False

                case = {
                    "url": anchor.get("href"),
                    "disposition": disposition,
                    "date": case_date,
                    "date_filed_is_approximate": date_filed_is_approximate,
                    **fixed_values,
                }

                self.cases.append(case)

    def update_hidden_inputs(self) -> None:
        """Parse form values characteristic of aspx sites,
        and put them on self.parameters for POST use
        """
        for hidden_input in self.html.xpath('//input[@type="hidden"]'):
            name = hidden_input.get("name")
            # An input without a name cannot be addressed in a form
            # POST; skipping it avoids storing a `None` key
            if not name:
                continue
            self.parameters[name] = hidden_input.get("value", "")

    def update_date_filters(self) -> None:
        """Set year and month values from `self.target_date`
        into self.parameters for POST use
        """
        logger.info(
            "Scraping for year: %s - month: %s",
            self.target_date.year,
            self.target_date.month,
        )
        # This replaces the whole dict, so previously stored values
        # (search option, __EVENTTARGET, old hidden inputs) are dropped;
        # _process_html re-adds the hidden inputs right after this call
        self.parameters = {
            "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnMonth": str(
                self.target_date.month
            ),
            "ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnYear": str(
                self.target_date.year
            ),
            "ctl00$cntBody$ctlOpinionSearch_Toggle$btnSearch": "Search",
        }

    def _download_backwards(self, target_date: date) -> None:
        """Scrape the year-month of `target_date`

        :param target_date: date whose year and month are used as filters
        """
        self.target_date = target_date
        self.html = self._download()
        self._process_html()

    def make_backscrape_iterable(self, kwargs):
        """Build the parent's backscrape iterable, then keep a single
        date per year-month pair, since the search is filtered by month
        """
        super().make_backscrape_iterable(kwargs)
        self.back_scrape_iterable = unique_year_month(
            self.back_scrape_iterable
        )
Loading

0 comments on commit ed37c5f

Please sign in to comment.