Merge pull request #1212 from freelawproject/feat_lactapp_5
feat(lactapp_5): new scraper for Louisiana Court of Appeal, Fifth Circuit
flooie authored Oct 18, 2024
2 parents 1ee3bc7 + 396857c commit 5da736f
Showing 11 changed files with 9,174 additions and 32 deletions.
4 changes: 4 additions & 0 deletions juriscraper/OpinionSite.py
@@ -38,6 +38,7 @@ def __init__(self, *args, **kwargs):
"per_curiam",
"types",
"other_dates",
"attorneys",
]
self._req_attrs = [
"case_dates",
@@ -134,6 +135,9 @@ def _get_per_curiam(self):
def _get_other_dates(self):
return None

def _get_attorneys(self):
return None

def extract_from_text(self, scraped_text):
"""Pass scraped text into function and return data as a dictionary
5 changes: 5 additions & 0 deletions juriscraper/OpinionSiteLinear.py
@@ -40,6 +40,7 @@ class OpinionSiteLinear(OpinionSite):
"type",
"joined_by",
"other_date",
"attorney",
}

def __init__(self, *args, **kwargs):
@@ -153,6 +154,10 @@ def _get_other_dates(self):
"""Goes into OpinionCluster.other_dates, type: string"""
return self._get_optional_field_by_id("other_date")

def _get_attorneys(self):
"""Goes into OpinionCluster.attorneys, type: string"""
return self._get_optional_field_by_id("attorney")

def _check_sanity(self):
super()._check_sanity()
# Check that all returned keys have the proper name to be used
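The new optional field rides the same machinery as joined_by and other_date: a scraper that can extract counsel simply puts an "attorney" key on its self.cases dicts, and _get_attorneys() surfaces it through _get_optional_field_by_id. A minimal hypothetical sketch (all case values invented):

    # Hypothetical OpinionSiteLinear scraper populating the new key.
    # Cases without "attorney" are unaffected, since the field is optional.
    def _process_html(self):
        self.cases.append(
            {
                "name": "Doe v. Roe",  # the usual required fields...
                "date": "10/18/2024",
                "url": "https://example.com/opinions/24-123.pdf",
                "docket": "24-CA-123",
                "status": "Published",
                "attorney": "Jane Smith",  # ...plus the new optional field
            }
        )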
28 changes: 28 additions & 0 deletions juriscraper/lib/date_utils.py
@@ -4,6 +4,7 @@
from datetime import date
from itertools import zip_longest
from math import ceil
from typing import Union

from dateutil.parser import parser, parserinfo
from dateutil.rrule import DAILY, rrule
@@ -150,3 +151,30 @@ def make_date_range_tuples(start, end, gap):
for d in rrule(DAILY, interval=gap, dtstart=end_start, until=end)
]
return list(zip_longest(start_dates, end_dates, fillvalue=end))


def unique_year_month(
date_list: list[Union[date, datetime.datetime, tuple[date]]],
) -> list[Union[date, datetime.datetime]]:
"""Takes a list of dates or date tuples, and reduces it
to date objects with unique year-months pairs
:param date_list: a list containing dates or tuples of dates
default make_backscrape_iterable returns date tuples
:return: a list with date objects of unique year-month pairs
"""
unique_list = []
seen_year_months = set()

for obj in date_list:
if isinstance(obj, date) or isinstance(obj, datetime.datetime):
obj = [obj]

for date_obj in obj:
ym = date_obj.strftime("%Y%m")
if ym in seen_year_months:
continue
seen_year_months.add(ym)
unique_list.append(date_obj)

return unique_list
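To see what unique_year_month does with the tuple lists that make_backscrape_iterable produces, here is a standalone sketch (dates invented for illustration):

    from datetime import date

    from juriscraper.lib.date_utils import unique_year_month

    # Hypothetical (start, end) backscrape ranges
    ranges = [
        (date(2024, 1, 1), date(2024, 1, 29)),
        (date(2024, 1, 29), date(2024, 2, 26)),
        (date(2024, 2, 26), date(2024, 3, 25)),
    ]
    # Tuples are flattened; only the first date seen per year-month survives
    print(unique_year_month(ranges))
    # [datetime.date(2024, 1, 1), datetime.date(2024, 2, 26), datetime.date(2024, 3, 25)]

This is the same deduplication that sc.py and cadc.py carried inline before this commit (see their diffs below).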
1 change: 1 addition & 0 deletions juriscraper/opinions/united_states/state/__init__.py
Original file line number Diff line number Diff line change
@@ -61,6 +61,7 @@
"kyctapp",
"la",
"lactapp_1",
"lactapp_5",
"mass",
"massappct",
"massappct_u",
133 changes: 133 additions & 0 deletions juriscraper/opinions/united_states/state/lactapp_5.py
@@ -0,0 +1,133 @@
import re
from datetime import date, datetime

from juriscraper.AbstractSite import logger
from juriscraper.lib.date_utils import unique_year_month
from juriscraper.lib.string_utils import titlecase
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
id_to_case_mapper = {
"lblCaseTitle": "name",
"lblCaseNum": "docket",
"lblRulingJudge": "judge",
"lblDistrictCourtNo": "lower_court_number",
"lblLowerCourt": "lower_court",
"lblAttorney": "attorney",
}
first_opinion_date = datetime(1992, 1, 1)
days_interval = 28 # ensure a tick for each month
date_regex = re.compile(r"\d{2}/\d{2}/\d{4}")

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.court_id = self.__module__
self.url = "https://www.fifthcircuit.org/searchopinions.aspx"
self.search_is_configured = False
self.parameters = {
"ctl00$cntBody$ctlOpinionSearch_Toggle$ddlSearchOptions": "2",
}
self.target_date = datetime.today()
self.make_backscrape_iterable(kwargs)
self.status = "Unknown"

def _process_html(self):
# We need to do a plain GET to get hidden inputs
# Then we can do our filtered request
if not self.test_mode_enabled():
self.method = "POST"

# We need to set the proper search filter the first time
if not self.search_is_configured:
self.update_hidden_inputs()
self.parameters["__EVENTTARGET"] = (
"ctl00$cntBody$ctlOpinionSearch_Toggle$ddlSearchOptions"
)
self.html = self._download()
self.search_is_configured = True

# Set the proper filters to get the actual data we want
self.update_date_filters()
self.update_hidden_inputs()
self.html = self._download()

count_xpath = "//*[@id='cntBody_ctlOpinionSearch_Toggle_lblRecordCnt']"
logger.info(self.html.xpath(count_xpath)[0].text_content().strip())

for row in self.html.xpath("//tr[.//a[contains(@id, 'HyperLink_')]]"):
fixed_values = {}
for id_part, key in self.id_to_case_mapper.items():
element = row.xpath(f".//*[contains(@id, '{id_part}')]")
if element:
fixed_values[key] = element[0].text_content().strip()

fixed_values["name"] = titlecase(fixed_values["name"])
if fixed_values.get("judge"):
fixed_values["judge"] = re.sub(
r"Hon\.[\s\n]+", "", fixed_values["judge"]
)

# Some cases have more than 1 opinion document (check example 2)
# Some cases have no links, they will be ignored by this loop
for anchor in row.xpath(".//a"):
# The opinion date is sometimes in the disposition text
disposition = ""
case_date = f"{self.target_date.year}/07/01"
date_filed_is_approximate = True
if disp_container := anchor.xpath("following-sibling::text()"):
disposition = disp_container[0].strip()

if date_match := self.date_regex.search(disposition):
case_date = date_match.group(0)
disposition = disposition.rsplit(" on ", 1)[0].strip(
" '"
)
date_filed_is_approximate = False

case = {
"url": anchor.get("href"),
"disposition": disposition,
"date": case_date,
"date_filed_is_approximate": date_filed_is_approximate,
**fixed_values,
}

self.cases.append(case)

def update_hidden_inputs(self) -> None:
"""Parse form values characteristic of aspx sites,
and put then on self.parameters for POST use
"""
for input in self.html.xpath('//input[@type="hidden"]'):
self.parameters[input.get("name")] = input.get("value", "")

def update_date_filters(self) -> None:
"""Set year and month values from `self.target_date`
into self.parameters for POST use
"""
logger.info(
"Scraping for year: %s - month: %s",
self.target_date.year,
self.target_date.month,
)
self.parameters = {
"ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnMonth": str(
self.target_date.month
),
"ctl00$cntBody$ctlOpinionSearch_Toggle$ddlOpnYear": str(
self.target_date.year
),
"ctl00$cntBody$ctlOpinionSearch_Toggle$btnSearch": "Search",
}

def _download_backwards(self, target_date: date) -> None:
self.target_date = target_date
self.html = self._download()
self._process_html()

def make_backscrape_iterable(self, kwargs):
super().make_backscrape_iterable(kwargs)
self.back_scrape_iterable = unique_year_month(
self.back_scrape_iterable
)
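The subtle part of _process_html above is the disposition/date handling. Isolated, with a hypothetical sibling-text string of the shape the scraper expects, it behaves like this:

    import re

    date_regex = re.compile(r"\d{2}/\d{2}/\d{4}")

    # Hypothetical text following an opinion link on the search page
    disposition = "'Affirmed on 01/31/2024"

    if date_match := date_regex.search(disposition):
        case_date = date_match.group(0)  # "01/31/2024"
        # Drop the trailing " on <date>" plus stray quotes and spaces
        disposition = disposition.rsplit(" on ", 1)[0].strip(" '")  # "Affirmed"
        date_filed_is_approximate = False

When no date can be extracted, the case keeps the approximate July 1 placeholder for the target year, flagged through date_filed_is_approximate.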
14 changes: 4 additions & 10 deletions juriscraper/opinions/united_states/state/sc.py
@@ -23,6 +23,7 @@
from typing import Dict, List, Tuple

from juriscraper.AbstractSite import logger
+from juriscraper.lib.date_utils import unique_year_month
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


@@ -80,16 +81,9 @@ def make_backscrape_iterable(
        and replace the self.back_scrape_iterable
        """
        super().make_backscrape_iterable(kwargs)
-        backscrape_iterable = []
-        seen_year_months = set()
-        for date_obj, _ in self.back_scrape_iterable:
-            ym = date_obj.strftime("%Y%m")
-            if ym in seen_year_months:
-                continue
-            seen_year_months.add(ym)
-            backscrape_iterable.append(date_obj)
-
-        self.back_scrape_iterable = backscrape_iterable
+        self.back_scrape_iterable = unique_year_month(
+            self.back_scrape_iterable
+        )

    def _download_backwards(self, date_obj: date) -> None:
        """Downloads an older page, and parses it
38 changes: 16 additions & 22 deletions juriscraper/oral_args/united_states/federal_appellate/cadc.py
@@ -11,6 +11,7 @@
from urllib.parse import urljoin

from juriscraper.AbstractSite import logger
+from juriscraper.lib.date_utils import unique_year_month
from juriscraper.OralArgumentSiteLinear import OralArgumentSiteLinear


@@ -56,16 +57,8 @@ def _process_html(self):
                }
            )

-    def _download_backwards(self, url: str) -> None:
-        logger.info("Backscraping URL '%s'", url)
-        self.url = url
-        self.html = self._download()
-        self._process_html()
-
-    def make_backscrape_iterable(self, kwargs: dict) -> None:
-        """Use base function to generate a range, then pick
-        unique year-month combinations to build the backscrape
-        URLS, and save them to the self.back_scrape_iterable
+    def _download_backwards(self, target_date: date) -> None:
+        """Download historical data

        Note that this URL will work:
        "https://media.cadc.uscourts.gov/recordings/bydate/2007/9"
@@ -74,16 +67,17 @@ def make_backscrape_iterable(self, kwargs: dict) -> None:
        That's why the '%-m' formatter is needed
        """
-        super().make_backscrape_iterable(kwargs)
-        seen_year_months = set()
-        urls = []
-
-        for tupl in self.back_scrape_iterable:
-            for item in tupl:
-                ym = item.strftime("%Y/%-m")
-                if ym in seen_year_months:
-                    continue
-                seen_year_months.add(ym)
-                urls.append(self.base_url.format(ym))
+        self.url = self.base_url.format(target_date.strftime("%Y/%-m"))
+        logger.info("Backscraping URL '%s'", self.url)
+        self.html = self._download()
+        self._process_html()

-        self.back_scrape_iterable = urls
+    def make_backscrape_iterable(self, kwargs: dict) -> None:
+        """Use base function to generate a range, then pick
+        unique year-month combinations to build the backscrape
+        URLS
+        """
+        super().make_backscrape_iterable(kwargs)
+        self.back_scrape_iterable = unique_year_month(
+            self.back_scrape_iterable
+        )
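The '%-m' claim in the docstring is easy to verify; note that the dash modifier is a glibc/BSD strftime extension (Linux and macOS), so it would fail on Windows:

    from datetime import date

    target_date = date(2007, 9, 1)
    print(target_date.strftime("%Y/%-m"))  # 2007/9  -> the URL form that works
    print(target_date.strftime("%Y/%m"))   # 2007/09 -> the form that does not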