Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix(uscfc): implement new site #1224

Open
wants to merge 2 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -11,9 +11,7 @@
"nmcca",
"tax",
"uscfc",
"uscfc_u",
"uscfc_vaccine",
"uscfc_vaccine_u",
"fisc",
"fiscr",
]
177 changes: 52 additions & 125 deletions juriscraper/opinions/united_states/federal_special/uscfc.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,137 +4,64 @@

Notes:
Scraper adapted for new website as of February 20, 2014.
2024-10-23, grossir: implemented new site
"""

import datetime
import re
import json

from lxml import html
from juriscraper.lib.string_utils import titlecase
from juriscraper.OpinionSiteLinear import OpinionSiteLinear

from juriscraper.lib.exceptions import InsanityException
from juriscraper.lib.string_utils import (
clean_if_py3,
convert_date_string,
titlecase,
)
from juriscraper.OpinionSite import OpinionSite


class Site(OpinionSite):
class Site(OpinionSiteLinear):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = "http://www.uscfc.uscourts.gov/aggregator/sources/8"
self.back_scrape_iterable = list(range(1, 4))
self.url = "https://ecf.cofc.uscourts.gov/cgi-bin/CFC_RecentOpinionsOfTheCourt.pl"
self.court_id = self.__module__
self.today = datetime.datetime.now()

def _download(self, request_dict=None):
    """Download the page, pinning ``self.today`` first when in test mode.

    :param request_dict: optional request options, forwarded unchanged to
        the parent class' ``_download``; a fresh empty dict is used when
        omitted
    :return: whatever the parent class' ``_download`` returns
    """
    # A ``{}`` default is a single dict object shared by every call; build
    # a new one per call so nothing downstream can leak state between calls.
    if request_dict is None:
        request_dict = {}
    if self.test_mode_enabled():
        # Use a static 'today' date for consistent test results
        self.today = convert_date_string("2018/10/17")
    return super()._download(request_dict)

def _get_case_dates(self):
    """Return one date per feed item, parsed from the item's date label.

    A label made of exactly two whitespace-separated words is parsed from
    its second word; any other label containing "ago" maps to
    ``self.today``.

    :raises InsanityException: if a label matches neither form
    """

    def parse_label(label):
        tokens = label.split()
        if len(tokens) == 2:
            return convert_date_string(tokens[1])
        if "ago" not in label:
            raise InsanityException(
                f"Unrecognized date element string: {label}"
            )
        # The record was added today "X hours and Y min ago"
        return self.today

    nodes = self.html.xpath('//span[@class="feed-item-date"]')
    return [parse_label(node.text_content().strip()) for node in nodes]

def _get_case_names(self):
    """Collect title-cased case names from the feed item headings.

    :return: list of names, one per non-blank text node in a heading
    """
    names = []
    for raw in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
        # Collapse every run of whitespace into a single space
        text = " ".join(clean_if_py3(raw).split())
        if not text.strip():
            # Whitespace-only text node: nothing to record
            continue
        if not isinstance(text, str):
            text = str(text, encoding="utf-8")
        # When a bullet separator is present, the name is the part that
        # follows the first one
        if " • " in text:
            text = text.split(" • ")[1].strip()
        names.append(titlecase(text.lower()))
    return names

def _get_download_urls(self):
path = '//h3[@class="feed-item-title"]/a/@href'
return list(self.html.xpath(path))

def _get_precedential_statuses(self):
return ["Published"] * len(self.case_names)

def _get_docket_numbers(self):
    """Collect docket numbers from the feed item headings.

    :return: list with one entry per non-blank text node in a heading; the
        part before the first bullet separator when one is present,
        otherwise the cleaned text as-is
    """
    dockets = []
    for raw in self.html.xpath('//h3[@class="feed-item-title"]//text()'):
        text = clean_if_py3(raw)
        if not text.strip():
            # Whitespace-only text node: nothing to record
            continue
        if not isinstance(text, str):
            text = str(text, encoding="utf-8")
        if " • " in text:
            text = text.split(" • ")[0].strip()
        dockets.append(text)
    return dockets

def _get_summaries(self):
    """Return the text of each feed item body, cut off at "Keywords:".

    :return: list of str, one per feed item body
    """
    bodies = self.html.xpath('//div[@class="feed-item-body"]')
    return [
        clean_if_py3(
            html.tostring(body, method="text", encoding="unicode")
        ).split("Keywords:")[0]
        for body in bodies
    ]

def _get_judges(self):
    """Extract the signing judge's name from each feed item body.

    For every body, the text before "Keywords:" is searched for one of the
    "Signed by ..." prefixes; the text following the first prefix found is
    then trimmed right after the first occurrence of two lower case letters
    followed by a period.

    :return: list of str, one per feed item body; "" when no prefix is
        found or the trimming pattern does not match
    """
    path = '//div[@class="feed-item-body"]'
    splitters = [
        "Signed by Chief Judge",
        "Signed by Judge",
        "Signed by Chief Special Master",  # Vaccine courts have odd names for judges
        "Signed by Special Master",
    ]
    length_of_match = 2
    judges = []
    for e in self.html.xpath(path):
        t = html.tostring(e, method="text", encoding="unicode")
        t = clean_if_py3(t).split("Keywords:")[0]

        # Take the text after the first prefix that occurs in the body
        judge = ""
        for splitter in splitters:
            judge_parts = t.rsplit(splitter)
            if len(judge_parts) > 1:
                judge = judge_parts[1]
                break

        # Often the text looks like: 'Judge Susan G. Braden. (jt1) Copy to
        # parties.' In that case we only want the name, not the rest.
        # Pattern: two lower case letters followed by a period
        m = re.search(r"[a-z]{%s}\." % length_of_match, judge)
        judge = judge[: m.start() + length_of_match] if m else ""

        # str.strip returns a new string; the result has to be kept for the
        # stripping to have any effect
        judges.append(judge.strip("."))
    return judges

def _download_backwards(self, page):
self.url = (
f"http://www.uscfc.uscourts.gov/aggregator/sources/8?page={page}"
self.is_vaccine = "uscfc_vaccine" in self.court_id

def _process_html(self):
"""The site returns a page with all opinions for this time period
The opinions are inside a <script> tag, as a Javascript constant
that will be parsed using json.loads
"""
judges_mapper = {
option.get("value"): option.text_content()
for option in self.html.xpath("//select[@name='judge']//option")
}
judges_mapper.pop("UNKNOWN", "")
judges_mapper.pop("all", "")

raw_data = (
self.html.xpath("//script")[0]
.text_content()
.strip()
.strip("; ")
.split("= ", 1)[1]
)
self.html = self._download()

for opinion in json.loads(raw_data):
docket, name = opinion["title"].split(" &bull; ", 1)

# Append a "V", as seen in the opinion PDFs for the vaccine
# claims. This helps disambiguation in case docket
# numbers collide
if self.is_vaccine and not docket.lower().endswith("v"):
docket += "V"
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think docket numbers should, if possible, resemble the full docket number in the opinion

No. 22-0075V
No. 22-0667V

in this case, by adding leading zeroes


judge = judges_mapper.get(opinion["judge"], "")
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I think we should parse out the full name instead of using the judge field in the json

self.cases.append(
{
"url": opinion["link"],
"summary": opinion["text"],
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I don't think the summary text you identify fits the criteria of what I would classify as a summary (at least for the vaccine court).

summaries: "PUBLIC DECISION (Originally filed: 11/03/2023) regarding [32] DECISION Stipulation/Proffer ( Signed by Chief Special Master Brian H. Corcoran. )(mpj) Service on parties made."

I think it's closer to a disposition, but not really even that; it's more like a document description, which we don't collect in case law.

"date": opinion["date"],
"status": (
"Unpublished"
if opinion["criteria"] == "unreported"
else "Published"
),
"judge": judge,
"name": titlecase(name),
"docket": docket,
}
)
22 changes: 0 additions & 22 deletions juriscraper/opinions/united_states/federal_special/uscfc_u.py

This file was deleted.

Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,4 @@
class Site(uscfc.Site):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = "http://www.uscfc.uscourts.gov/aggregator/sources/7"
self.court_id = self.__module__
self.back_scrape_iterable = [1]

def _download_backwards(self, page):
self.url = (
f"http://www.uscfc.uscourts.gov/aggregator/sources/7?page={page}"
)
self.html = self._download()
self.url = "https://ecf.cofc.uscourts.gov/cgi-bin/CFC_RecentDecisionsOfTheSpecialMasters.pl"

This file was deleted.

Loading
Loading