Skip to content

Commit

Permalink
fix(uscfc, uscfc_vaccine): improve parsing
Browse files Browse the repository at this point in the history
- parse full judge names
- implement extract_from_text for uscfc_vaccine
- improve docket number for uscfc_vaccine
  • Loading branch information
grossir committed Oct 25, 2024
1 parent 9446881 commit 4c12aef
Show file tree
Hide file tree
Showing 5 changed files with 9,308 additions and 9,262 deletions.
54 changes: 35 additions & 19 deletions juriscraper/opinions/united_states/federal_special/uscfc.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,15 @@
"""

import json
import re

from juriscraper.lib.string_utils import titlecase
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
judge_regex = re.compile(r"Signed by[\w\s]+(Master|Judge)(?P<judge>.+?)\(")

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = "https://ecf.cofc.uscourts.gov/cgi-bin/CFC_RecentOpinionsOfTheCourt.pl"
Expand Down Expand Up @@ -43,25 +46,38 @@ def _process_html(self):
for opinion in json.loads(raw_data):
docket, name = opinion["title"].split(" &bull; ", 1)

summary = opinion["text"]
if judge_match := self.judge_regex.search(summary):
judge = judge_match.group("judge").strip(" .()")
# Remove: "Signed by ... . Service on parties made"
summary = summary[: judge_match.start()].strip(", .()")
else:
judge = judges_mapper.get(opinion["judge"], "")

match opinion["criteria"]:
case "unreported":
status = "Unpublished"
case "reported":
status = "Published"
case _:
status = "Unknown"

parsed_case = {
"url": opinion["link"],
"date": opinion["date"],
"status": status,
"summary": summary,
"judge": judge,
"name": titlecase(name),
"docket": docket,
}

# Append a "V" as seen in the opinions PDF for the vaccine
# claims. This will help disambiguation, in case docket
# number collide
if self.is_vaccine and not docket.lower().endswith("v"):
docket += "V"
# numbers collide
if self.is_vaccine:
if not docket.lower().endswith("v"):
yy, number = docket.split("-")
parsed_case["docket"] = f"{yy}-{number.zfill(4)}V"

judge = judges_mapper.get(opinion["judge"], "")
self.cases.append(
{
"url": opinion["link"],
"summary": opinion["text"],
"date": opinion["date"],
"status": (
"Unpublished"
if opinion["criteria"] == "unreported"
else "Published"
),
"judge": judge,
"name": titlecase(name),
"docket": docket,
}
)
self.cases.append(parsed_case)
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,17 @@ class Site(uscfc.Site):
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
self.url = "https://ecf.cofc.uscourts.gov/cgi-bin/CFC_RecentDecisionsOfTheSpecialMasters.pl"

def extract_from_text(self, scraped_text: str) -> dict:
    """Extract the precedential status from the opinion text, if possible.

    On the first page of the opinion, after the parties' and attorneys'
    names, the decision title may point to it being published.
    The scraped site itself marks all `uscfc_vaccine` opinions as
    unreported.

    :param scraped_text: text extracted from the opinion document
    :return: a dict setting `precedential_status` to "Published" under
        the "OpinionCluster" key when the title says so; an empty dict
        otherwise
    """
    # Imported locally so this method does not rely on a module-level
    # `import re` being present in the file.
    import re

    # Only the first 1500 characters are inspected. The leading word
    # boundary is required: a plain substring test would also match
    # "UNPUBLISHED DECISION" and wrongly flag it as published.
    if re.search(r"\bPUBLISHED DECISION", scraped_text[:1500]):
        return {"OpinionCluster": {"precedential_status": "Published"}}

    return {}
Loading

0 comments on commit 4c12aef

Please sign in to comment.