fix(uscfc, uscfc_vaccine): improve parsing

- parse full judge names
- implement extract_from_text for uscfc_vaccine
- improve docket number for uscfc_vaccine
freelawproject · Oct 25, 2024 · 4c12aef · 4c12aef
1 parent 9446881
commit 4c12aef
Show file tree

Hide file tree

Showing 5 changed files with 9,308 additions and 9,262 deletions.
diff --git a/juriscraper/opinions/united_states/federal_special/uscfc.py b/juriscraper/opinions/united_states/federal_special/uscfc.py
@@ -8,12 +8,15 @@
 """
 
 import json
+import re
 
 from juriscraper.lib.string_utils import titlecase
 from juriscraper.OpinionSiteLinear import OpinionSiteLinear
 
 
 class Site(OpinionSiteLinear):
+    judge_regex = re.compile(r"Signed by[\w\s]+(Master|Judge)(?P<judge>.+?)\(")
+
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.url = "https://ecf.cofc.uscourts.gov/cgi-bin/CFC_RecentOpinionsOfTheCourt.pl"
@@ -43,25 +46,38 @@ def _process_html(self):
         for opinion in json.loads(raw_data):
             docket, name = opinion["title"].split(" &bull; ", 1)
 
+            summary = opinion["text"]
+            if judge_match := self.judge_regex.search(summary):
+                judge = judge_match.group("judge").strip(" .()")
+                # Remove: "Signed by ... . Service on parties made"
+                summary = summary[: judge_match.start()].strip(", .()")
+            else:
+                judge = judges_mapper.get(opinion["judge"], "")
+
+            match opinion["criteria"]:
+                case "unreported":
+                    status = "Unpublished"
+                case "reported":
+                    status = "Published"
+                case _:
+                    status = "Unknown"
+
+            parsed_case = {
+                "url": opinion["link"],
+                "date": opinion["date"],
+                "status": status,
+                "summary": summary,
+                "judge": judge,
+                "name": titlecase(name),
+                "docket": docket,
+            }
+
             # Append a "V" as seen in the opinions PDF for the vaccine
             # claims. This will help disambiguation, in case docket
-            # number collide
-            if self.is_vaccine and not docket.lower().endswith("v"):
-                docket += "V"
+            # numbers collide
+            if self.is_vaccine:
+                if not docket.lower().endswith("v"):
+                    yy, number = docket.split("-")
+                    parsed_case["docket"] = f"{yy}-{number.zfill(4)}V"
 
-            judge = judges_mapper.get(opinion["judge"], "")
-            self.cases.append(
-                {
-                    "url": opinion["link"],
-                    "summary": opinion["text"],
-                    "date": opinion["date"],
-                    "status": (
-                        "Unpublished"
-                        if opinion["criteria"] == "unreported"
-                        else "Published"
-                    ),
-                    "judge": judge,
-                    "name": titlecase(name),
-                    "docket": docket,
-                }
-            )
+            self.cases.append(parsed_case)
diff --git a/juriscraper/opinions/united_states/federal_special/uscfc_vaccine.py b/juriscraper/opinions/united_states/federal_special/uscfc_vaccine.py
@@ -9,3 +9,17 @@ class Site(uscfc.Site):
     def __init__(self, *args, **kwargs):
         super().__init__(*args, **kwargs)
         self.url = "https://ecf.cofc.uscourts.gov/cgi-bin/CFC_RecentDecisionsOfTheSpecialMasters.pl"
+
+    def extract_from_text(self, scraped_text: str) -> dict:
+        """Extract 'status' from text, if possible
+
+        On the first page of the opinion, after the parties' and attorneys'
+        names, the decision title may point to it being published.
+
+        The scraped site itself marks all `uscfc_vaccine` opinions as
+        unreported
+
+        :param scraped_text: text extracted from the opinion document
+        :return: a dict setting `precedential_status` to "Published" when the
+            title says so; an empty dict otherwise
+        """
+        # Only the first 1500 characters are inspected, where the title is
+        # expected. "UNPUBLISHED DECISION" contains "PUBLISHED DECISION" as a
+        # substring, so mask it out before testing to avoid labeling
+        # unpublished decisions as published
+        header = scraped_text[:1500].replace("UNPUBLISHED DECISION", "")
+        if "PUBLISHED DECISION" in header:
+            return {"OpinionCluster": {"precedential_status": "Published"}}
+
+        return {}