feat(lactapp_1): dynamic backscraper #1210

Merged · 3 commits · Oct 21, 2024
69 changes: 69 additions & 0 deletions juriscraper/lib/utils.py
@@ -1,6 +1,9 @@
import re
from datetime import date, datetime
from itertools import chain, islice, tee

from juriscraper.AbstractSite import logger

from .string_utils import force_unicode


@@ -51,3 +54,69 @@ def clean_court_object(obj):
        return re.sub(r"\s+,", ",", s)
    else:
        return obj


def backscrape_over_paginated_results(
    url_template: str,
    first_page: int,
    last_page: int,
    start_date: date,
    end_date: date,
    date_fmt: str,
    site,
) -> list[dict]:
    """
    Iterates over consecutive pages, collecting cases within a date range.
    Useful when the source offers no date filter, so every page must be
    examined. Assumes the page returns results ordered by descending date.

    :param url_template: string to apply .format() to, like "url&page={}",
        where the argument passed will be the page number
    :param first_page: number of the first page
    :param last_page: number of the last page
    :param start_date: cases dated after this value will be collected
    :param end_date: cases dated before this value will be collected
    :param date_fmt: date format used to parse case dates
    :param site: the site object

    :return: the list of cases between the dates
    """
    cases = []

    if isinstance(start_date, datetime):
        start_date = start_date.date()
    if isinstance(end_date, datetime):
        end_date = end_date.date()

    for page in range(first_page, last_page):
        site.cases = []  # reset results container
        site.url = url_template.format(page)
        site.html = site._download()
        site._process_html()

        # results are ordered by descending date
        earliest = datetime.strptime(site.cases[-1]["date"], date_fmt).date()
        latest = datetime.strptime(site.cases[0]["date"], date_fmt).date()
        logger.info("Results page has date range %s to %s", earliest, latest)

        # no intersection between date ranges
        if max(earliest, start_date) >= min(latest, end_date):
            # if the earliest date in the results is earlier than
            # the start date, there is no need to iterate any further
            if earliest < start_date:
                logger.info(
                    "Finishing backscrape: earliest results date is %s, earlier than start %s",
                    earliest,
                    start_date,
                )
                break
            continue

        # if there is an intersection, test every case and
        # collect the matching ones
        for case in site.cases:
            case_date = datetime.strptime(case["date"], date_fmt).date()
            if start_date < case_date < end_date:
                cases.append(case)

    return cases
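
The guard max(earliest, start_date) >= min(latest, end_date) is a standard interval-disjointness test: the page's [earliest, latest] range and the query's [start_date, end_date] range overlap only when the later of the two lower bounds falls before the earlier of the two upper bounds. A minimal illustration of that predicate, using hypothetical dates:

from datetime import date

def ranges_intersect(earliest: date, latest: date, start: date, end: date) -> bool:
    # Mirrors the guard in backscrape_over_paginated_results:
    # disjoint when the later start is at or past the earlier end
    return max(earliest, start) < min(latest, end)

# Hypothetical results page spanning March 2024
earliest, latest = date(2024, 3, 1), date(2024, 3, 31)
assert ranges_intersect(earliest, latest, date(2024, 3, 15), date(2024, 4, 15))
assert not ranges_intersect(earliest, latest, date(2024, 4, 1), date(2024, 5, 1))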
49 changes: 21 additions & 28 deletions juriscraper/opinions/united_states/state/lactapp_1.py
@@ -6,24 +6,31 @@
2019-11-24: Created by mmantel
"""

import math
import re
from datetime import date, datetime

from juriscraper.AbstractSite import logger
from juriscraper.lib.html_utils import (
get_row_column_links,
get_row_column_text,
)
from juriscraper.lib.utils import backscrape_over_paginated_results
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


class Site(OpinionSiteLinear):
    first_opinion_date = datetime(2006, 11, 3)
    # Ensure the backscrape iterable has a single item
    days_interval = (datetime.today() - first_opinion_date).days + 2

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.court_id = self.__module__
        self._page_size = 50
        self._base_url = f"https://www.la-fcca.org/opiniongrid/opinionpub.php?opinionpage_size={self._page_size}"
        self.url = self._base_url
        self.back_scrape_iterable = self._generate_back_scrape_range()
        page_size = 50
        self.base_url = f"https://www.la-fcca.org/opiniongrid/opinionpub.php?opinionpage_size={page_size}"
        self.url = self.base_url
        self.make_backscrape_iterable(kwargs)
        self.is_backscrape = False

        # The opinions page does not indicate whether a case is
        # published or unpublished. That is only found in the PDF.
@@ -37,7 +44,7 @@ def _process_html(self):
        for row in self.html.cssselect("#opinion_contentTable tbody tr"):
            self.cases.append(
                {
                    "date": get_row_column_text(row, 1),
                    "date": get_row_column_text(row, 1).replace(" ", ""),
                    "docket": self._parse_docket_numbers(row),
                    "name": get_row_column_text(row, 4),
                    "url": get_row_column_links(row, 3),
@@ -54,25 +61,11 @@ def _parse_docket_numbers(self, row):
        case_numbers = re.findall("[0-9]{4}[A-Z]{2}[0-9]{4}", text)
        return ", ".join(case_numbers)

    def _generate_back_scrape_range(self):
        # This is a generator function, so this code won't run until a
        # caller begins iterating, which is necessary because
        # otherwise this would run during unit tests and trigger an
        # unwanted network request.
        last_page = self._get_last_page_number()

        yield from range(1, last_page + 1)

    def _get_last_page_number(self):
        # The link to the last page has an onclick like:
        # javascript:opinion_doPostBack('paging','','&opinionsort_field=sortdate&opinionsort_field_by=&opinionsort_field_type=&opinionsort_type=DESC&opinionpage_size=50&opinionp=395')
        # where 395 is the last page number.
        html = self._get_html_tree_by_url(self._base_url, {})
        el = html.cssselect("a[title=last]")[0]
        onclick = el.get("onclick")
        return int(re.findall(r"\d+", onclick)[-1])

    def _download_backwards(self, page):
        self.url = self._base_url + ("&opinionp=%d" % page)
        self.html = self._download()
        self._process_html()

    def _download_backwards(self, dates: tuple[date]) -> None:
        logger.info("Backscraping for range %s %s", *dates)
        url_template = f"{self.base_url}&opinionp={{}}"
        start, end = dates
        last_page = 500  # Real last page is 467 in Oct. 2024
        self.cases = backscrape_over_paginated_results(
            url_template, 2, last_page, start, end, "%m/%d/%Y", self
        )
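
Why the "+ 2" in days_interval: assuming the inherited make_backscrape_iterable splits [first_opinion_date, today] into days_interval-sized chunks (an assumption about the base class, not shown in this diff), an interval longer than the whole span collapses the iterable to a single (start, end) tuple. A minimal sketch:

from datetime import datetime

first_opinion_date = datetime(2006, 11, 3)
span = (datetime.today() - first_opinion_date).days
days_interval = span + 2  # as in the class attribute above

# days_interval exceeds the full span, so chunking the range
# [first_opinion_date, today] yields exactly one (start, end) tuple
# and _download_backwards runs a single time
assert days_interval > span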
74 changes: 10 additions & 64 deletions juriscraper/opinions/united_states/state/nd.py
@@ -9,6 +9,7 @@

from juriscraper.AbstractSite import logger
from juriscraper.lib.string_utils import normalize_dashes
from juriscraper.lib.utils import backscrape_over_paginated_results
from juriscraper.OpinionSiteLinear import OpinionSiteLinear


@@ -21,7 +22,9 @@ class Site(OpinionSiteLinear):
"nature_of_suit",
"judge",
]
first_opinion_date = datetime(1955, 10, 25).date()
first_opinion_date = datetime(1955, 10, 25)
# Ensure the backscrape iterable has a single item
days_interval = (datetime.today() - first_opinion_date).days + 2

def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
@@ -158,69 +161,12 @@ def _download_backwards(self, dates: Tuple[date]) -> None:
        :param dates: (start_date, end_date) tuple
        :return None
        """
        logger.info("Backscraping for range %s %s", *dates)
        start, end = dates
        date_fmt = "%m/%d/%Y"
        # last page is 118 (August 2024)
        first_page, last_page = 2, 130
        last_page = 130
        base_url = self.url
        cases = []

        for page in range(first_page, last_page):
            self.cases = []  # reset results container
            self.url = f"{base_url}&page={page}"
            self.html = self._download()
            self._process_html()

            # results are ordered by descending date
            earliest = datetime.strptime(
                self.cases[-1]["date"], date_fmt
            ).date()
            latest = datetime.strptime(self.cases[0]["date"], date_fmt).date()
            logger.info(
                "Results page has date range %s to %s", earliest, latest
            )

            # no intersection between date ranges
            if max(earliest, start) >= min(latest, end):
                # if the earliest date in the results is earlier than
                # the start date, there is no need to iterate any further
                if earliest < start:
                    logger.info(
                        "Finishing backscrape: earliest results date is %s, earlier than start %s",
                        earliest,
                        start,
                    )
                    break
                continue

            # if there is an intersection, test every case and
            # collect the matching cases
            for case in self.cases:
                case_date = datetime.strptime(case["date"], date_fmt).date()
                if case_date < end and case_date > start:
                    cases.append(case)

        self.cases = cases

    def make_backscrape_iterable(self, kwargs: dict) -> None:
        """Checks if backscrape start and end arguments have been passed
        by caller, and parses them accordingly

        :param kwargs: passed when initializing the scraper, may or
            may not contain backscrape controlling arguments
        :return None
        """
        start = kwargs.get("backscrape_start")
        end = kwargs.get("backscrape_end")

        if start:
            start = datetime.strptime(start, "%m/%d/%Y").date()
        else:
            start = self.first_opinion_date
        if end:
            end = datetime.strptime(end, "%m/%d/%Y").date()
        else:
            end = datetime.now().date()

        logger.info("Backscraping for cases between %s and %s", start, end)
        self.back_scrape_iterable = [(start, end)]
        url_template = f"{base_url}&page={{}}"
        self.cases = backscrape_over_paginated_results(
            url_template, 2, last_page, start, end, "%m/%d/%Y", self
        )
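
For reference, a usage sketch of the refactored backscraper — assuming the inherited make_backscrape_iterable is wired into __init__ (as lactapp_1 does above) and still honors the backscrape_start / backscrape_end kwargs that the removed method parsed:

from juriscraper.opinions.united_states.state import nd

# Hypothetical date range; the kwarg names and %m/%d/%Y format come from
# the removed make_backscrape_iterable above
site = nd.Site(backscrape_start="01/01/2020", backscrape_end="12/31/2020")
for date_range in site.back_scrape_iterable:
    site._download_backwards(date_range)  # fills site.cases via the shared helper
print(len(site.cases))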