Skip to content

Commit

Permalink
Merge pull request #128 from openstates/bill-relationships-resolution…
Browse files Browse the repository at this point in the history
…-improved

Bill relationships resolution improved
  • Loading branch information
jessemortenson committed Apr 18, 2024
2 parents 78687fa + 10f594b commit e81dd61
Show file tree
Hide file tree
Showing 3 changed files with 76 additions and 22 deletions.
37 changes: 34 additions & 3 deletions openstates/cli/relationships.py
Original file line number Diff line number Diff line change
@@ -1,20 +1,44 @@
import click
import logging
import logging.config
from typing import Union
from openstates.utils import abbr_to_jid
from ..utils.django import init_django
from ..utils import transformers
from ..exceptions import InternalError
from .. import settings


# Attempt to fix related-bill identifiers in the DB that were NOT normalized when they were first saved.
# A non-normalized identifier can never be matched to a bill.identifier value.
def fix_abnormal_related_bill_identifiers(jurisdiction_id: str) -> None:
    """Normalize identifiers on unresolved RelatedBill rows of a jurisdiction.

    Selects RelatedBill rows attached to bills in ``jurisdiction_id`` that
    have no ``related_bill`` set and whose ``identifier`` contains no space,
    passes each identifier through ``transformers.fix_bill_id`` and saves the
    row only when the identifier actually changed.

    Args:
        jurisdiction_id: value matched against
            ``bill__legislative_session__jurisdiction_id``.
    """
    # import of model has to be after django_init
    from ..data.models import RelatedBill

    abnormal_unresolved_rb = RelatedBill.objects.filter(
        bill__legislative_session__jurisdiction_id=jurisdiction_id,
        related_bill=None,
    ).exclude(identifier__contains=" ")
    for rb in abnormal_unresolved_rb:
        new_identifier = transformers.fix_bill_id(rb.identifier)
        # Compare by value, not identity: `is not` is true for any freshly
        # built string even when its text is unchanged, which would trigger
        # a pointless UPDATE for every row.
        if new_identifier != rb.identifier:
            # update this related bill row with normalized identifier
            rb.identifier = new_identifier
            rb.save()


@click.command(help="Resolve unresolved relationships between entities")
@click.argument("jurisdiction_abbreviation")
@click.option(
"--log_level",
help="Set the level of logging to output.",
default="INFO"
)
def main(jurisdiction_abbreviation: str, log_level: str) -> None:
@click.option(
"--session",
help="Session identifier, used to restrict resolution to within a specific session",
default=None
)
def main(jurisdiction_abbreviation: str, log_level: str, session: Union[str, None]) -> None:
# set up logging
logger = logging.getLogger("openstates")
handler_level = log_level
Expand All @@ -26,10 +50,17 @@ def main(jurisdiction_abbreviation: str, log_level: str) -> None:
init_django()
from openstates.importers import resolve_related_bills

logger.info(f"Beginning resolution of bill relationships for {jurisdiction_abbreviation}")
logger.info(f"Beginning resolution of bill relationships for {jurisdiction_abbreviation}, session: {session}")
jurisdiction_id = abbr_to_jid(jurisdiction_abbreviation)

# Prep: resolve any non-normalized bill identifiers in related bill data
# i.e., if a RelatedBill has an identifier like "A1675" instead of "A 1675", it can't be matched to a real bill
# (this was a historical problem only fixed in mid 2024)
fix_abnormal_related_bill_identifiers(jurisdiction_id)

# Run the resolution logic
try:
resolve_related_bills(jurisdiction_id, logger)
resolve_related_bills(jurisdiction_id, session, logger)
except InternalError as e:
logger.error(f"Error during bill relationship resolution for {jurisdiction_abbreviation}: {e}")

Expand Down
56 changes: 38 additions & 18 deletions openstates/importers/bills.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
from typing import Union
from .base import BaseImporter
from ._types import _JsonDict, Model
from ..exceptions import InternalError
from ..data.models import (
Bill,
RelatedBill,
Expand All @@ -20,33 +20,53 @@
from .organizations import OrganizationImporter


def resolve_related_bills(jurisdiction_id, logger) -> None:
def resolve_related_bills(jurisdiction_id: str, session: Union[str, None], logger) -> None:
# go through all RelatedBill objs that are attached to a bill in this jurisdiction and
# are currently unresolved
related_bills = RelatedBill.objects.filter(
bill__legislative_session__jurisdiction_id=jurisdiction_id,
related_bill=None,
)
logger.info(f"Found {len(related_bills)} unresolved bill relationships")
matches_found = 0
if session is not None:
session_log = f"-{session}"
related_bills = RelatedBill.objects.filter(
bill__legislative_session__jurisdiction_id=jurisdiction_id,
bill__legislative_session__identifier=session,
related_bill=None,
)
else:
session_log = ""
related_bills = RelatedBill.objects.filter(
bill__legislative_session__jurisdiction_id=jurisdiction_id,
related_bill=None,
)
logger.info(f"Found {len(related_bills)} unresolved bill relationships in {jurisdiction_id}{session_log}")

# go session-by-session and see if we can find matching candidates
# we do this to reduce the number of SELECT queries we run in cases where there are many relations unresolved
sessions = {}
for rb in related_bills:
if rb.legislative_session not in sessions:
sessions[rb.legislative_session] = [rb.identifier]
else:
sessions[rb.legislative_session].append(rb.identifier)

session_candidate_bills = {}
for session in dict.keys(sessions):
candidates = list(
Bill.objects.filter(
legislative_session__identifier=rb.legislative_session,
identifier__in=sessions[session],
legislative_session__identifier=session,
legislative_session__jurisdiction_id=jurisdiction_id,
identifier=rb.identifier,
)
)
if len(candidates) == 1:
rb.related_bill = candidates[0]
session_candidate_bills[session] = {}
for bill in candidates:
session_candidate_bills[session][bill.identifier] = bill

matches_found = 0
for rb in related_bills:
if rb.identifier in session_candidate_bills[rb.legislative_session]:
rb.related_bill = session_candidate_bills[rb.legislative_session][rb.identifier]
rb.save()
matches_found += 1
logger.debug(f"Resolved {rb.legislative_session} {rb.bill.identifier}")
elif len(candidates) > 1: # pragma: no cover
# if we ever see this, we need to add additional fields on the relation
raise InternalError(
"multiple related_bill candidates found for {}".format(rb)
)
else:
logger.debug(f"FAILED to resolve {rb.legislative_session} {rb.bill.identifier}")

Expand Down Expand Up @@ -139,7 +159,7 @@ def prepare_for_db(self, data: _JsonDict) -> _JsonDict:
return data

def postimport(self) -> None:
resolve_related_bills(self.jurisdiction_id, self.logger)
resolve_related_bills(self.jurisdiction_id, None, self.logger)

def update_computed_fields(self, obj: Model) -> None:
update_bill_fields(obj, save=False)
5 changes: 4 additions & 1 deletion openstates/scrape/bill.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
import warnings
from ..utils import _make_pseudo_id
from ..utils import _make_pseudo_id, transformers
from .popolo import pseudo_organization
from .base import BaseModel, SourceMixin, AssociatedLinkMixin, cleanup_list
from .schemas.bill import schema
Expand Down Expand Up @@ -90,6 +90,9 @@ def add_citation(
)

def add_related_bill(self, identifier, legislative_session, relation_type):
# Normalize identifier before saving
identifier = transformers.fix_bill_id(identifier)

# will we need jurisdiction, organization?
self.related_bills.append(
{
Expand Down

0 comments on commit e81dd61

Please sign in to comment.