Skip to content

Commit

Permalink
deduplication command
Browse files Browse the repository at this point in the history
  • Loading branch information
AaDalal committed Feb 6, 2024
1 parent b2bba1e commit 90b3e35
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 39 deletions.
89 changes: 61 additions & 28 deletions backend/degree/management/commands/deduplicate_rules.py
Original file line number Diff line number Diff line change
@@ -1,45 +1,78 @@
from textwrap import dedent

from django.core.management.base import BaseCommand
from django.core.management.base import BaseCommand, CommandParser
from django.db import transaction
from degree.models import Rule
from degree.models import Rule, Degree
from degree.serializers import RuleSerializer
import json
from collections import defaultdict
from collections import defaultdict, OrderedDict
import decimal
from tqdm import tqdm
from pprint import pprint

class DecimalEncoder(json.JSONEncoder):
    """
    json.JSONEncoder subclass that writes decimal.Decimal values as strings.

    Anything else the stock encoder cannot handle is passed to the base
    class, which raises TypeError.
    """

    def default(self, o):
        # Guard clause: only Decimal gets special treatment.
        if not isinstance(o, decimal.Decimal):
            return super().default(o)
        return str(o)

def recursively_pop(d: dict, keys: list[str]) -> None:
    """
    Recursively remove keys from a dictionary, in place.

    Every key in `keys` is dropped from `d` and from every dictionary nested
    inside it, including dictionaries held in (possibly nested) lists or
    tuples, such as a serialized list of child objects.

    :param d: dictionary to strip; it is mutated in place.
    :param keys: keys to remove wherever they occur.
    """
    for key in keys:
        # pop with a default so that a missing key is not an error
        d.pop(key, None)

    for value in d.values():
        if isinstance(value, dict):  # also covers OrderedDict, a dict subclass
            recursively_pop(value, keys)
        elif isinstance(value, (list, tuple)):
            # Walk sequences so dictionaries stored inside them are stripped
            # too; an explicit stack handles sequences nested in sequences.
            pending = list(value)
            while pending:
                item = pending.pop()
                if isinstance(item, dict):
                    recursively_pop(item, keys)
                elif isinstance(item, (list, tuple)):
                    pending.extend(item)

def deduplicate_rules(verbose=False):
    """
    Collapse rules whose serialized content is identical into a single rule.

    Each rule is serialized with RuleSerializer, stripped of its "id" and
    "parent" keys, and dumped to canonical JSON (sorted keys, ASCII only).
    Rules that produce the same JSON text are treated as duplicates: the
    first id seen in each group is kept, rules whose parent was one of the
    other ids and degrees that referenced them are re-pointed at the kept
    rule, and the other rules are deleted.

    :param verbose: when truthy, show tqdm progress bars and print each
        group of rule ids that is deleted.
    :return: the sum of the object counts returned by each
        QuerySet.delete() call.
    """
    # Group on the canonical JSON text itself rather than on hash() of it, so
    # two different rules can never be merged because of a hash collision.
    ids_by_content = defaultdict(list)
    for rule in tqdm(Rule.objects.all(), disable=not verbose, desc="Hashing rules"):
        serialized = RuleSerializer(rule).data  # recursively serializes the rule
        recursively_pop(serialized, keys=["id", "parent"])
        content = json.dumps(
            serialized,
            sort_keys=True,
            ensure_ascii=True,
            cls=DecimalEncoder,
        )
        ids_by_content[content].append(rule.id)

    delete_count = 0
    for rule_ids in tqdm(
        ids_by_content.values(), disable=not verbose, desc="Deleting duplicates"
    ):
        if len(rule_ids) < 2:
            continue
        kept_id, duplicate_ids = rule_ids[0], rule_ids[1:]
        if verbose:
            print(f"Deleting duplicate rules: {duplicate_ids}")

        # Re-point children of the duplicates at the rule that is kept.
        # NOTE(review): a duplicate that is itself a child rule is deleted
        # below without its own parent being linked to kept_id -- confirm
        # that this is intended for non-top-level rules.
        Rule.objects.filter(parent_id__in=duplicate_ids).update(parent_id=kept_id)

        # distinct(): a degree that references several of the duplicates would
        # otherwise come back once per matching rule.
        for degree in Degree.objects.filter(rules__in=duplicate_ids).distinct():
            degree.rules.add(kept_id)
            degree.rules.remove(*duplicate_ids)

        deleted, _ = Rule.objects.filter(id__in=duplicate_ids).delete()
        delete_count += deleted

    return delete_count

class Command(BaseCommand):
    """Management command that runs deduplicate_rules and reports the result."""

    help = dedent(
        """
        Removes rules that are identical (based on content hash)
        """
    )

    @transaction.atomic
    def handle(self, *args, **kwargs):
        """
        Run the deduplication inside a single transaction and print the count.

        The command's "verbosity" option is passed through as the `verbose`
        flag of deduplicate_rules.
        """
        delete_count = deduplicate_rules(verbose=kwargs["verbosity"])
        print(f"Deleted {delete_count} duplicate rules")




26 changes: 19 additions & 7 deletions backend/degree/management/commands/fetch_degrees.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@

from django.core.management.base import BaseCommand
from django.db import transaction
from degree.management.commands.deduplicate_rules import deduplicate_rules

from courses.util import get_current_semester
from degree.models import Degree, program_code_to_name
Expand Down Expand Up @@ -46,6 +47,11 @@ def add_arguments(self, parser):
),
)

parser.add_argument(
"--deduplicate-rules",
action="store_true",
)

def handle(self, *args, **kwargs):
print(
dedent(
Expand All @@ -68,10 +74,11 @@ def handle(self, *args, **kwargs):
name = getenv("NAME")
assert name is not None

print("Using Penn ID:", pennid)
print("Using Auth Token:", auth_token)
print("Using Refresh Token:", refresh_token)
print("Using Name:", name)
if kwargs["verbosity"]:
print("Using Penn ID:", pennid)
print("Using Auth Token:", auth_token)
print("Using Refresh Token:", refresh_token)
print("Using Name:", name)

client = DegreeworksClient(
pennid=pennid, auth_token=auth_token, refresh_token=refresh_token, name=name
Expand All @@ -90,7 +97,12 @@ def handle(self, *args, **kwargs):
concentration=degree.concentration,
year=degree.year,
).delete()

degree.save()
print(f"Saving degree {degree}...")
parse_and_save_degreeworks(client.audit(degree), degree)
if kwargs["verbosity"]:
print(f"Saving degree {degree}...")
parse_and_save_degreeworks(client.audit(degree), degree)

if kwargs["deduplicate_rules"]:
if kwargs["verbosity"]:
print("Deduplicating rules...")
deduplicate_rules(verbose=kwargs["verbosity"])
24 changes: 20 additions & 4 deletions backend/degree/management/commands/load_degrees.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@

from degree.models import Degree, program_code_to_name
from degree.utils.parse_degreeworks import parse_and_save_degreeworks

from degree.management.commands.deduplicate_rules import deduplicate_rules

class Command(BaseCommand):
help = dedent(
Expand All @@ -31,6 +31,13 @@ def add_arguments(self, parser):
),
)

parser.add_argument(
"--deduplicate-rules",
action="store_true",
)

super().add_arguments(parser)

def handle(self, *args, **kwargs):
directory = kwargs["directory"]
assert path.isdir(directory), f"{directory} is not a directory"
Expand All @@ -40,9 +47,12 @@ def handle(self, *args, **kwargs):
r"(\d+)-(\w+)-(\w+)-(\w+)(?:-(\w+))?", degree_file
).groups()
if program not in program_code_to_name:
print(f"Skipping {degree_file} because {program} is not an applicable program code")
if kwargs["verbosity"]:
print(f"Skipping {degree_file} because {program} is not an applicable program code")
continue
print("Loading", degree_file, "...")

if kwargs["verbosity"]:
print("Loading", degree_file, "...")

with transaction.atomic():
Degree.objects.filter(
Expand All @@ -65,6 +75,12 @@ def handle(self, *args, **kwargs):
with open(path.join(directory, degree_file)) as f:
degree_json = json.load(f)

print(f"Parsing and saving degree {degree}...")
if kwargs["verbosity"]:
print(f"Parsing and saving degree {degree}...")
parse_and_save_degreeworks(degree_json, degree)

if kwargs["deduplicate_rules"]:
if kwargs["verbosity"]:
print("Deduplicating rules...")
deduplicate_rules(verbose=kwargs["verbosity"])

0 comments on commit 90b3e35

Please sign in to comment.