From 90b3e35292623d4a3422dcf286b0a595cd7bd329 Mon Sep 17 00:00:00 2001 From: aadalal <57609353+AaDalal@users.noreply.github.com> Date: Mon, 5 Feb 2024 22:10:11 -0500 Subject: [PATCH] deduplication command --- .../management/commands/deduplicate_rules.py | 89 +++++++++++++------ .../management/commands/fetch_degrees.py | 26 ++++-- .../management/commands/load_degrees.py | 24 ++++- 3 files changed, 100 insertions(+), 39 deletions(-) diff --git a/backend/degree/management/commands/deduplicate_rules.py b/backend/degree/management/commands/deduplicate_rules.py index 5a1679f59..5ae0f2945 100644 --- a/backend/degree/management/commands/deduplicate_rules.py +++ b/backend/degree/management/commands/deduplicate_rules.py @@ -1,45 +1,78 @@ from textwrap import dedent -from django.core.management.base import BaseCommand +from django.core.management.base import BaseCommand, CommandParser from django.db import transaction -from degree.models import Rule +from degree.models import Rule, Degree from degree.serializers import RuleSerializer import json -from collections import defaultdict +from collections import defaultdict, OrderedDict +import decimal +from tqdm import tqdm +from pprint import pprint +class DecimalEncoder(json.JSONEncoder): + """ + JSON encoder that can handle Decimal objects + """ + def default(self, o): + if isinstance(o, decimal.Decimal): + return str(o) + return super().default(o) + +def recursively_pop(d: dict(), keys: list[str]) -> None: + """ + Recursively remove keys from a dictionary + """ + for key in keys: + if key in d: + d.pop(key) + for value in d.values(): + if isinstance(value, dict) or isinstance(value, OrderedDict): + recursively_pop(value, keys) + +def deduplicate_rules(verbose=False): + rule_to_hash = dict() + for rule in tqdm(Rule.objects.all(), disable=not verbose, desc="Hashing rules"): + serialized = RuleSerializer(rule).data # recursively serializes the rule + recursively_pop(serialized, keys=["id", "parent"]) + pprint("Serialized rules:") + pprint(serialized) + rule_to_hash[rule.id] = hash(json.dumps( + serialized, + sort_keys=True, + ensure_ascii=True, + cls=DecimalEncoder, + )) + + hash_to_rule = defaultdict(list) + for rule_id, hashed in rule_to_hash.items(): + hash_to_rule[hashed].append(rule_id) + + delete_count = 0 + for rule_ids in tqdm(hash_to_rule.values(), disable=not verbose, desc="Deleting duplicates"): + if len(rule_ids) > 1: + print(f"Deleting duplicate rules: {rule_ids[1:]}") + Rule.objects.filter(parent_id__in=rule_ids[1:]).update(parent_id=rule_ids[0]) + for degree in Degree.objects.filter(rules__in=rule_ids[1:]): + degree.rules.add(rule_ids[0]) + degree.rules.remove(*rule_ids[1:]) + deleted, _ = Rule.objects.filter(id__in=rule_ids[1:]).delete() + delete_count += deleted + + return delete_count + class Command(BaseCommand): help = dedent( """ - Removes rules that are identical (based on content hash except for rule ids) + Removes rules that are identical (based on content hash) """ ) @transaction.atomic def handle(self, *args, **kwargs): - # get toposort of rules - rules = list(Rule.objects.filter(parent=None).order_by('id')) - - # serialize rules to fixed format - rules = { - rule.id: hash(json.dumps( - RuleSerializer(rule).data, - sort_keys=True, - ensure_ascii=True - )) - for rule in rules - } - - # invert rules - inverted_rules = defaultdict(list) - for rule, hash in rules.items(): - inverted_rules[hash].append(rule) - - # fold the rules - for hash, rule_ids in inverted_rules.items(): - if len(rule_ids) > 1: - print(f"Removing rules {rule_ids[1:]}") - Rule.objects.filter(id__in=rule_ids[1:]).values_list("parent_id", flat=True).update(parent_id=rule_ids[0] - + delete_count = deduplicate_rules(verbose=kwargs["verbosity"]) + print(f"Deleted {delete_count} duplicate rules") + diff --git a/backend/degree/management/commands/fetch_degrees.py b/backend/degree/management/commands/fetch_degrees.py index b74457968..5f21b63b7 100644 --- a/backend/degree/management/commands/fetch_degrees.py +++ b/backend/degree/management/commands/fetch_degrees.py @@ -3,6 +3,7 @@ from django.core.management.base import BaseCommand from django.db import transaction +from degree.management.commands.deduplicate_rules import deduplicate_rules from courses.util import get_current_semester from degree.models import Degree, program_code_to_name @@ -46,6 +47,11 @@ def add_arguments(self, parser): ), ) + parser.add_argument( + "--deduplicate-rules", + action="store_true", + ) + def handle(self, *args, **kwargs): print( dedent( @@ -68,10 +74,11 @@ def handle(self, *args, **kwargs): name = getenv("NAME") assert name is not None - print("Using Penn ID:", pennid) - print("Using Auth Token:", auth_token) - print("Using Refresh Token:", refresh_token) - print("Using Name:", name) + if kwargs["verbosity"]: + print("Using Penn ID:", pennid) + print("Using Auth Token:", auth_token) + print("Using Refresh Token:", refresh_token) + print("Using Name:", name) client = DegreeworksClient( pennid=pennid, auth_token=auth_token, refresh_token=refresh_token, name=name @@ -90,7 +97,12 @@ def handle(self, *args, **kwargs): concentration=degree.concentration, year=degree.year, ).delete() - degree.save() - print(f"Saving degree {degree}...") - parse_and_save_degreeworks(client.audit(degree), degree) \ No newline at end of file + if kwargs["verbosity"]: + print(f"Saving degree {degree}...") + parse_and_save_degreeworks(client.audit(degree), degree) + + if kwargs["deduplicate_rules"]: + if kwargs["verbosity"]: + print("Deduplicating rules...") + deduplicate_rules(verbose=kwargs["verbosity"]) \ No newline at end of file diff --git a/backend/degree/management/commands/load_degrees.py b/backend/degree/management/commands/load_degrees.py index 446b508f0..0ce99d20f 100644 --- a/backend/degree/management/commands/load_degrees.py +++ b/backend/degree/management/commands/load_degrees.py @@ -8,7 +8,7 @@ from degree.models import Degree, program_code_to_name from degree.utils.parse_degreeworks import parse_and_save_degreeworks - +from degree.management.commands.deduplicate_rules import deduplicate_rules class Command(BaseCommand): help = dedent( @@ -31,6 +31,13 @@ def add_arguments(self, parser): ), ) + parser.add_argument( + "--deduplicate-rules", + action="store_true", + ) + + super().add_arguments(parser) + def handle(self, *args, **kwargs): directory = kwargs["directory"] assert path.isdir(directory), f"{directory} is not a directory" @@ -40,9 +47,12 @@ def handle(self, *args, **kwargs): r"(\d+)-(\w+)-(\w+)-(\w+)(?:-(\w+))?", degree_file ).groups() if program not in program_code_to_name: - print(f"Skipping {degree_file} because {program} is not an applicable program code") + if kwargs["verbosity"]: + print(f"Skipping {degree_file} because {program} is not an applicable program code") continue - print("Loading", degree_file, "...") + + if kwargs["verbosity"]: + print("Loading", degree_file, "...") with transaction.atomic(): Degree.objects.filter( @@ -65,6 +75,12 @@ def handle(self, *args, **kwargs): with open(path.join(directory, degree_file)) as f: degree_json = json.load(f) - print(f"Parsing and saving degree {degree}...") + if kwargs["verbosity"]: + print(f"Parsing and saving degree {degree}...") parse_and_save_degreeworks(degree_json, degree) + + if kwargs["deduplicate_rules"]: + if kwargs["verbosity"]: + print("Deduplicating rules...") + deduplicate_rules(verbose=kwargs["verbosity"]) \ No newline at end of file