forked from pennlabs/penn-courses
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
3 changed files
with
100 additions
and
39 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,45 +1,78 @@ | ||
from textwrap import dedent | ||
|
||
from django.core.management.base import BaseCommand | ||
from django.core.management.base import BaseCommand, CommandParser | ||
from django.db import transaction | ||
from degree.models import Rule | ||
from degree.models import Rule, Degree | ||
from degree.serializers import RuleSerializer | ||
import json | ||
from collections import defaultdict | ||
from collections import defaultdict, OrderedDict | ||
import decimal | ||
from tqdm import tqdm | ||
from pprint import pprint | ||
|
||
class DecimalEncoder(json.JSONEncoder):
    """
    JSON encoder that can handle Decimal objects.

    ``decimal.Decimal`` values are emitted as JSON strings; any other type
    the base encoder does not understand is passed on to
    ``json.JSONEncoder.default``.
    """

    def default(self, o):
        """Return a JSON-serializable stand-in for ``o``."""
        if isinstance(o, decimal.Decimal):
            # str() preserves the exact decimal digits (e.g. trailing zeros).
            return str(o)
        return super().default(o)
|
||
def recursively_pop(d: dict, keys: list[str]) -> None:
    """
    Recursively remove keys from a dictionary, in place.

    Every dictionary reachable from ``d`` is cleaned, including dictionaries
    stored inside (possibly nested) lists or tuples, so that nested
    serialized children lose the same keys as the top-level mapping.

    :param d: dictionary to strip; it is mutated.
    :param keys: keys to remove wherever they occur.
    """
    for key in keys:
        d.pop(key, None)

    # Walk the remaining values; sequences are flattened iteratively so that
    # dictionaries held in lists of lists are still reached.
    pending = list(d.values())
    while pending:
        value = pending.pop()
        if isinstance(value, dict):  # also covers OrderedDict (a dict subclass)
            recursively_pop(value, keys)
        elif isinstance(value, (list, tuple)):
            pending.extend(value)
|
||
def deduplicate_rules(verbose=False):
    """
    Delete rules whose serialized content duplicates another rule's.

    Each rule is serialized, stripped of its ``id`` and ``parent`` keys, and
    dumped to canonical JSON (sorted keys). Rules that share the same JSON
    text are treated as duplicates: the first one seen is kept, children and
    degrees that pointed at the others are re-pointed to it, and the others
    are deleted.

    :param verbose: when truthy, show tqdm progress bars.
    :return: the number of objects reported deleted by the ORM.
    """
    # Group by the canonical JSON text itself rather than by hash(text), so a
    # hash collision can never cause two different rules to be merged.
    content_to_rule_ids = defaultdict(list)
    for rule in tqdm(Rule.objects.all(), disable=not verbose, desc="Hashing rules"):
        serialized = RuleSerializer(rule).data  # recursively serializes the rule
        recursively_pop(serialized, keys=["id", "parent"])
        canonical = json.dumps(
            serialized,
            sort_keys=True,
            ensure_ascii=True,
            cls=DecimalEncoder,
        )
        content_to_rule_ids[canonical].append(rule.id)

    delete_count = 0
    for rule_ids in tqdm(
        content_to_rule_ids.values(), disable=not verbose, desc="Deleting duplicates"
    ):
        if len(rule_ids) < 2:
            continue

        keep_id, *duplicate_ids = rule_ids
        print(f"Deleting duplicate rules: {duplicate_ids}")

        # Re-parent children of the duplicates before the duplicates go away.
        Rule.objects.filter(parent_id__in=duplicate_ids).update(parent_id=keep_id)

        # distinct(): a degree linked to several duplicates would otherwise be
        # returned once per matching rule.
        for degree in Degree.objects.filter(rules__in=duplicate_ids).distinct():
            degree.rules.add(keep_id)
            degree.rules.remove(*duplicate_ids)

        # NOTE(review): QuerySet.delete() returns the total number of objects
        # removed, including any cascaded rows, so this may exceed the number
        # of Rule rows deleted -- confirm that is the intended metric.
        deleted, _ = Rule.objects.filter(id__in=duplicate_ids).delete()
        delete_count += deleted

    return delete_count
|
||
class Command(BaseCommand):
    """Management command that removes duplicate degree rules."""

    help = dedent(
        """
        Removes rules that are identical (based on content hash)
        """
    )

    @transaction.atomic
    def handle(self, *args, **kwargs):
        """
        Run the deduplication inside a single transaction and report the count.

        The ``verbosity`` option is forwarded as the ``verbose`` flag of
        ``deduplicate_rules``, so a verbosity of 0 silences progress bars.
        """
        delete_count = deduplicate_rules(verbose=kwargs["verbosity"])
        print(f"Deleted {delete_count} duplicate rules")
|
||
|
||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters