Skip to content

Commit

Permalink
fix: do not load the last cluster in exemplars
Browse files: browse the repository at this point in the history
  • Loading branch information
danellecline committed Sep 3, 2024
1 parent 091df20 commit bbc3a9a
Showing 1 changed file with 10 additions and 0 deletions.
10 changes: 10 additions & 0 deletions sdcat/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,6 +140,16 @@ def _run_hdbscan_assign(
coverage = 0.0
return avg_sim_scores, exemplar_df, clusters, cluster_means, coverage

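# Remove the last cluster, which is assumed to be the unknown cluster.
# NOTE: in the rare case where the coverage is 100%, the last cluster may not be the unknown cluster,
# and the unconditional removal below would then drop a valid cluster.
# TODO: detect the 100% coverage case and skip the removal.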
max_clusters = len(unique_clusters) - 1
num_before = len(cluster_df)
info(f"Removing {num_before - len(cluster_df[cluster_df['cluster'] != max_clusters])} samples from the unknown cluster")
cluster_df = cluster_df[cluster_df['cluster'] != max_clusters]
num_after = len(cluster_df)
info(f"Number of samples after removing unknown cluster: {num_after}")

cluster_df['score'] = scan.probabilities_
# Get the index of the highest scores for each unique cluster sorted in increasing order
# and use this as a representative image for the cluster
Expand Down

0 comments on commit bbc3a9a

Please sign in to comment.