Skip to content

Commit

Permalink
fix: move last cluster removal to exemplars only
Browse files (browse the repository at this point in the history)
  • Loading branch information
danellecline committed Sep 5, 2024
1 parent 1d15d3d commit 96fc5db
Showing 1 changed file with 11 additions and 11 deletions.
22 changes: 11 additions & 11 deletions sdcat/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,24 +140,14 @@ def _run_hdbscan_assign(
coverage = 0.0
return avg_sim_scores, exemplar_df, clusters, cluster_means, coverage

# Remove the last cluster which is the unknown cluster
# Note that in the rare case where the coverage is 100%, the last cluster may not be the unknown cluster!
# TODO: how to handle this case?
max_clusters = len(unique_clusters) - 1
num_before = len(cluster_df)
info(f"Removing {num_before - len(cluster_df[cluster_df['cluster'] != max_clusters])} samples from the unknown cluster")
cluster_df = cluster_df[cluster_df['cluster'] != max_clusters]
num_after = len(cluster_df)
info(f"Number of samples after removing unknown cluster: {num_after}")

cluster_df['score'] = scan.probabilities_
# Get the index of the highest scores for each unique cluster sorted in increasing order
# and use this as a representative image for the cluster
max_scores = cluster_df.sort_values('cluster', ascending=True).groupby('cluster')['score'].idxmax()
# Remove the last index which is the -1 cluster
max_scores = max_scores[:-1]

# Get the representative embeddings for the max scoring examplars for each cluster and store them in a numpy array
# Get the representative embeddings for the max scoring exemplars for each cluster and store them in a numpy array
exemplar_emb = [image_emb[i] for i in max_scores]
exemplar_emb = np.array(exemplar_emb)

Expand All @@ -168,6 +158,16 @@ def _run_hdbscan_assign(
exemplar_df['image_path'] = ancillary_df.iloc[max_scores]['image_path'].tolist()
exemplar_df['embedding'] = exemplar_emb.tolist()

# Remove the last cluster which is the unknown cluster
# Note that in the rare case where the coverage is 100%, the last cluster may not be the unknown cluster!
# TODO: how to handle this case?
max_clusters = len(unique_clusters) - 1
num_before = len(exemplar_df)
info(f"Removing {num_before - len(exemplar_df[exemplar_df['cluster'] != max_clusters])} samples from the unknown cluster")
exemplar_df = exemplar_df[exemplar_df['cluster'] != max_clusters]
num_after = len(exemplar_df)
info(f"Number of samples after removing unknown cluster: {num_after}")

# Reassign the unknowns to the closest cluster - this is only needed if the coverage is less than 1
clustered = labels >= 0
coverage = np.sum(clustered) / num_samples
Expand Down

0 comments on commit 96fc5db

Please sign in to comment.