Skip to content

Commit

Permalink
fix: correct handling of single cluster
Browse files Browse the repository at this point in the history
  • Loading branch information
danellecline committed Jul 31, 2024
1 parent 638254b commit f6ede19
Showing 1 changed file with 7 additions and 7 deletions.
14 changes: 7 additions & 7 deletions sdcat/cluster/cluster.py
Original file line number Diff line number Diff line change
Expand Up @@ -124,13 +124,6 @@ def _run_hdbscan_assign(
unique_clusters.sort()
info(f"Number of clusters including unassigned -1 cluster: {len(unique_clusters)}")

cluster_df['score'] = scan.probabilities_
# Get the index of the highest scores for each unique cluster sorted in increasing order
# and use this as a representative image for the cluster
max_scores = cluster_df.sort_values('cluster', ascending=True).groupby('cluster')['score'].idxmax()
# Remove the last index which is the -1 cluster
max_scores = max_scores[:-1]

# If every sample is unassigned (the only cluster is -1), use all the samples as exemplars
# and assign them to the unknown cluster. This is also the case when the embeddings are empty (embedding extraction failed).
if len(unique_clusters) == 1 and unique_clusters[0] == -1:
Expand All @@ -144,6 +137,13 @@ def _run_hdbscan_assign(
coverage = 0.0
return avg_sim_scores, exemplar_df, clusters, cluster_means, coverage

cluster_df['score'] = scan.probabilities_
# Get the index of the highest scores for each unique cluster sorted in increasing order
# and use this as a representative image for the cluster
max_scores = cluster_df.sort_values('cluster', ascending=True).groupby('cluster')['score'].idxmax()
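# Remove the last index, which is assumed to be the -1 cluster (NOTE(review): with an ascending sort, groupby would normally order -1 first; confirm this slice drops the intended entry)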
max_scores = max_scores[:-1]

# Get the representative embeddings for the max scoring exemplars for each cluster and store them in a numpy array
exemplar_emb = [image_emb[i] for i in max_scores]
exemplar_emb = np.array(exemplar_emb)
Expand Down

0 comments on commit f6ede19

Please sign in to comment.