diff --git a/sdcat/cluster/cluster.py b/sdcat/cluster/cluster.py index e8bb30c..ec55231 100755 --- a/sdcat/cluster/cluster.py +++ b/sdcat/cluster/cluster.py @@ -140,6 +140,16 @@ def _run_hdbscan_assign( coverage = 0.0 return avg_sim_scores, exemplar_df, clusters, cluster_means, coverage + # Remove the last cluster which is the unknown cluster + # Note that in the rare case where the coverage is 100%, the last cluster may not be the unknown cluster! + # TODO: how to handle this case? + max_clusters = len(unique_clusters) - 1 + num_before = len(cluster_df) + info(f"Removing {num_before - len(cluster_df[cluster_df['cluster'] != max_clusters])} samples from the unknown cluster") + cluster_df = cluster_df[cluster_df['cluster'] != max_clusters] + num_after = len(cluster_df) + info(f"Number of samples after removing unknown cluster: {num_after}") + cluster_df['score'] = scan.probabilities_ # Get the index of the highest scores for each unique cluster sorted in increasing order # and use this as a representative image for the cluster