Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Overlapping cluster stats #21

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions clusterers/clustering_stats.cc
Original file line number Diff line number Diff line change
Expand Up @@ -71,14 +71,20 @@ absl::StatusOr<ClusteringStatistics> GetStats(const GbbsGraph& graph,
ComputeEdgeDensity(graph, clustering, &clustering_stats, cluster_ids, clustering_stats_config);
auto end_edge_density = std::chrono::steady_clock::now();
PrintTime(end_diameter, end_edge_density, "Compute EdgeDensity");
ComputeEdgeDensityOverlap(graph, clustering, &clustering_stats, cluster_ids, clustering_stats_config);
auto end_edge_density_overlap = std::chrono::steady_clock::now();
PrintTime(end_edge_density, end_edge_density_overlap, "Compute EdgeDensityOverlap");
ComputeTriangleDensity(graph, clustering, &clustering_stats, cluster_ids, clustering_stats_config);
auto end_triangle_density = std::chrono::steady_clock::now();
PrintTime(end_edge_density, end_triangle_density, "Compute Triangle Density");
PrintTime(end_edge_density_overlap, end_triangle_density, "Compute Triangle Density");
ComputeTriangleDensityOverlap(graph, clustering, &clustering_stats, cluster_ids, clustering_stats_config);
auto end_triangle_density_overlap = std::chrono::steady_clock::now();
PrintTime(end_triangle_density, end_triangle_density_overlap, "Compute Triangle Density");

size_t n = graph.Graph()->n;
ComputeARI(n, clustering, &clustering_stats, communities, clustering_stats_config);
auto end_ari = std::chrono::steady_clock::now();
PrintTime(end_triangle_density, end_ari, "Compute ARI");
PrintTime(end_triangle_density_overlap, end_ari, "Compute ARI");
ComputeNMI(n, clustering, &clustering_stats, communities, clustering_stats_config);
auto end_nmi = std::chrono::steady_clock::now();
PrintTime(end_ari, end_nmi, "Compute NMI");
Expand Down
4 changes: 4 additions & 0 deletions clusterers/clustering_stats.proto
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,8 @@ message ClusteringStatsConfig {
optional bool compute_precision_recall = 9;
optional bool compute_nmi = 10;
optional double f_score_param = 11;
optional bool compute_edge_density_overlap = 12;
optional bool compute_triangle_density_overlap = 13;
}

message DistributionStats {
Expand Down Expand Up @@ -44,4 +46,6 @@ message ClusteringStatistics {
optional double f_score_param = 31;
optional double weighted_edge_density_mean = 32;
optional double weighted_triangle_density_mean = 33;
optional double weighted_edge_density_overlap_mean = 34;
optional double weighted_triangle_density_overlap_mean = 35;
}
134 changes: 134 additions & 0 deletions clusterers/stats/stats_density.h
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,72 @@ inline absl::Status ComputeEdgeDensity(const GbbsGraph& graph,
return absl::OkStatus();
}

// Computes the node-averaged edge density for a clustering whose clusters may
// overlap, and stores it in `weighted_edge_density_overlap_mean`.
//
// For each cluster, density = (edges reported by get_subgraph_num_edges) /
// (size * (size - 1)); clusters with fewer than two nodes get density 0.
// Each node then takes the mean density of the clusters that list it, and the
// final statistic is the sum of those per-node means divided by the number of
// nodes in the graph (nodes listed in no cluster contribute 0).
//
// Does nothing unless `compute_edge_density_overlap` is set in the config.
// `cluster_ids` is not read here; labels are rebuilt per cluster because a
// node may belong to several clusters.
// Always returns absl::OkStatus().
inline absl::Status ComputeEdgeDensityOverlap(const GbbsGraph& graph,
  const InMemoryClusterer::Clustering& clustering, ClusteringStatistics* clustering_stats,
  const parlay::sequence<gbbs::uintE>& cluster_ids, const ClusteringStatsConfig& clustering_stats_config) {
  if (!clustering_stats_config.compute_edge_density_overlap()) {
    return absl::OkStatus();
  }

  const std::size_t n = graph.Graph()->n;
  const std::size_t num_clusters = clustering.size();

  // Scratch labels: before a cluster is measured, all of its members are
  // stamped with that cluster's index. The outer loop is sequential on purpose:
  // with overlapping clusters two clusters can write the same node, so running
  // clusters in parallel would be a data race.
  // NOTE(review): nodes that appear in no cluster keep the default label 0 --
  // confirm get_subgraph_num_edges does not treat them as members of cluster 0.
  parlay::sequence<gbbs::uintE> cluster_ids_overlap = parlay::sequence<gbbs::uintE>(n);
  for (std::size_t i = 0; i < num_clusters; ++i) {
    const auto& cluster = clustering[i];
    parlay::parallel_for(0, cluster.size(), [&](size_t j){
      cluster_ids_overlap[cluster[j]] = static_cast<gbbs::uintE>(i);
    });
  }

  std::vector<double> density(num_clusters, 0.0);
  if (num_clusters == 1) {
    // Shortcut: the single cluster is measured with the whole graph's edge and
    // node counts. Guard n < 2 to avoid dividing by zero.
    if (n > 1) {
      density[0] = (static_cast<double>(graph.Graph()->m)) / (static_cast<double>(n) * (n - 1));
    }
  } else {
    for (std::size_t i = 0; i < num_clusters; ++i) {
      const auto& cluster = clustering[i];
      if (cluster.size() < 2) {
        continue;  // no node pairs, density stays 0
      }
      // Re-stamp the members: an earlier cluster sharing nodes with this one
      // may have overwritten their labels.
      parlay::parallel_for(0, cluster.size(), [&](size_t j){
        cluster_ids_overlap[cluster[j]] = static_cast<gbbs::uintE>(i);
      });
      const std::size_t m_subgraph = get_subgraph_num_edges(graph, cluster, cluster_ids_overlap);
      const double m_total = static_cast<double>(cluster.size()) * static_cast<double>(cluster.size() - 1);
      density[i] = (static_cast<double>(m_subgraph)) / m_total;
    }
  }

  // Per-node accumulation. Clusters are processed one at a time so that a node
  // shared by several clusters is never updated concurrently; the inner loop
  // is parallel over one cluster's members (assumes a cluster lists each node
  // at most once).
  parlay::sequence<double> cluster_sum = parlay::sequence<double>(n, 0);
  parlay::sequence<gbbs::uintE> cluster_count = parlay::sequence<gbbs::uintE>(n, 0);
  for (std::size_t i = 0; i < num_clusters; ++i) {
    const auto& cluster = clustering[i];
    const double cluster_density = density[i];
    parlay::parallel_for(0, cluster.size(), [&](size_t j){
      cluster_sum[cluster[j]] += cluster_density;
      cluster_count[cluster[j]] += 1;
    });
  }

  double weighted_mean_overlap = 0;
  for (std::size_t v = 0; v < n; ++v) {
    if (cluster_count[v] != 0) {
      weighted_mean_overlap += cluster_sum[v] / cluster_count[v];
    }
  }
  if (n > 0) {
    weighted_mean_overlap = weighted_mean_overlap / n;
  }
  clustering_stats->set_weighted_edge_density_overlap_mean(weighted_mean_overlap);

  return absl::OkStatus();
}

// compute the triangle density of each cluster
// triangle density is the number of triangles divided by the number of wedges
// if no wedge, density is 0
Expand Down Expand Up @@ -129,6 +195,74 @@ inline absl::Status ComputeTriangleDensity(const GbbsGraph& graph,
return absl::OkStatus();
}

// Computes the node-averaged triangle density for a clustering whose clusters
// may overlap, and stores it in `weighted_triangle_density_overlap_mean`.
//
// For each cluster, density = (number of triangles) / (number of wedges) in
// the subgraph returned by get_subgraph for that cluster; if there is no
// wedge, density is 0. Each node then takes the mean density of the clusters
// that list it, and the final statistic is the sum of those per-node means
// divided by the number of nodes in the graph (nodes listed in no cluster
// contribute 0).
//
// Does nothing unless `compute_triangle_density_overlap` is set in the config.
// `cluster_ids` is not read here; labels are rebuilt per cluster because a
// node may belong to several clusters.
// Always returns absl::OkStatus().
inline absl::Status ComputeTriangleDensityOverlap(const GbbsGraph& graph,
  const InMemoryClusterer::Clustering& clustering, ClusteringStatistics* clustering_stats,
  const parlay::sequence<gbbs::uintE>& cluster_ids, const ClusteringStatsConfig& clustering_stats_config) {
  if (!clustering_stats_config.compute_triangle_density_overlap()) {
    return absl::OkStatus();
  }

  const std::size_t n = graph.Graph()->n;
  const std::size_t num_clusters = clustering.size();

  // Scratch labels: before a cluster is measured, all of its members are
  // stamped with that cluster's index. The outer loop is sequential on purpose:
  // with overlapping clusters two clusters can write the same node, so running
  // clusters in parallel would be a data race.
  // NOTE(review): nodes that appear in no cluster keep the default label 0 --
  // confirm get_subgraph does not treat them as members of cluster 0.
  parlay::sequence<gbbs::uintE> cluster_ids_overlap = parlay::sequence<gbbs::uintE>(n);
  for (std::size_t i = 0; i < num_clusters; ++i) {
    const auto& cluster = clustering[i];
    parlay::parallel_for(0, cluster.size(), [&](size_t j){
      cluster_ids_overlap[cluster[j]] = static_cast<gbbs::uintE>(i);
    });
  }

  // The triangle counter requires a per-triangle callback; only the count it
  // returns is used, so the callback does nothing.
  auto f = [&] (gbbs::uintE u, gbbs::uintE v, gbbs::uintE w) { };

  // Even when there is a single cluster the subgraph is still built, because
  // the input graph type does not match what the counting routines accept
  // (could not match 'symmetric_graph' against 'symmetric_ptr_graph').
  std::vector<double> density(num_clusters, 0.0);
  for (std::size_t i = 0; i < num_clusters; ++i) {
    const auto& cluster = clustering[i];
    // Re-stamp the members: an earlier cluster sharing nodes with this one
    // may have overwritten their labels.
    parlay::parallel_for(0, cluster.size(), [&](size_t j){
      cluster_ids_overlap[cluster[j]] = static_cast<gbbs::uintE>(i);
    });
    // Must use the unweighted graph, otherwise the result is wrong.
    auto G = get_subgraph<gbbs::empty>(graph, cluster, cluster_ids_overlap);
    const std::size_t num_wedges = get_num_wedges(&G);
    if (num_wedges == 0) {
      continue;  // no wedge, density stays 0
    }
    std::size_t num_tri = 0;
    if (G.num_edges() >= 3 && G.num_vertices() >= 3) {
      num_tri = gbbs::Triangle_degree_ordering(G, f);
    }
    density[i] = (static_cast<double>(num_tri)) / (static_cast<double>(num_wedges));
  }

  // Per-node accumulation. Clusters are processed one at a time so that a node
  // shared by several clusters is never updated concurrently; the inner loop
  // is parallel over one cluster's members (assumes a cluster lists each node
  // at most once).
  parlay::sequence<double> cluster_sum = parlay::sequence<double>(n, 0);
  parlay::sequence<gbbs::uintE> cluster_count = parlay::sequence<gbbs::uintE>(n, 0);
  for (std::size_t i = 0; i < num_clusters; ++i) {
    const auto& cluster = clustering[i];
    const double cluster_density = density[i];
    parlay::parallel_for(0, cluster.size(), [&](size_t j){
      cluster_sum[cluster[j]] += cluster_density;
      cluster_count[cluster[j]] += 1;
    });
  }

  double weighted_mean_overlap = 0;
  for (std::size_t v = 0; v < n; ++v) {
    if (cluster_count[v] != 0) {
      weighted_mean_overlap += cluster_sum[v] / cluster_count[v];
    }
  }
  if (n > 0) {
    weighted_mean_overlap = weighted_mean_overlap / n;
  }
  clustering_stats->set_weighted_triangle_density_overlap_mean(weighted_mean_overlap);

  return absl::OkStatus();
}


} // namespace research_graph::in_memory

Expand Down