Skip to content

Commit

Permalink
Add GLM coefficient output to Placement-Factorization
Browse files Browse the repository at this point in the history
  • Loading branch information
lczech committed Feb 20, 2024
1 parent 78705cb commit c5dd55f
Show file tree
Hide file tree
Showing 4 changed files with 211 additions and 37 deletions.
2 changes: 1 addition & 1 deletion CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ include( "${CMAKE_CURRENT_LIST_DIR}/tools/cmake/DownloadDependency.cmake" )
# These are replaced by tools/cmake/update_dependencies.sh to the hashes that are currently checked out.
# Thus, do not replace the hashes manually!
SET( CLI11_COMMIT_HASH "13becaddb657eacd090537719a669d66d393b8b2" ) #CLI11_COMMIT_HASH#
SET( genesis_COMMIT_HASH "9d65c7b1ee13d15034bfa0944830c3c4d8e5e723" ) #genesis_COMMIT_HASH#
SET( genesis_COMMIT_HASH "277292d4a4b2a081ef7ca4a8e813fdeb8b19e272" ) #genesis_COMMIT_HASH#
SET( sparsepp_COMMIT_HASH "6bfe3b4bdb364993e612d6bb729d680cf4c77649" ) #sparsepp_COMMIT_HASH#

# Call the github download function, which takes four arguments:
Expand Down
109 changes: 109 additions & 0 deletions scripts/plot-pf-glm-coeffs.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
#!/usr/bin/env Rscript

# gappa - Genesis Applications for Phylogenetic Placement Analysis
# Copyright (C) 2017-2024 Lucas Czech
#
# This program is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program. If not, see <http://www.gnu.org/licenses/>.
#
# Contact:
# Lucas Czech <[email protected]>
# Department of Plant Biology, Carnegie Institution For Science
# 260 Panama Street, Stanford, CA 94305, USA

# Load packages
library(tidyr)
library(ggplot2)
library(cowplot)
theme_set(theme_cowplot())

# Get input and output files from command line, or use defaults.
meta_file = "meta.csv"
balances_file = "factor_balances.csv"
coeffs_file = "factor_glm_coefficients.csv"
out_pref = "pf-glm-coeffs-"
args = commandArgs(trailingOnly=TRUE)
if (length(args) >= 1) {
meta_file = args[1]
}
if (length(args) >= 2) {
balances_file = args[2]
}
if (length(args) >= 3) {
coeffs_file = args[3]
}
if (length(args) <= 4) {
out_pref = args[4]
} else {
stop( "Usage: plot-pf-glm-coeffs.R [meta-data] [balances] [glm-coefficients] [out-prefix]")
}

print(paste("balances: ", balances_file))
print(paste("glm-coefficients: ", coeffs_file))
print(paste("meta-data: ", meta_file))
print(paste("out-prefix: ", out_pref))

Balances <- read.table(balances_file, sep="\t", header=TRUE)
Coefficients <- read.table(coeffs_file, sep="\t", header=TRUE)
Meta <- read.table(meta_file, sep="\t", header=TRUE)

# Match the Samples between Meta and Balances
# The Meta table might not have the first column named Samples,
# so we just take whatever is the first column here.
matched_samples <- match(Meta[, colnames(Meta)[1]], Balances$Sample)

# Loop through each row in the Coefficients table
for(i in 1:nrow(Coefficients)) {
print(paste("Factor", i))

# Extract the current row's coefficients and intercept
# Exclude Factor and Intercept columns
current_coefficients <- as.numeric(Coefficients[i, -c(1, 2)])
current_intercept <- Coefficients$Intercept[i]

# Compute the dot product for the current coefficients against all Meta rows
# Then add the current intercept
result <- as.matrix(Meta[, -1]) %*% current_coefficients + current_intercept

# Find the corresponding column in the Balances table
balance_column_name <- paste("Factor", i, sep = "_")
if(!(balance_column_name %in% names(Balances))) {
print(paste("Invalid column", balance_column_name))
next
}

# Create a data frame for plotting.
# Add the result with the corresponding balance value.
# Note: Ensure that the matched_samples indices are valid and there are no NAs
df <- data.frame(
Result = result,
Balance = Balances[matched_samples, balance_column_name]
)

# Generate the scatter plot.
p <- ggplot(df, aes(x = Balance, y = Result)) +
geom_point() +
# geom_smooth(method = "lm", color = "red", se = FALSE, formula="y~x", orientation = "y") +
labs(
title = balance_column_name,
x = "Balance Value",
y = "GLM Prediction"
) +
geom_abline(slope=1, intercept=0, color='#6666FF') +
coord_fixed() +
theme(legend.title=element_blank())

# Display the plot
# print(p)
ggsave(paste0(out_pref, i,".png"), width=12, height=8)
}
135 changes: 100 additions & 35 deletions src/commands/analyze/placement_factorization.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,9 @@
#include "genesis/utils/color/color.hpp"
#include "genesis/utils/color/list_sequential.hpp"

#include <cmath>
#include <fstream>
#include <limits>
#include <stdexcept>
#include <unordered_set>
#include <vector>
Expand Down Expand Up @@ -162,8 +164,17 @@ void setup_placement_factorization( CLI::App& app )
// Input Reading
// =================================================================================================

genesis::utils::Matrix<double> read_meta_data( PlacementFactorizationOptions const& options )
struct MetaMatrix
{
genesis::utils::Matrix<double> matrix;
std::vector<std::string> column_names;
std::vector<std::string> row_names;
};

MetaMatrix read_meta_data( PlacementFactorizationOptions const& options )
{
MetaMatrix result;

// Get the metadata.
// options.metadata_input.print();
auto df = options.metadata_input.read_string_dataframe();
Expand All @@ -180,18 +191,21 @@ genesis::utils::Matrix<double> read_meta_data( PlacementFactorizationOptions con
// Convert as needed for phylo factorization.
std::string report;
auto meta = glm_prepare_dataframe( df, report );
result.column_names = meta.col_names();
LOG_MSG1 << report;

// TODO use glm_convert_dataframe instead?!

// Copy the meta data in the correct sample order.
auto const jplace_count = options.jplace_input.file_count();
auto result = genesis::utils::Matrix<double>( jplace_count, meta.cols() );
result.matrix = genesis::utils::Matrix<double>( jplace_count, meta.cols() );
result.row_names.resize( jplace_count );
assert( meta.rows() == jplace_count );
for( size_t i = 0; i < jplace_count; ++i ) {
for( size_t c = 0; c < meta.cols(); ++c ) {
result( i, c ) = meta[ c ].as<double>()[ options.jplace_input.base_file_name(i) ];
result.matrix( i, c ) = meta[ c ].as<double>()[ options.jplace_input.base_file_name(i) ];
}
result.row_names[i] = options.jplace_input.base_file_name(i);
}

return result;
Expand Down Expand Up @@ -254,6 +268,23 @@ genesis::tree::BalanceData read_balance_data( PlacementFactorizationOptions cons
return mass_balance_data( mass_trees, settings );
}

struct GlmCoefficients
{
using Coefficients = std::vector<double>;
std::vector<Coefficients> edge_coefficients;
};

std::vector<GlmCoefficients> prepare_glm_coefficients(
PlacementFactorizationOptions const& options,
genesis::tree::BalanceData const& balances
) {
auto result = std::vector<GlmCoefficients>( options.factors.value() );
for( auto& factor : result ) {
factor.edge_coefficients.resize( balances.tree.edge_count() );
}
return result;
}

// =================================================================================================
// Output Writing
// =================================================================================================
Expand Down Expand Up @@ -337,8 +368,13 @@ void write_factor_taxa(
using namespace genesis::tree;
using namespace genesis::utils;

auto factor_taxa_of = options.file_output.get_output_target( "factor_taxa", "txt" );
auto write_taxa_list = [&]( std::unordered_set<size_t> indices ) {
// Open the file
auto factor_taxa_of = options.file_output.get_output_target( "factor_taxa", "csv" );

// Write a line to the file. Factor index, then taxon name, then indicator of which side
auto write_taxa_list = [&](
size_t factor, std::unordered_set<size_t> indices, std::string const& side
) {
std::unordered_set<std::string> edge_names;
for( auto const ei : indices ) {
auto const& ed = tree.edge_at( ei ).secondary_link().node().data<CommonNodeData>();
Expand All @@ -347,19 +383,15 @@ void write_factor_taxa(
}
}
for( auto const& en : edge_names ) {
(*factor_taxa_of) << en << "\n";
(*factor_taxa_of) << factor << "\t" << en << "\t" << side << "\n";
}
(*factor_taxa_of) << "\n";
};

// Write the table, for each factor, and for each side of it the
(*factor_taxa_of) << "Factor\tTaxon\tRootSide\n";
for( size_t i = 0; i < factors.size(); ++i ) {
auto const& factor = factors[i];

(*factor_taxa_of) << "Factor " << (i+1) << ", root side:\n";
write_taxa_list( factor.edge_indices_primary );

(*factor_taxa_of) << "Factor " << (i+1) << ", non-root side:\n";
write_taxa_list( factor.edge_indices_secondary );
write_taxa_list( i+1, factors[i].edge_indices_primary, "1" );
write_taxa_list( i+1, factors[i].edge_indices_secondary, "0" );
}
}

Expand Down Expand Up @@ -390,13 +422,44 @@ void write_balances_table(
);
}

void write_glm_coefficients(
PlacementFactorizationOptions const& options,
std::vector<genesis::tree::PhyloFactor> const& factors,
MetaMatrix const& meta,
std::vector<GlmCoefficients> const& glm_coeffs
) {
auto target = options.file_output.get_output_target( "factor_glm_coefficients", "csv" );

// Write the header. One column per meta
(*target) << "Factor\tIntercept";
for( auto const& col_name : meta.column_names ) {
(*target) << "\t" << col_name;
}
(*target) << "\n";

// Write the coefficients of the winning edge.
assert( glm_coeffs.size() == factors.size() );
for( size_t i = 0; i < factors.size(); ++i ) {
auto const winning_edge_idx = factors[i].edge_index;
assert( glm_coeffs[i].edge_coefficients.size() == factors[i].all_objective_values.size() );
auto const& edge_vals = glm_coeffs[i].edge_coefficients[winning_edge_idx];
(*target) << (i+1);
for( auto v : edge_vals ) {
(*target) << "\t" << v;
}
(*target) << "\n";
}
}

// =================================================================================================
// Run
// =================================================================================================

void run_placement_factorization( PlacementFactorizationOptions const& options )
{
using namespace genesis;
using namespace genesis::tree;
using namespace genesis::utils;

// -------------------------------------------------------------------------
// Preparations
Expand All @@ -411,8 +474,9 @@ void run_placement_factorization( PlacementFactorizationOptions const& options )
files_to_check.push_back({ "objective_values_" + std::to_string( i+1 ), e });
}
}
files_to_check.push_back({ "factor_taxa", "txt" });
files_to_check.push_back({ "factor_taxa", "csv" });
files_to_check.push_back({ "factor_balances", "csv" });
files_to_check.push_back({ "factor_glm_coefficients", "csv" });
options.file_output.check_output_files_nonexistence( files_to_check );

// Print some user output.
Expand All @@ -432,27 +496,27 @@ void run_placement_factorization( PlacementFactorizationOptions const& options )
// Calculations and Output
// -------------------------------------------------------------------------

auto const factors = tree::phylogenetic_factorization(
// We capture the GLM coefficients of all factors and edges. The outer vector has elements
// per factor (iteration), and the inner per edge index.
auto glm_coeffs = prepare_glm_coefficients( options, balances );

// Ruuuuuun!
auto const factors = phylogenetic_factorization(
balances,
[&]( std::vector<double> const& balances ){
auto const fit = glm_fit( meta, balances, utils::glm_family_gaussian() );

// TODO return nan in the follwing cases. but make sure that this works with
// the phylofactor implementation first!

// if( !fit.converged ) {
// LOG_DBG << "did not converge";
// }
// if( ! std::isfinite(fit.null_deviance) ) {
// LOG_DBG << "fit.null_deviance " << fit.null_deviance;
// }
// if( ! std::isfinite(fit.deviance) ) {
// LOG_DBG << "fit.deviance " << fit.deviance;
// }
// if( ! std::isfinite(fit.null_deviance) || ! std::isfinite(fit.deviance) ) {
// LOG_DBG << "data.meta_vals_mat " << join( data.meta_vals_mat );
// LOG_DBG << "balances " << join( balances );
// }
[&]( size_t iteration, size_t edge_index, std::vector<double> const& balances ){
auto const fit = glm_fit( meta.matrix, balances, glm_family_gaussian() );

// Store the coefficients computed from the fitting
assert( iteration < glm_coeffs.size() );
assert( edge_index < glm_coeffs[iteration].size() );
auto& coeff = glm_coeffs[iteration].edge_coefficients[edge_index];
coeff = glm_coefficients( meta.matrix, balances, fit );

// If something did not work in the GLM, we return a nan,
// so that this edge is not considered downstream.
if( !fit.converged || !std::isfinite(fit.null_deviance) || !std::isfinite(fit.deviance) ) {
return std::numeric_limits<double>::quiet_NaN();
}

return fit.null_deviance - fit.deviance;
},
Expand All @@ -467,4 +531,5 @@ void run_placement_factorization( PlacementFactorizationOptions const& options )
write_factor_objective_values( options, factors, balances.tree );
write_factor_taxa( options, factors, balances.tree );
write_balances_table( options, factors );
write_glm_coefficients( options, factors, meta, glm_coeffs );
}

0 comments on commit c5dd55f

Please sign in to comment.