From d41b94a72cb84cd2734a1246f38314cb4595b075 Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Thu, 4 Jul 2024 14:55:37 +0000 Subject: [PATCH 1/4] fix: postprocess for all diffexp levels, use proper table for constraining matrix, sort in the right order --- workflow/rules/datavzrd.smk | 26 +++++------------------- workflow/scripts/postprocess_diffexp.py | 2 +- workflow/scripts/postprocess_logcount.py | 8 ++++---- 3 files changed, 10 insertions(+), 26 deletions(-) diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk index 99ab3b9a..14a6b001 100644 --- a/workflow/rules/datavzrd.smk +++ b/workflow/rules/datavzrd.smk @@ -16,31 +16,15 @@ rule postprocess_go_enrichment: # Postprocessing Differential Expression Data rule postprocess_diffexp: input: - genes_representative="results/tables/diffexp/{model}.genes-representative.diffexp.tsv", + genes_representative="results/tables/diffexp/{model}.{level}.diffexp.tsv", output: - "results/tables/diffexp/{model}.genes-representative.diffexp_postprocessed.tsv", + "results/tables/diffexp/{model}.{level}.diffexp_postprocessed.tsv", conda: "../envs/pandas.yaml" params: model=get_model, log: - "logs/yte/postprocess_diffexp/{model}.log", - script: - "../scripts/postprocess_diffexp.py" - - -# Postprocessing Differential Expression Data -rule postprocess_transcripts: - input: - "results/tables/diffexp/{model}.transcripts.diffexp.tsv", - output: - "results/tables/diffexp/{model}.transcripts.diffexp_postprocessed.tsv", - conda: - "../envs/pandas.yaml" - params: - model=get_model, - log: - "logs/yte/postprocess_diffexp/{model}.log", + "logs/yte/postprocess_diffexp/{model}/{level}.log", script: "../scripts/postprocess_diffexp.py" @@ -49,7 +33,7 @@ rule postprocess_transcripts: rule postprocess_logcount_matrix: input: logcount="results/tables/logcount-matrix/{model}.logcount-matrix.tsv", - genes_representative="results/tables/diffexp/{model}.genes-representative.diffexp_postprocessed.tsv", + diffexp="results/tables/diffexp/{model}.transcripts.diffexp_postprocessed.tsv", output: "results/tables/logcount-matrix/{model}.logcount-matrix_postprocessed.tsv", conda: @@ -96,7 +80,7 @@ rule diffexp_datavzrd: # optional files required for rendering the given config logcount_matrix="results/tables/logcount-matrix/{model}.logcount-matrix_postprocessed.tsv", transcripts="results/tables/diffexp/{model}.transcripts.diffexp_postprocessed.tsv", - genes_aggregated="results/tables/diffexp/{model}.genes-aggregated.diffexp.tsv", + genes_aggregated="results/tables/diffexp/{model}.genes-aggregated.diffexp_postprocessed.tsv", genes_representative="results/tables/diffexp/{model}.genes-representative.diffexp_postprocessed.tsv", volcano_plots="results/plots/interactive/volcano/{model}.vl.json", output: diff --git a/workflow/scripts/postprocess_diffexp.py b/workflow/scripts/postprocess_diffexp.py index 208d9749..4feb58e7 100644 --- a/workflow/scripts/postprocess_diffexp.py +++ b/workflow/scripts/postprocess_diffexp.py @@ -23,7 +23,7 @@ def sort_columns(df, matching_columns): def sort_rows(df, primary_variable): """Sort DataFrame by the absolute value of signed_p_value of primary variable in ascending order.""" df = df.reindex( - df['signed_pi_value_' + primary_variable + '+'].abs().sort_values().index) + df['signed_pi_value_' + primary_variable + '+'].abs().sort_values(descending=True).index) return df diff --git a/workflow/scripts/postprocess_logcount.py b/workflow/scripts/postprocess_logcount.py index d2832971..3cf4ca92 100644 --- a/workflow/scripts/postprocess_logcount.py +++ b/workflow/scripts/postprocess_logcount.py @@ -3,12 +3,12 @@ # Read the TSV files logcount_matrix = pd.read_csv(snakemake.input['logcount'], sep='\t') -genes_representative = pd.read_csv( - snakemake.input['genes_representative'], sep='\t') +diffexp = pd.read_csv(snakemake.input['diffexp'], sep='\t') # Filter logcount_matrix to only include rows where 'transcript' is in 'target_id' of genes_representative -filtered_logcount_matrix = logcount_matrix[logcount_matrix['transcript'].isin( - genes_representative['target_id'])] +filtered_logcount_matrix = logcount_matrix[ + logcount_matrix['transcript'].isin(diffexp['target_id']) +] # Save the filtered dataframe to a new TSV file filtered_logcount_matrix.to_csv(snakemake.output[0], sep='\t', index=False) From 354aaacfb51970c7967fb44e6e99f9e0d19e4818 Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Thu, 4 Jul 2024 15:08:57 +0000 Subject: [PATCH 2/4] bump datavzrd --- workflow/rules/datavzrd.smk | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk index 14a6b001..e26fdeee 100644 --- a/workflow/rules/datavzrd.smk +++ b/workflow/rules/datavzrd.smk @@ -70,7 +70,7 @@ rule spia_datavzrd: offer_excel=lookup(within=config, dpath="report/offer_excel", default=False), pathway_db=config["enrichment"]["spia"]["pathway_database"], wrapper: - "v3.13.2/utils/datavzrd" + "v3.13.4/utils/datavzrd" # Generating Differential Expression Datavzrd Report @@ -100,7 +100,7 @@ rule diffexp_datavzrd: offer_excel=lookup(within=config, dpath="report/offer_excel", default=False), samples=get_model_samples, wrapper: - "v3.13.2/utils/datavzrd" + "v3.13.4/utils/datavzrd" # Generating GO Enrichment Datavzrd Report @@ -136,7 +136,7 @@ rule go_enrichment_datavzrd: offer_excel=lookup(within=config, dpath="report/offer_excel", default=False), samples=get_model_samples, wrapper: - "v3.13.2/utils/datavzrd" + "v3.13.4/utils/datavzrd" # Generating Meta Comparison Datavzrd Reports @@ -162,4 +162,4 @@ rule meta_compare_datavzrd: log: "logs/datavzrd-report/meta_comp_{method}.{meta_comp}.log", wrapper: - "v3.13.2/utils/datavzrd" + "v3.13.4/utils/datavzrd" From e0d4b7689fe875640e4de9c84b4b4d6922ac2e17 Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Fri, 5 Jul 2024 12:01:55 +0000 Subject: [PATCH 3/4] do not create pi-value sorting for genes-aggregated (no beta-values and no pi-value) --- workflow/rules/datavzrd.smk | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflow/rules/datavzrd.smk b/workflow/rules/datavzrd.smk index e26fdeee..6e039202 100644 --- a/workflow/rules/datavzrd.smk +++ b/workflow/rules/datavzrd.smk @@ -14,6 +14,7 @@ rule postprocess_go_enrichment: # Postprocessing Differential Expression Data +# Does not work for level = genes-aggregated since it does not contain beta values. rule postprocess_diffexp: input: genes_representative="results/tables/diffexp/{model}.{level}.diffexp.tsv", @@ -80,7 +81,7 @@ rule diffexp_datavzrd: # optional files required for rendering the given config logcount_matrix="results/tables/logcount-matrix/{model}.logcount-matrix_postprocessed.tsv", transcripts="results/tables/diffexp/{model}.transcripts.diffexp_postprocessed.tsv", - genes_aggregated="results/tables/diffexp/{model}.genes-aggregated.diffexp_postprocessed.tsv", + genes_aggregated="results/tables/diffexp/{model}.genes-aggregated.diffexp.tsv", genes_representative="results/tables/diffexp/{model}.genes-representative.diffexp_postprocessed.tsv", volcano_plots="results/plots/interactive/volcano/{model}.vl.json", output: From baac24e5b4494fc4b50bf7c54af187e491a7c653 Mon Sep 17 00:00:00 2001 From: Johannes Koester Date: Fri, 5 Jul 2024 12:07:24 +0000 Subject: [PATCH 4/4] fix arg --- workflow/scripts/postprocess_diffexp.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflow/scripts/postprocess_diffexp.py b/workflow/scripts/postprocess_diffexp.py index 4feb58e7..dc94cb18 100644 --- a/workflow/scripts/postprocess_diffexp.py +++ b/workflow/scripts/postprocess_diffexp.py @@ -23,7 +23,7 @@ def sort_columns(df, matching_columns): def sort_rows(df, primary_variable): """Sort DataFrame by the absolute value of signed_p_value of primary variable in ascending order.""" df = df.reindex( - df['signed_pi_value_' + primary_variable + '+'].abs().sort_values(descending=True).index) + df['signed_pi_value_' + primary_variable + '+'].abs().sort_values(ascending=False).index) return df