diff --git a/workflow/scripts/postprocess_diffexp.py b/workflow/scripts/postprocess_diffexp.py
index 28d371d1..f1193076 100644
--- a/workflow/scripts/postprocess_diffexp.py
+++ b/workflow/scripts/postprocess_diffexp.py
@@ -20,28 +20,28 @@ def sort_columns(df, matching_columns):
     return df[other_columns + b_column_order]
 
 
-def sort_rows(df, primary_variable):
+def sort_rows(df):
     """Sort DataFrame by the absolute value of signed_p_value of primary variable in ascending order."""
-    print(df.columns)
-    df = df.reindex(
-        df['signed_pi_value_' + primary_variable + '+'].abs().sort_values(ascending=False).index)
-    return df
+    signed_pi_start = 'signed_pi_value_' + \
+        snakemake.params['model']['primary_variable']
+    columns_with_prefix = [
+        col for col in df.columns if col.startswith(signed_pi_start)]
 
+    if len(columns_with_prefix) != 1:
+        raise ValueError(
+            f"Expected exactly one column starting with '{signed_pi_start}', found {len(columns_with_prefix)}")
 
-# def sort_rows(df, first_b_val):
-#     """Sort by b_vals if b_val < 0 sort by lower interval limit else by upper limit"""
-#     df['sort_value'] = df.apply(lambda row: abs(
-#         row[f"{first_b_val}_lower"]) if row[first_b_val] < 0 else abs(row[f"{first_b_val}_upper"]), axis=1)
-#     df = df.sort_values(by='sort_value')
-#     df.drop(columns=["sort_value"], inplace=True)
-#     return df
+    signed_pi_col = columns_with_prefix[0]
+
+    df_sorted = df.reindex(
+        df[signed_pi_col].abs().sort_values(ascending=False).index)
+    return df_sorted
 
 
 df = pd.read_csv(snakemake.input[0], sep='\t')
 df, matching_columns = process_columns(df)
 df = sort_columns(df, matching_columns)
-# df = sort_rows(df, matching_columns[0])
-df = sort_rows(df, snakemake.params['model']['primary_variable'])
+df = sort_rows(df)
 df = df.dropna(subset=matching_columns, how='all')
 
 df.to_csv(snakemake.output[0], sep='\t', index=False)
diff --git a/workflow/scripts/postprocess_go_enrichment.py b/workflow/scripts/postprocess_go_enrichment.py
index acf40324..3060aad8 100644
--- a/workflow/scripts/postprocess_go_enrichment.py
+++ b/workflow/scripts/postprocess_go_enrichment.py
@@ -20,33 +20,8 @@ def sort_group(group):
 df_enr = pd.read_csv(snakemake.input["enrichment"], sep='\t')
 df_sig = pd.read_csv(snakemake.input["significant_terms"], sep='\t')
 
-# Only keep data if GO term exists in both tables
-# common_ids = df_sig[df_sig['GO'].isin(df_enr['GO'])]['GO']
-# df_enr_filtered = df_enr[df_enr['GO'].isin(common_ids)]
-# df_sig_filtered = df_sig[df_sig['GO'].isin(common_ids)]
-
-# # Add study items from significant terms to dataset
-# df_enr_filtered['study_items_sig_terms'] = df_enr_filtered['GO'].map(
-#     df_sig_filtered.set_index('GO')['study_items'])
-
-# # Sort and calculate enrichment ratios
-# df_enr_filtered_sorted = df_enr_filtered.groupby(
-#     'class', group_keys=False).apply(sort_group)
-
-# if not df_enr_filtered_sorted.empty:
-#     df_enr_filtered_sorted['enrichment'] = df_enr_filtered_sorted.apply(
-#         lambda row: calculate_enrichment(row['ratio_in_study'], row['ratio_in_pop']), axis=1)
-# else:
-#     df_enr_filtered_sorted['enrichment'] = None
-
-# # Save the result to a file
-# df_enr_filtered_sorted.to_csv(snakemake.output[0], sep='\t', index=False)
-
-
 # Merge the two dataframes on the 'GO' column to keep only common GO terms
 df_merged = df_sig.join(df_enr.set_index('GO'), on='GO', rsuffix='_enr')
-print(df_merged, df_merged.columns)
-
 # Add study items from significant terms to the merged dataset
 df_merged['study_items_sig_terms'] = df_merged['study_items']
 