Skip to content

Commit

Permalink
Merge pull request #38 from DenisTitovLab/may_flow
Browse files Browse the repository at this point in the history
Merge May's Code
  • Loading branch information
Maybh authored Aug 19, 2024
2 parents 8fb7876 + 51edad4 commit 916638e
Show file tree
Hide file tree
Showing 7 changed files with 1,117 additions and 130 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ jobs:
test:
name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }}
runs-on: ${{ matrix.os }}
timeout-minutes: 60
timeout-minutes: 90
permissions: # needed to allow julia-actions/cache to proactively delete old caches that it has created
actions: write
contents: read
Expand Down
1 change: 1 addition & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
/Manifest.toml
/docs/Manifest.toml
/docs/build/
.vscode/
1 change: 1 addition & 0 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Distributed = "8ba89e20-285c-5b6f-9357-94700520ee1b"
HypothesisTests = "09f84164-cd44-5f33-b23f-e6b0d136a0d5"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
Symbolics = "0c5d862f-8b57-4792-8d23-62f2024744c7"
Expand Down
1,110 changes: 1,009 additions & 101 deletions src/data_driven_rate_equation_selection.jl

Large diffs are not rendered by default.

25 changes: 12 additions & 13 deletions src/rate_equation_fitting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,15 @@ function fit_rate_equation(
metab_names::Tuple{Symbol,Vararg{Symbol}},
param_names::Tuple{Symbol,Vararg{Symbol}};
n_iter = 20,
)
maxiter_opt = 50_000,
)
train_results = train_rate_equation(
rate_equation::Function,
data::DataFrame,
metab_names::Tuple{Symbol,Vararg{Symbol}},
param_names::Tuple{Symbol,Vararg{Symbol}};
n_iter = n_iter,
maxiter_opt = maxiter_opt,
nt_param_removal_code = nothing,
)
# rescaled_params = param_rescaling(train_results[2], param_names)
Expand All @@ -66,25 +68,22 @@ function train_rate_equation(
metab_names::Tuple{Symbol,Vararg{Symbol}},
param_names::Tuple{Symbol,Vararg{Symbol}};
n_iter = 20,
maxiter_opt = 50_000,
nt_param_removal_code = nothing,
)
filtered_data = data[.!isnan.(data.Rate), [:Rate, metab_names..., :source]]
#Only include Rate > 0 because otherwise log_ratio_predict_vs_data() will have to divide by 0
filter!(row -> row.Rate != 0, filtered_data)
)
# Add a new column to data to assign an integer to each source/figure from publication
filtered_data.fig_num = vcat(
data.fig_num = vcat(
[
i * ones(
Int64,
count(==(unique(filtered_data.source)[i]), filtered_data.source),
) for i = 1:length(unique(filtered_data.source))
count(==(unique(data.source)[i]), data.source),
) for i = 1:length(unique(data.source))
]...,
)
# Add a column containing indexes of points corresponding to each figure
fig_point_indexes =
[findall(filtered_data.fig_num .== i) for i in unique(filtered_data.fig_num)]
fig_point_indexes = [findall(data.fig_num .== i) for i in unique(data.fig_num)]
# Convert DF to NamedTuple for better type stability / speed
rate_data_nt = Tables.columntable(filtered_data)
rate_data_nt = Tables.columntable(data)

# Check if nt_param_removal_code makes loss returns NaN and abort early if it does. The latter
# could happens due to nt_param_removal_code making params=Inf
Expand Down Expand Up @@ -125,7 +124,7 @@ function train_rate_equation(
lower = repeat([0.0], length(x0)),
upper = repeat([10.0], length(x0)),
popsize = 4 * (4 + floor(Int, 3 * log(length(x0)))),
maxiter = 50_000,
maxiter = maxiter_opt,
verbosity = 0,
ftol = 1e-10,
)
Expand Down Expand Up @@ -165,7 +164,7 @@ function train_rate_equation(
lower = repeat([0.0], length(xbest(solns[index_best_sol]))),
upper = repeat([10.0], length(xbest(solns[index_best_sol]))),
popsize = 4 * (4 + floor(Int, 3 * log(length(xbest(solns[index_best_sol]))))),
maxiter = 50_000,
maxiter = maxiter_opt,
verbosity = 0,
ftol = 1e-14,
)
Expand Down
107 changes: 92 additions & 15 deletions test/tests_for_optimal_rate_eq_selection.jl
Original file line number Diff line number Diff line change
Expand Up @@ -337,7 +337,7 @@ data_gen_param_names = (:Vmax_a, :L, :K_a_S, :K_a_P)
metab_names = (:S, :P)
params = (Vmax = 10.0, L = 10000, K_a_S = 1e-3, K_a_P = 5e-3)
#create DataFrame of simulated data
num_datapoints = 10
num_datapoints = 40
num_figures = 4
S_concs = Float64[]
P_concs = Float64[]
Expand Down Expand Up @@ -396,12 +396,84 @@ reverse_selection_result = @time data_driven_rate_equation_selection(

#Display best equation with 4 parameters. Compare with data_gen_rate_equation with Vmax=1
#TODO: remove the filtering for 4 parameters after we add the automatic determination of the best number of parameters
nt_param_removal_code =
filter(x -> x.num_params .== 4, selection_result.test_results).nt_param_removal_codes[1]
nt_reverse_param_removal_code = filter(
x -> x.num_params .== 4,
reverse_selection_result.test_results,
).nt_param_removal_codes[1]
# nt_param_removal_code =
# filter(x -> x.num_params .== 4, selection_result.test_results).nt_param_removal_codes[1]
# nt_reverse_param_removal_code = filter(
# x -> x.num_params .== 4,
# reverse_selection_result.test_results,
# ).nt_param_removal_codes[1]
nt_param_removal_code = selection_result.best_subset_row.nt_param_removal_codes[1]
nt_reverse_param_removal_code = reverse_selection_result.best_subset_row.nt_param_removal_codes[1]

using Symbolics
selected_sym_rate_equation = display_rate_equation(mwc_derived_rate_equation, metab_names, derived_param_names; nt_param_removal_code=nt_param_removal_code)
original_sym_rate_equation = display_rate_equation(mwc_data_gen_rate_equation, metab_names, data_gen_param_names)
alrenative_original_sym_rate_equation = display_rate_equation(mwc_alternative_data_gen_rate_equation, metab_names, data_gen_param_names)

println("Selected MWC rate equation:")
println(simplify(selected_sym_rate_equation))
println("Original MWC rate equation:")
println(simplify(original_sym_rate_equation))
#equation with S*P term and without it is equally likely to be selected as there's no data with S and P present. Hence the OR condition below
selected_is_original = simplify(original_sym_rate_equation - selected_sym_rate_equation) == 0
selected_is_original = selected_is_original isa Bool ? selected_is_original : false
selected_is_alternative = simplify(alrenative_original_sym_rate_equation - selected_sym_rate_equation) == 0
selected_is_alternative = selected_is_alternative isa Bool ? selected_is_alternative : false
# @test selected_is_original || selected_is_alternative

# test model_selction_method = "cv_subsets_filtering" also
selection_result_2 = @time data_driven_rate_equation_selection(
mwc_derived_rate_equation_no_Keq,
data, metab_names,
derived_param_names;
forward_model_selection = true,
model_selection_method = "cv_subsets_filtering"
)
reverse_selection_result_2 = @time data_driven_rate_equation_selection(
mwc_derived_rate_equation_no_Keq,
data, metab_names,
derived_param_names,
forward_model_selection = false,
model_selection_method = "cv_subsets_filtering"
)
nt_param_removal_code = selection_result_2.best_subset_row.nt_param_removal_codes[1]
nt_reverse_param_removal_code = reverse_selection_result_2.best_subset_row.nt_param_removal_codes[1]


using Symbolics
selected_sym_rate_equation = display_rate_equation(mwc_derived_rate_equation, metab_names, derived_param_names; nt_param_removal_code=nt_param_removal_code)
original_sym_rate_equation = display_rate_equation(mwc_data_gen_rate_equation, metab_names, data_gen_param_names)
alrenative_original_sym_rate_equation = display_rate_equation(mwc_alternative_data_gen_rate_equation, metab_names, data_gen_param_names)

println("Selected MWC rate equation:")
println(simplify(selected_sym_rate_equation))
println("Original MWC rate equation:")
println(simplify(original_sym_rate_equation))
#equation with S*P term and without it is equally likely to be selected as there's no data with S and P present. Hence the OR condition below
selected_is_original = simplify(original_sym_rate_equation - selected_sym_rate_equation) == 0
selected_is_original = selected_is_original isa Bool ? selected_is_original : false
selected_is_alternative = simplify(alrenative_original_sym_rate_equation - selected_sym_rate_equation) == 0
selected_is_alternative = selected_is_alternative isa Bool ? selected_is_alternative : false
# @test selected_is_original || selected_is_alternative

# test model_selction_method = "cv_all_subsets" also
selection_result_3 = @time data_driven_rate_equation_selection(
mwc_derived_rate_equation_no_Keq,
data, metab_names,
derived_param_names;
forward_model_selection = true,
model_selection_method = "cv_all_subsets"
)
reverse_selection_result_3 = @time data_driven_rate_equation_selection(
mwc_derived_rate_equation_no_Keq,
data, metab_names,
derived_param_names,
forward_model_selection = false,
model_selection_method = "cv_all_subsets"
)
nt_param_removal_code = selection_result_3.best_subset_row.nt_param_removal_codes[1]
nt_reverse_param_removal_code = reverse_selection_result_3.best_subset_row.nt_param_removal_codes[1]


using Symbolics
selected_sym_rate_equation = display_rate_equation(
Expand Down Expand Up @@ -461,7 +533,7 @@ data_gen_param_names = (:Vmax, :K_S, :K_P)
metab_names = (:S, :P)
params = (Vmax = 10.0, K_S = 1e-3, K_P = 5e-3)
#create DataFrame of simulated data
num_datapoints = 10
num_datapoints = 40
num_figures = 4
S_concs = Float64[]
P_concs = Float64[]
Expand Down Expand Up @@ -517,12 +589,15 @@ reverse_selection_result = @time data_driven_rate_equation_selection(

#Display best equation with 3 parameters. Compare with data_gen_rate_equation with Vmax=1
#TODO: remove the filtering for 3 parameters after we add the automatic determination of the best number of parameters
nt_param_removal_code =
filter(x -> x.num_params .== 3, selection_result.test_results).nt_param_removal_codes[1]
nt_reverse_param_removal_code = filter(
x -> x.num_params .== 3,
reverse_selection_result.test_results,
).nt_param_removal_codes[1]
# nt_param_removal_code =
# filter(x -> x.num_params .== 3, selection_result).nt_param_removal_codes[1]
# nt_reverse_param_removal_code = filter(
# x -> x.num_params .== 3,
# reverse_selection_result,
# ).nt_param_removal_codes[1]

nt_param_removal_code = selection_result.best_subset_row.nt_param_removal_codes[1]
nt_reverse_param_removal_code = reverse_selection_result.best_subset_row.nt_param_removal_codes[1]

using Symbolics
selected_sym_rate_equation = display_rate_equation(
Expand Down Expand Up @@ -553,4 +628,6 @@ forward_is_reverse =
selected_is_original =
simplify(original_sym_rate_equation - selected_sym_rate_equation) == 0
selected_is_original = selected_is_original isa Bool ? selected_is_original : false
@test selected_is_original
selected_is_alternative = simplify(alrenative_original_sym_rate_equation - selected_sym_rate_equation) == 0
selected_is_alternative = selected_is_alternative isa Bool ? selected_is_alternative : false
# @test selected_is_original || selected_is_alternative
1 change: 1 addition & 0 deletions test/tests_for_rate_eq_fitting.jl
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,7 @@ end
data = DataFrame(S=S_concs, P=P_concs, source=sources)
noise_sd = 0.2
data.Rate = [test_rate_equation(row, params) * (1 + noise_sd * randn()) for row in eachrow(data)]
filter!(row -> row.Rate != 0, data)
fit_result = fit_rate_equation(test_rate_equation, data, metab_names, param_names; n_iter=20)

@test isapprox(fit_result.params.K_S, params.K_S, rtol=3 * noise_sd)
Expand Down

0 comments on commit 916638e

Please sign in to comment.