updated biogeme script

matsim-scenarios · Jul 9, 2024 · b8f2221 · b8f2221
1 parent c36120d
commit b8f2221
Show file tree

Hide file tree

Showing 2 changed files with 46 additions and 33 deletions.
diff --git a/src/main/python/choicemodels/estimate_biogeme_plan_choice.py b/src/main/python/choicemodels/estimate_biogeme_plan_choice.py
@@ -2,39 +2,31 @@
 # -*- coding: utf-8 -*-
 
 import argparse
+
 import biogeme.biogeme as bio
 import biogeme.database as db
 import biogeme.models as models
-import pandas as pd
 from biogeme.expressions import Beta, bioDraws, log, MonteCarlo
 
-from prepare import calc_plan_variables, tn_generator
+from prepare import read_plan_choices, tn_generator
+
+ESTIMATE = 0
+FIXED = 1
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Estimate choice model for daily trip usage")
-    parser.add_argument("--input", help="Path to the input file", type=str, default="../../../plan-choices.csv")
+    parser.add_argument("--input", help="Path to the input file", type=str,
+                        default="../../../../plan-choices-bestK_9-tt-only.csv")
 
     args = parser.parse_args()
 
-    df = pd.read_csv(args.input)
+    ds = read_plan_choices(args.input)
 
-    df.drop(columns=["person"], inplace=True)
-
-    # TODO: for testing only
-    df = df.sample(frac=0.2)
+    # Needs to be numeric
+    ds.df["choice"] = 1
 
     # Convert all the columns to numeric
-    df = df * 1
-
-    k = df.columns.str.extract(r"plan_(\d+)", expand=False).dropna().to_numpy(int).max()
-
-    print("Number of choices: ", len(df))
-    print("Number of plans per choice: ", k)
-
-    modes = list(df.columns.str.extract(r"_([a-zA-z]+)_usage", expand=False).dropna().unique())
-    print("Modes: ", modes)
-
-    df = calc_plan_variables(df, k, modes)
+    df = ds.df * 1
 
     database = db.Database("data/plan-choices", df)
     v = database.variables
@@ -45,37 +37,48 @@
 
     mixed_logit = True
 
+    # Variables for constants
     ASC = {}
-    for mode in modes:
+
+    # Variables for variation
+    SD = {}
+
+    # Factor on marginal utility of money
+    F_UTIl_MONEY = Beta('UTIL_MONEY', 1, 0, 1.5, ESTIMATE)
+
+    BETA_PERFORMING = Beta('BETA_PERFORMING', 6, 0, 10, ESTIMATE)
+
+    for mode in ds.modes:
         # Base asc
-        asc = Beta(f"ASC_{mode}", 0, None, None, 1 if mode in ("walk", "car") else 0)
+        asc = Beta(f"ASC_{mode}", 0, None, None, FIXED if mode in ("walk", "car") else ESTIMATE)
 
         # Pt does not have its own random parameter
         if True or mode == "walk" or mode == "pt":
             ASC[mode] = asc
         else:
             # The random parameter
-            asc_s = Beta(f"ASC_{mode}_s", 1, None, None, 0)
-            ASC[mode] = asc + asc_s * bioDraws(f"ASC_{mode}_RND", "NORMAL_ANTI")
+            SD[mode] = Beta(f"ASC_{mode}_s", 1, None, None, 0)
+            ASC[mode] = asc + SD[mode] * bioDraws(f"{mode}_RND", "NORMAL_ANTI")
 
     B_UTIL = Beta('B_CAR_UTIL', 10, 0, 20, 0)
     B_UTIL_S = Beta('B_CAR_UTIL_SD', 1, 0, 10, 0)
 
-    B_TIME_RND = B_UTIL + B_UTIL_S * bioDraws('B_CAR_UTIL_RND', 'TN')
+    B_CAR_RND = B_UTIL + B_UTIL_S * bioDraws('B_CAR_UTIL_RND', 'TN')
 
     U = {}
     AV = {}
 
-    for i in range(1, k + 1):
-        u = v[f"plan_{i}_car_fixed_cost"]
-        u += v[f"plan_{i}_costs"]
+    for i in range(1, ds.k + 1):
+        u = v[f"plan_{i}_price"] * (ds.global_income / v["income"]) ** F_UTIl_MONEY
         u -= v[f"plan_{i}_pt_n_switches"]
 
-        for mode in modes:
+        for mode in ds.modes:
             u += ASC[mode] * v[f"plan_{i}_{mode}_usage"]
 
+            u += -BETA_PERFORMING * v[f"plan_{i}_{mode}_hours"] * (2 if mode == "ride" else 1)
+
         if mixed_logit:
-            u += v[f"plan_{i}_car_used"] * B_TIME_RND
+            u += v[f"plan_{i}_car_used"] * B_CAR_RND
 
         U[i] = u
         AV[i] = v[f"plan_{i}_valid"]
@@ -88,9 +91,15 @@
 
     biogeme = bio.BIOGEME(database, logprob)
 
-    biogeme.calculateNullLoglikelihood(AV)
     biogeme.modelName = "plan_choice"
+    biogeme.weight = v["weight"]
     biogeme.generate_pickle = False
+    biogeme.save_iterations = False
+    biogeme.number_of_draws = 10000
+    biogeme.second_derivatives = 0.1
+    biogeme.seed_param = 42
+
+    biogeme.calculateNullLoglikelihood(AV)
 
     results = biogeme.estimate()
     print(results.short_summary())

diff --git a/src/main/python/choicemodels/prepare.py b/src/main/python/choicemodels/prepare.py
@@ -13,7 +13,7 @@
 
 TN = truncnorm(0, np.inf)
 
-PlanChoice = namedtuple("PlanChoice", ["df", "modes", "varying", "k"])
+PlanChoice = namedtuple("PlanChoice", ["df", "modes", "varying", "k", "global_income"])
 
 
 def read_plan_choices(input_file: str, sample: float = 1, seed: int = 42) -> PlanChoice:
@@ -42,11 +42,15 @@ def read_plan_choices(input_file: str, sample: float = 1, seed: int = 42) -> Pla
     df_wide['custom_id'] = np.arange(len(df_wide))  # Add unique identifier
     df_wide['choice'] = df_wide['choice'].map({1: "plan_1"})
 
-    df_wide = calc_plan_variables(df_wide, k, modes)
+    df_wide = calc_plan_variables(df_wide, k, modes, True)
 
     varying = list(df_wide.columns.str.extract(r"plan_1_([a-zA-z_]+)", expand=False).dropna().unique())
 
-    return PlanChoice(df_wide, modes, varying, k)
+    with open(input_file) as f:
+        _, _, income = f.readline().rpartition(":")
+        global_income = float(income.strip())
+
+    return PlanChoice(df_wide, modes, varying, k, global_income)
 
 
 def tn_generator(sample_size: int, number_of_draws: int) -> np.ndarray: