refactored choice models

matsim-scenarios · Jul 9, 2024 · c36120d · c36120d
1 parent fe7b6f1
commit c36120d
Show file tree

Hide file tree

Showing 7 changed files with 168 additions and 107 deletions.
diff --git a/src/main/java/org/matsim/prepare/choices/ComputePlanChoices.java b/src/main/java/org/matsim/prepare/choices/ComputePlanChoices.java
@@ -53,22 +53,34 @@ public class ComputePlanChoices implements MATSimAppCommand, PersonAlgorithm {
 	 */
 	private final Queue<List<Object>> rows = new ConcurrentLinkedQueue<>();
 	private final MainModeIdentifier mmi = new DefaultAnalysisMainModeIdentifier();
+
 	@CommandLine.Mixin
 	private ScenarioOptions scenario;
+
 	@CommandLine.Option(names = "--top-k", description = "Use top k estimates", defaultValue = "9")
 	private int topK;
+
 	@CommandLine.Option(names = "--modes", description = "Modes to include in estimation", split = ",")
 	private Set<String> modes;
+
 	@CommandLine.Option(names = "--id-filter", description = "Filter for person ids")
 	private Pattern idFilter;
+
 	@CommandLine.Option(names = "--time-util-only", description = "Reset scoring for estimation and only use time utility", defaultValue = "false")
 	private boolean timeUtil;
+
 	@CommandLine.Option(names = "--calc-scores", description = "Perform pseudo scoring for each plan", defaultValue = "false")
 	private boolean calcScores;
-	@CommandLine.Option(names = "--plan-candidates", description = "Method to generate plan candidates", defaultValue = "diverse")
+
+	@CommandLine.Option(names = "--plan-candidates", description = "Method to generate plan candidates", defaultValue = "bestK")
 	private PlanCandidates planCandidates = PlanCandidates.bestK;
+
+	@CommandLine.Option(names = "--max-plan-length", description = "Maximum plan length", defaultValue = "7")
+	private int maxPlanLength = 7;
+
 	@CommandLine.Option(names = "--output", description = "Path to output csv.", defaultValue = "plan-choices.csv")
 	private Path output;
+
 	private ThreadLocal<Ctx> thread;
 	private ProgressBar pb;
 	private double globalAvgIncome;
@@ -169,6 +181,10 @@ public Integer call() throws Exception {
 
 		String out = output.toString().replace(".csv", "-%s_%d.csv".formatted(planCandidates, topK));
 
+		if (timeUtil && planCandidates == PlanCandidates.bestK) {
+			out = out.replace(".csv", "-tt-only.csv");
+		}
+
 		log.info("Writing {} choices to {}", rows.size(), out);
 
 		try (CSVPrinter csv = new CSVPrinter(Files.newBufferedWriter(Path.of(out)), CSVFormat.DEFAULT.builder().setCommentMarker('#').build())) {
@@ -223,6 +239,11 @@ public void run(Person person) {
 		Plan plan = person.getSelectedPlan();
 		PlanModel model = PlanModel.newInstance(plan);
 
+		if (model.trips() > maxPlanLength) {
+			pb.step();
+			return;
+		}
+
 		String refModes = (String) person.getAttributes().getAttribute(TripAnalysis.ATTR_REF_MODES);
 		String[] split = refModes.strip().split("-");
 		String[] currentModes = model.getCurrentModesMutable();

diff --git a/src/main/python/estimate_plan_choice.py → ...icemodels/estimate_biogeme_plan_choice.py b/src/main/python/estimate_plan_choice.py → ...icemodels/estimate_biogeme_plan_choice.py
@@ -5,75 +5,10 @@
 import biogeme.biogeme as bio
 import biogeme.database as db
 import biogeme.models as models
-import numpy as np
 import pandas as pd
 from biogeme.expressions import Beta, bioDraws, log, MonteCarlo
-from collections import defaultdict
-from scipy.stats import truncnorm
-
-TN = truncnorm(0, np.inf)
-
-
-def tn_generator(sample_size: int, number_of_draws: int) -> np.ndarray:
-    """
-    User-defined random number generator to the database.
-    See the numpy.random documentation to obtain a list of other distributions.
-    """
-    return TN.rvs((sample_size, number_of_draws))
-
-
-def calc_costs(df, k, modes):
-
-    # Cost parameter from current berlin model
-    daily_costs = defaultdict(lambda: 0.0, car=-14.30, pt=-3)
-    km_costs = defaultdict(lambda: 0.0, car=-0.149, ride=-0.149)
-    util_performing = -6.88
-
-    # Normalize activity utilities to be near zero
-    # columns = [f"plan_{i}_act_util" for i in range(1, k + 1)]
-    # for t in df.itertuples():
-    #     utils = df.loc[t.Index, columns]
-    #     df.loc[t.Index, columns] -= utils.max()
-
-    # Marginal utility of money as factor
-    util_money = df.util_money
-
-    for i in range(1, k + 1):
-
-        # Price is only monetary costs
-        df[f"plan_{i}_price"] = 0
-
-        # Costs will also include time costs
-        df[f"plan_{i}_utils"] = 0
-
-        df[f"plan_{i}_tt_hours"] = 0
-
-        for mode in modes:
-
-            fixed_costs = (df[f"plan_{i}_{mode}_usage"] > 0) * daily_costs[mode]
-            distance_costs = df[f"plan_{i}_{mode}_km"] * km_costs[mode]
-
-            df[f"plan_{i}_{mode}_fixed_cost"] = fixed_costs
-            df[f"plan_{i}_price"] += fixed_costs + distance_costs
-
-            df[f"plan_{i}_{mode}_used"] = (df[f"plan_{i}_{mode}_usage"] > 0) * 1
-            df[f"plan_{i}_tt_hours"] += df[f"plan_{i}_{mode}_hours"]
-
-            # Add configured time costs
-            df[f"plan_{i}_utils"] += (fixed_costs + distance_costs) * util_money
-
-            # Add time costs the overall costs
-            df[f"plan_{i}_utils"] += util_performing * df[f"plan_{i}_{mode}_hours"]
-
-            # Add additional ride time utils for the driver
-            if mode == "ride":
-                df[f"plan_{i}_utils"] += util_performing * df[f"plan_{i}_{mode}_hours"]
-
-        # Defragment df
-        df = df.copy()
-
-    return df
 
+from prepare import calc_plan_variables, tn_generator
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Estimate choice model for daily trip usage")
@@ -99,7 +34,7 @@ def calc_costs(df, k, modes):
     modes = list(df.columns.str.extract(r"_([a-zA-z]+)_usage", expand=False).dropna().unique())
     print("Modes: ", modes)
 
-    df = calc_costs(df, k, modes)
+    df = calc_plan_variables(df, k, modes)
 
     database = db.Database("data/plan-choices", df)
     v = database.variables

diff --git a/...main/python/estimate_mixed_plan_choice.py → ...hoicemodels/estimate_mixed_plan_choice.py b/...main/python/estimate_mixed_plan_choice.py → ...hoicemodels/estimate_mixed_plan_choice.py
@@ -2,62 +2,34 @@
 # -*- coding: utf-8 -*-
 
 import argparse
-import numpy as np
-import pandas as pd
+
 from xlogit import MixedLogit, MultinomialLogit
 from xlogit.utils import wide_to_long
 
-from estimate_plan_choice import calc_costs
+from prepare import read_plan_choices
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description="Estimate the plan choice mixed logit model")
-    parser.add_argument("--input", help="Path to the input file", type=str, default="../../../plan-choices.csv")
+    parser.add_argument("--input", help="Path to the input file", type=str,
+                        default="../../../plan-choices-bestK_9-tt-only.csv")
     parser.add_argument("--n-draws", help="Number of draws for the estimation", type=int, default=1500)
     parser.add_argument("--batch-size", help="Batch size for the estimation", type=int, default=None)
-    parser.add_argument("--sample", help="Use sample of choice data", type=float, default=0.2)
+    parser.add_argument("--sample", help="Use sample of choice data", type=float, default=1)
     parser.add_argument("--seed", help="Random seed", type=int, default=0)
     parser.add_argument("--mnl", help="Use MNL instead of mixed logit", action="store_true")
 
     args = parser.parse_args()
 
-    df_wide = pd.read_csv(args.input, comment="#")
-
-    modes = list(df_wide.columns.str.extract(r"_([a-zA-z]+)_usage", expand=False).dropna().unique())
-    print("Modes: ", modes)
-
-    k = df_wide.columns.str.extract(r"plan_(\d+)", expand=False).dropna().to_numpy(int).max()
-    print("Number of plans: ", len(df_wide))
-    print("Number of choices for plan: ", k)
-
-    # df_wide["p_id"] = df_wide["p_id"].str.replace(r"_\d+$", "", regex=True)
-    # df_wide["person"] = df_wide["person"].astype('category').cat.codes
-
-    # sample = set(df_wide.person.sample(frac=0.2))
-    # df_wide = df_wide[df_wide.person.isin(sample)]
-    if args.sample < 1:
-        df_wide = df_wide.sample(frac=args.sample, random_state=args.seed)
-
-    print("Modes:", modes)
-    print("Number of choices:", len(df_wide))
+    ds = read_plan_choices(args.input, sample=args.sample, seed=args.seed)
 
-    df_wide['custom_id'] = np.arange(len(df_wide))  # Add unique identifier
-    df_wide['choice'] = df_wide['choice'].map({1: "plan_1"})
-
-    df_wide = calc_costs(df_wide, k, modes)
-
-    varying = list(df_wide.columns.str.extract(r"plan_1_([a-zA-z_]+)", expand=False).dropna().unique())
-
-    print("Varying:", varying)
-
-    df = wide_to_long(df_wide, id_col='custom_id', alt_name='alt', sep='_',
-                      alt_list=[f"plan_{i}" for i in range(1, k + 1)], empty_val=0,
-                      varying=varying, alt_is_prefix=True)
+    df = wide_to_long(ds.df, id_col='custom_id', alt_name='alt', sep='_',
+                      alt_list=[f"plan_{i}" for i in range(1, ds.k + 1)], empty_val=0,
+                      varying=ds.varying, alt_is_prefix=True)
 
     MixedLogit.check_if_gpu_available()
 
-
     # ASC is present as mode_usage
-    varnames = [f"{mode}_usage" for mode in modes if mode != "walk" and mode != "car"] + ["car_used"]
+    varnames = [f"{mode}_usage" for mode in ds.modes if mode != "walk" and mode != "car"] + ["car_used"]
     # varnames += ["pt_ride_hours", "car_ride_hours", "bike_ride_hours"]
     # varnames = ["car_used", "car_usage"]
 
@@ -77,7 +49,7 @@
                   optim_method='L-BFGS-B')
 
     else:
-        #varnames += ["car_usage"]
+        # varnames += ["car_usage"]
 
         model = MultinomialLogit()
         model.fit(X=df[varnames], y=df['choice'], weights=df['weight'], varnames=varnames,

diff --git a/...main/python/estimate_mixed_trip_choice.py → ...hoicemodels/estimate_mixed_trip_choice.py b/...main/python/estimate_mixed_trip_choice.py → ...hoicemodels/estimate_mixed_trip_choice.py
diff --git a/src/main/python/estimate_trip_choice.py → ...thon/choicemodels/estimate_trip_choice.py b/src/main/python/estimate_trip_choice.py → ...thon/choicemodels/estimate_trip_choice.py
diff --git a/src/main/python/choicemodels/prepare.py b/src/main/python/choicemodels/prepare.py
@@ -0,0 +1,108 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+from collections import namedtuple, defaultdict
+
+import numpy as np
+import pandas as pd
+from scipy.stats import truncnorm
+
+# Cost parameter from current berlin model
+daily_costs = defaultdict(lambda: 0.0, car=-14.30, pt=-3)
+km_costs = defaultdict(lambda: 0.0, car=-0.149, ride=-0.149)
+
+TN = truncnorm(0, np.inf)
+
+PlanChoice = namedtuple("PlanChoice", ["df", "modes", "varying", "k"])
+
+
+def read_plan_choices(input_file: str, sample: float = 1, seed: int = 42) -> PlanChoice:
+    """ Read plan choices from input file """
+
+    df_wide = pd.read_csv(input_file, comment="#")
+
+    modes = list(df_wide.columns.str.extract(r"_([a-zA-z]+)_usage", expand=False).dropna().unique())
+    print("Modes: ", modes)
+
+    k = df_wide.columns.str.extract(r"plan_(\d+)", expand=False).dropna().to_numpy(int).max()
+    print("Number of plans: ", len(df_wide))
+    print("Number of choices for plan: ", k)
+
+    # df_wide["p_id"] = df_wide["p_id"].str.replace(r"_\d+$", "", regex=True)
+    # df_wide["person"] = df_wide["person"].astype('category').cat.codes
+
+    # sample = set(df_wide.person.sample(frac=0.2))
+    # df_wide = df_wide[df_wide.person.isin(sample)]
+    if sample < 1:
+        df_wide = df_wide.sample(frac=sample, random_state=seed)
+
+    print("Modes:", modes)
+    print("Number of choices:", len(df_wide))
+
+    df_wide['custom_id'] = np.arange(len(df_wide))  # Add unique identifier
+    df_wide['choice'] = df_wide['choice'].map({1: "plan_1"})
+
+    df_wide = calc_plan_variables(df_wide, k, modes)
+
+    varying = list(df_wide.columns.str.extract(r"plan_1_([a-zA-z_]+)", expand=False).dropna().unique())
+
+    return PlanChoice(df_wide, modes, varying, k)
+
+
+def tn_generator(sample_size: int, number_of_draws: int) -> np.ndarray:
+    """
+    User-defined random number generator to the database.
+    See the numpy.random documentation to obtain a list of other distributions.
+    """
+    return TN.rvs((sample_size, number_of_draws))
+
+
+def calc_plan_variables(df, k, modes, use_util_money=False):
+    """ Calculate utility and costs variables for all alternatives in the dataframe"""
+
+    util_performing = -6.88
+
+    # Normalize activity utilities to be near zero
+    # columns = [f"plan_{i}_act_util" for i in range(1, k + 1)]
+    # for t in df.itertuples():
+    #     utils = df.loc[t.Index, columns]
+    #     df.loc[t.Index, columns] -= utils.max()
+
+    # Marginal utility of money as factor
+    util_money = df.util_money if use_util_money else 1
+
+    for i in range(1, k + 1):
+
+        # Price is only monetary costs
+        df[f"plan_{i}_price"] = 0
+
+        # Costs will also include time costs
+        df[f"plan_{i}_utils"] = 0
+
+        df[f"plan_{i}_tt_hours"] = 0
+
+        for mode in modes:
+
+            fixed_costs = (df[f"plan_{i}_{mode}_usage"] > 0) * daily_costs[mode]
+            distance_costs = df[f"plan_{i}_{mode}_km"] * km_costs[mode]
+
+            df[f"plan_{i}_{mode}_fixed_cost"] = fixed_costs
+            df[f"plan_{i}_price"] += fixed_costs + distance_costs
+
+            df[f"plan_{i}_{mode}_used"] = (df[f"plan_{i}_{mode}_usage"] > 0) * 1
+            df[f"plan_{i}_tt_hours"] += df[f"plan_{i}_{mode}_hours"]
+
+            # Add configured time costs
+            df[f"plan_{i}_utils"] += (fixed_costs + distance_costs) * util_money
+
+            # Add time costs the overall costs
+            df[f"plan_{i}_utils"] += util_performing * df[f"plan_{i}_{mode}_hours"]
+
+            # Add additional ride time utils for the driver
+            if mode == "ride":
+                df[f"plan_{i}_utils"] += util_performing * df[f"plan_{i}_{mode}_hours"]
+
+        # Defragment df
+        df = df.copy()
+
+    return df
diff --git a/src/main/sh/runPlanChoices.sh b/src/main/sh/runPlanChoices.sh
@@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+
+CONFIG="./input/v6.3/berlin-v6.3.config.xml"
+JVM_ARGS="-Xmx22G -Xms22G -XX:+AlwaysPreTouch -XX:+UseParallelGC"
+
+run_eval() {
+    echo "Running evaluation with $1"
+    java $JVM_ARGS -cp matsim-berlin-*.jar org.matsim.prepare.choices.ComputePlanChoices --config $CONFIG\
+      --scenario org.matsim.run.OpenBerlinScenario\
+      --args --10pct\
+      --modes walk,pt,car,bike,ride\
+      $1
+}
+
+
+run_eval "--plan-candidates bestK --top-k 3"
+run_eval "--plan-candidates bestK --top-k 5"
+run_eval "--plan-candidates bestK --top-k 9"
+run_eval "--plan-candidates bestK --top-k 3 --time-util-only"
+run_eval "--plan-candidates bestK --top-k 5 --time-util-only"
+run_eval "--plan-candidates bestK --top-k 9 --time-util-only"
+run_eval "--plan-candidates random --top-k 3"
+run_eval "--plan-candidates random --top-k 5"
+run_eval "--plan-candidates random --top-k 9"
+run_eval "--plan-candidates diverse --top-k 9"