Skip to content

Commit

Permalink
refactored choice models
Browse files Browse the repository at this point in the history
  • Loading branch information
rakow committed Jul 9, 2024
1 parent fe7b6f1 commit c36120d
Show file tree
Hide file tree
Showing 7 changed files with 168 additions and 107 deletions.
23 changes: 22 additions & 1 deletion src/main/java/org/matsim/prepare/choices/ComputePlanChoices.java
Original file line number Diff line number Diff line change
Expand Up @@ -53,22 +53,34 @@ public class ComputePlanChoices implements MATSimAppCommand, PersonAlgorithm {
*/
private final Queue<List<Object>> rows = new ConcurrentLinkedQueue<>();
private final MainModeIdentifier mmi = new DefaultAnalysisMainModeIdentifier();

@CommandLine.Mixin
private ScenarioOptions scenario;

@CommandLine.Option(names = "--top-k", description = "Use top k estimates", defaultValue = "9")
private int topK;

@CommandLine.Option(names = "--modes", description = "Modes to include in estimation", split = ",")
private Set<String> modes;

@CommandLine.Option(names = "--id-filter", description = "Filter for person ids")
private Pattern idFilter;

@CommandLine.Option(names = "--time-util-only", description = "Reset scoring for estimation and only use time utility", defaultValue = "false")
private boolean timeUtil;

@CommandLine.Option(names = "--calc-scores", description = "Perform pseudo scoring for each plan", defaultValue = "false")
private boolean calcScores;
@CommandLine.Option(names = "--plan-candidates", description = "Method to generate plan candidates", defaultValue = "diverse")

@CommandLine.Option(names = "--plan-candidates", description = "Method to generate plan candidates", defaultValue = "bestK")
private PlanCandidates planCandidates = PlanCandidates.bestK;

@CommandLine.Option(names = "--max-plan-length", description = "Maximum plan length", defaultValue = "7")
private int maxPlanLength = 7;

@CommandLine.Option(names = "--output", description = "Path to output csv.", defaultValue = "plan-choices.csv")
private Path output;

private ThreadLocal<Ctx> thread;
private ProgressBar pb;
private double globalAvgIncome;
Expand Down Expand Up @@ -169,6 +181,10 @@ public Integer call() throws Exception {

String out = output.toString().replace(".csv", "-%s_%d.csv".formatted(planCandidates, topK));

if (timeUtil && planCandidates == PlanCandidates.bestK) {
out = out.replace(".csv", "-tt-only.csv");
}

log.info("Writing {} choices to {}", rows.size(), out);

try (CSVPrinter csv = new CSVPrinter(Files.newBufferedWriter(Path.of(out)), CSVFormat.DEFAULT.builder().setCommentMarker('#').build())) {
Expand Down Expand Up @@ -223,6 +239,11 @@ public void run(Person person) {
Plan plan = person.getSelectedPlan();
PlanModel model = PlanModel.newInstance(plan);

if (model.trips() > maxPlanLength) {
pb.step();
return;
}

String refModes = (String) person.getAttributes().getAttribute(TripAnalysis.ATTR_REF_MODES);
String[] split = refModes.strip().split("-");
String[] currentModes = model.getCurrentModesMutable();
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -5,75 +5,10 @@
import biogeme.biogeme as bio
import biogeme.database as db
import biogeme.models as models
import numpy as np
import pandas as pd
from biogeme.expressions import Beta, bioDraws, log, MonteCarlo
from collections import defaultdict
from scipy.stats import truncnorm

TN = truncnorm(0, np.inf)


def tn_generator(sample_size: int, number_of_draws: int) -> np.ndarray:
"""
User-defined random number generator to the database.
See the numpy.random documentation to obtain a list of other distributions.
"""
return TN.rvs((sample_size, number_of_draws))


def calc_costs(df, k, modes):

# Cost parameter from current berlin model
daily_costs = defaultdict(lambda: 0.0, car=-14.30, pt=-3)
km_costs = defaultdict(lambda: 0.0, car=-0.149, ride=-0.149)
util_performing = -6.88

# Normalize activity utilities to be near zero
# columns = [f"plan_{i}_act_util" for i in range(1, k + 1)]
# for t in df.itertuples():
# utils = df.loc[t.Index, columns]
# df.loc[t.Index, columns] -= utils.max()

# Marginal utility of money as factor
util_money = df.util_money

for i in range(1, k + 1):

# Price is only monetary costs
df[f"plan_{i}_price"] = 0

# Costs will also include time costs
df[f"plan_{i}_utils"] = 0

df[f"plan_{i}_tt_hours"] = 0

for mode in modes:

fixed_costs = (df[f"plan_{i}_{mode}_usage"] > 0) * daily_costs[mode]
distance_costs = df[f"plan_{i}_{mode}_km"] * km_costs[mode]

df[f"plan_{i}_{mode}_fixed_cost"] = fixed_costs
df[f"plan_{i}_price"] += fixed_costs + distance_costs

df[f"plan_{i}_{mode}_used"] = (df[f"plan_{i}_{mode}_usage"] > 0) * 1
df[f"plan_{i}_tt_hours"] += df[f"plan_{i}_{mode}_hours"]

# Add configured time costs
df[f"plan_{i}_utils"] += (fixed_costs + distance_costs) * util_money

# Add time costs the overall costs
df[f"plan_{i}_utils"] += util_performing * df[f"plan_{i}_{mode}_hours"]

# Add additional ride time utils for the driver
if mode == "ride":
df[f"plan_{i}_utils"] += util_performing * df[f"plan_{i}_{mode}_hours"]

# Defragment df
df = df.copy()

return df

from prepare import calc_plan_variables, tn_generator

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Estimate choice model for daily trip usage")
Expand All @@ -99,7 +34,7 @@ def calc_costs(df, k, modes):
modes = list(df.columns.str.extract(r"_([a-zA-z]+)_usage", expand=False).dropna().unique())
print("Modes: ", modes)

df = calc_costs(df, k, modes)
df = calc_plan_variables(df, k, modes)

database = db.Database("data/plan-choices", df)
v = database.variables
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -2,62 +2,34 @@
# -*- coding: utf-8 -*-

import argparse
import numpy as np
import pandas as pd

from xlogit import MixedLogit, MultinomialLogit
from xlogit.utils import wide_to_long

from estimate_plan_choice import calc_costs
from prepare import read_plan_choices

if __name__ == "__main__":
parser = argparse.ArgumentParser(description="Estimate the plan choice mixed logit model")
parser.add_argument("--input", help="Path to the input file", type=str, default="../../../plan-choices.csv")
parser.add_argument("--input", help="Path to the input file", type=str,
default="../../../plan-choices-bestK_9-tt-only.csv")
parser.add_argument("--n-draws", help="Number of draws for the estimation", type=int, default=1500)
parser.add_argument("--batch-size", help="Batch size for the estimation", type=int, default=None)
parser.add_argument("--sample", help="Use sample of choice data", type=float, default=0.2)
parser.add_argument("--sample", help="Use sample of choice data", type=float, default=1)
parser.add_argument("--seed", help="Random seed", type=int, default=0)
parser.add_argument("--mnl", help="Use MNL instead of mixed logit", action="store_true")

args = parser.parse_args()

df_wide = pd.read_csv(args.input, comment="#")

modes = list(df_wide.columns.str.extract(r"_([a-zA-z]+)_usage", expand=False).dropna().unique())
print("Modes: ", modes)

k = df_wide.columns.str.extract(r"plan_(\d+)", expand=False).dropna().to_numpy(int).max()
print("Number of plans: ", len(df_wide))
print("Number of choices for plan: ", k)

# df_wide["p_id"] = df_wide["p_id"].str.replace(r"_\d+$", "", regex=True)
# df_wide["person"] = df_wide["person"].astype('category').cat.codes

# sample = set(df_wide.person.sample(frac=0.2))
# df_wide = df_wide[df_wide.person.isin(sample)]
if args.sample < 1:
df_wide = df_wide.sample(frac=args.sample, random_state=args.seed)

print("Modes:", modes)
print("Number of choices:", len(df_wide))
ds = read_plan_choices(args.input, sample=args.sample, seed=args.seed)

df_wide['custom_id'] = np.arange(len(df_wide)) # Add unique identifier
df_wide['choice'] = df_wide['choice'].map({1: "plan_1"})

df_wide = calc_costs(df_wide, k, modes)

varying = list(df_wide.columns.str.extract(r"plan_1_([a-zA-z_]+)", expand=False).dropna().unique())

print("Varying:", varying)

df = wide_to_long(df_wide, id_col='custom_id', alt_name='alt', sep='_',
alt_list=[f"plan_{i}" for i in range(1, k + 1)], empty_val=0,
varying=varying, alt_is_prefix=True)
df = wide_to_long(ds.df, id_col='custom_id', alt_name='alt', sep='_',
alt_list=[f"plan_{i}" for i in range(1, ds.k + 1)], empty_val=0,
varying=ds.varying, alt_is_prefix=True)

MixedLogit.check_if_gpu_available()


# ASC is present as mode_usage
varnames = [f"{mode}_usage" for mode in modes if mode != "walk" and mode != "car"] + ["car_used"]
varnames = [f"{mode}_usage" for mode in ds.modes if mode != "walk" and mode != "car"] + ["car_used"]
# varnames += ["pt_ride_hours", "car_ride_hours", "bike_ride_hours"]
# varnames = ["car_used", "car_usage"]

Expand All @@ -77,7 +49,7 @@
optim_method='L-BFGS-B')

else:
#varnames += ["car_usage"]
# varnames += ["car_usage"]

model = MultinomialLogit()
model.fit(X=df[varnames], y=df['choice'], weights=df['weight'], varnames=varnames,
Expand Down
File renamed without changes.
108 changes: 108 additions & 0 deletions src/main/python/choicemodels/prepare.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-

from collections import namedtuple, defaultdict

import numpy as np
import pandas as pd
from scipy.stats import truncnorm

# Cost parameter from current berlin model
daily_costs = defaultdict(lambda: 0.0, car=-14.30, pt=-3)
km_costs = defaultdict(lambda: 0.0, car=-0.149, ride=-0.149)

TN = truncnorm(0, np.inf)

PlanChoice = namedtuple("PlanChoice", ["df", "modes", "varying", "k"])


def read_plan_choices(input_file: str, sample: float = 1, seed: int = 42) -> PlanChoice:
""" Read plan choices from input file """

df_wide = pd.read_csv(input_file, comment="#")

modes = list(df_wide.columns.str.extract(r"_([a-zA-z]+)_usage", expand=False).dropna().unique())
print("Modes: ", modes)

k = df_wide.columns.str.extract(r"plan_(\d+)", expand=False).dropna().to_numpy(int).max()
print("Number of plans: ", len(df_wide))
print("Number of choices for plan: ", k)

# df_wide["p_id"] = df_wide["p_id"].str.replace(r"_\d+$", "", regex=True)
# df_wide["person"] = df_wide["person"].astype('category').cat.codes

# sample = set(df_wide.person.sample(frac=0.2))
# df_wide = df_wide[df_wide.person.isin(sample)]
if sample < 1:
df_wide = df_wide.sample(frac=sample, random_state=seed)

print("Modes:", modes)
print("Number of choices:", len(df_wide))

df_wide['custom_id'] = np.arange(len(df_wide)) # Add unique identifier
df_wide['choice'] = df_wide['choice'].map({1: "plan_1"})

df_wide = calc_plan_variables(df_wide, k, modes)

varying = list(df_wide.columns.str.extract(r"plan_1_([a-zA-z_]+)", expand=False).dropna().unique())

return PlanChoice(df_wide, modes, varying, k)


def tn_generator(sample_size: int, number_of_draws: int) -> np.ndarray:
"""
User-defined random number generator to the database.
See the numpy.random documentation to obtain a list of other distributions.
"""
return TN.rvs((sample_size, number_of_draws))


def calc_plan_variables(df, k, modes, use_util_money=False):
""" Calculate utility and costs variables for all alternatives in the dataframe"""

util_performing = -6.88

# Normalize activity utilities to be near zero
# columns = [f"plan_{i}_act_util" for i in range(1, k + 1)]
# for t in df.itertuples():
# utils = df.loc[t.Index, columns]
# df.loc[t.Index, columns] -= utils.max()

# Marginal utility of money as factor
util_money = df.util_money if use_util_money else 1

for i in range(1, k + 1):

# Price is only monetary costs
df[f"plan_{i}_price"] = 0

# Costs will also include time costs
df[f"plan_{i}_utils"] = 0

df[f"plan_{i}_tt_hours"] = 0

for mode in modes:

fixed_costs = (df[f"plan_{i}_{mode}_usage"] > 0) * daily_costs[mode]
distance_costs = df[f"plan_{i}_{mode}_km"] * km_costs[mode]

df[f"plan_{i}_{mode}_fixed_cost"] = fixed_costs
df[f"plan_{i}_price"] += fixed_costs + distance_costs

df[f"plan_{i}_{mode}_used"] = (df[f"plan_{i}_{mode}_usage"] > 0) * 1
df[f"plan_{i}_tt_hours"] += df[f"plan_{i}_{mode}_hours"]

# Add configured time costs
df[f"plan_{i}_utils"] += (fixed_costs + distance_costs) * util_money

# Add time costs the overall costs
df[f"plan_{i}_utils"] += util_performing * df[f"plan_{i}_{mode}_hours"]

# Add additional ride time utils for the driver
if mode == "ride":
df[f"plan_{i}_utils"] += util_performing * df[f"plan_{i}_{mode}_hours"]

# Defragment df
df = df.copy()

return df
25 changes: 25 additions & 0 deletions src/main/sh/runPlanChoices.sh
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
#!/usr/bin/env bash

CONFIG="./input/v6.3/berlin-v6.3.config.xml"
JVM_ARGS="-Xmx22G -Xms22G -XX:+AlwaysPreTouch -XX:+UseParallelGC"

run_eval() {
echo "Running evaluation with $1"
java $JVM_ARGS -cp matsim-berlin-*.jar org.matsim.prepare.choices.ComputePlanChoices --config $CONFIG\
--scenario org.matsim.run.OpenBerlinScenario\
--args --10pct\
--modes walk,pt,car,bike,ride\
$1
}


run_eval "--plan-candidates bestK --top-k 3"
run_eval "--plan-candidates bestK --top-k 5"
run_eval "--plan-candidates bestK --top-k 9"
run_eval "--plan-candidates bestK --top-k 3 --time-util-only"
run_eval "--plan-candidates bestK --top-k 5 --time-util-only"
run_eval "--plan-candidates bestK --top-k 9 --time-util-only"
run_eval "--plan-candidates random --top-k 3"
run_eval "--plan-candidates random --top-k 5"
run_eval "--plan-candidates random --top-k 9"
run_eval "--plan-candidates diverse --top-k 9"

0 comments on commit c36120d

Please sign in to comment.