Merge pull request #1 from troiwill/remove-dict-in-response
Simplified the response variable and the model.
troiwill authored Mar 25, 2024
2 parents 6058dbe + 4cdcf60 commit 42e77f0
Showing 14 changed files with 162 additions and 93 deletions.
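At a glance, the PR replaces the dict-backed Response with a small class that exposes a scalar reward and overloads arithmetic and comparison operators, so planners and problem code read response.reward instead of indexing by key. A minimal before/after sketch (the agent, state, action, and best_response variables are illustrative, not taken from the diff):

    # Before this PR: Response subclassed dict, so callers indexed by key.
    response = agent.response_model.sample(state, action, next_state)
    reward = response["reward"]

    # After this PR: Response carries a scalar reward attribute and supports
    # arithmetic/comparison, so callers use it directly.
    response = agent.response_model.sample(state, action, next_state)
    reward = response.reward
    if response > best_response:   # __gt__ compares the underlying rewards
        best_response = response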
24 changes: 12 additions & 12 deletions pomdp_py/algorithms/po_rollout.pyx
@@ -46,7 +46,7 @@ cdef class PORollout(Planner):
self._particles = particles

self._agent = None
self._last_best_response = Response({"reward": float('-inf')})
self._last_best_response = None

@property
def last_best_response(self):
@@ -60,24 +60,24 @@ cdef class PORollout(Planner):

cpdef _search(self):
cdef Action best_action
cdef Response best_response = Response()
cdef Response response_avg = Response()
cdef Response total_discounted_response = Response()
cdef Response best_response
cdef Response response_avg
cdef Response total_discounted_response
cdef set legal_actions
cdef list responses

best_action, best_response["reward"] = None, float("-inf")
best_action, best_response = None, Response(float("-inf"))
legal_actions = self._agent.valid_actions(history=self._agent.history)
for action in legal_actions:
responses = []
for i in range(self._num_sims // len(legal_actions)):
state = self._agent.belief.random()
total_discounted_response = self._rollout(state, 0)
responses.append(total_discounted_response["reward"])
response_avg["reward"] = sum(responses) / len(responses)
if response_avg["reward"] > best_response["reward"]:
responses.append(total_discounted_response)
response_avg = sum(responses) / len(responses)
if response_avg > best_response:
best_action = action
best_response["reward"] = response_avg["reward"]
best_response = response_avg
return best_action, best_response

cpdef _rollout(self, State state, int depth):
@@ -87,7 +87,7 @@ cdef class PORollout(Planner):
cdef Response total_discounted_response = Response()
cdef State next_state
cdef Observation observation
cdef Response response = Response()
cdef Response response
cdef int nsteps
cdef tuple history = self._agent.history

@@ -130,8 +130,8 @@
def clear_agent(self):
"""clear_agent(self)"""
self._agent = None # forget about current agent so that can plan for another agent.
self._last_best_response["reward"] = float('-inf')
self._last_best_response = Response(float('-inf'))

cpdef set_rollout_policy(self, RolloutPolicy rollout_policy):
"""
set_rollout_policy(self, RolloutPolicy rollout_policy)
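The search loop above now accumulates and compares Response objects directly rather than their "reward" entries; the comparison relies on the operator overloads added to Response in basics.pyx further down. A small illustration of those semantics (the values are made up):

    best = Response(float("-inf"))
    candidate = Response(1.5)
    if candidate > best:       # __gt__ delegates to the wrapped rewards
        best = candidate
    print(best > 1.0)          # comparisons against plain floats/ints also work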
8 changes: 4 additions & 4 deletions pomdp_py/algorithms/po_uct.pyx
@@ -75,7 +75,7 @@ cdef class QNode(TreeNode):

cpdef void update(QNode self, Response response):
self.num_visits += 1
self.value = self.value + (response["reward"] - self.value) / self.num_visits
self.value = self.value + (response.reward - self.value) / self.num_visits


cdef class VNode(TreeNode):
@@ -371,7 +371,7 @@ cdef class POUCT(Planner):
State state, tuple history, VNode root, QNode parent,
Observation observation, int depth):
if depth > self._max_depth:
return Response()
return self._agent.response_model.create_response()
if root is None:
if self._agent.tree is None:
root = self._VNode(root=True)
@@ -409,10 +409,10 @@ cdef class POUCT(Planner):
cpdef _rollout(self, State state, tuple history, VNode root, int depth):
cdef Action action
cdef float discount = 1.0
cdef Response total_discounted_response = Response()
cdef Response total_discounted_response = self._agent.response_model.create_response()
cdef State next_state
cdef Observation observation
cdef Response response = Response()
cdef Response response

while depth < self._max_depth:
action = self._rollout_policy.rollout(state, history)
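Rather than instantiating Response() directly, the planner now asks the agent's ResponseModel for a fresh zero-valued response, so a problem that registers a Response subclass gets that subclass back during tree search and rollouts. A sketch of the intended accumulation pattern (the rollout trace below is hypothetical):

    total = agent.response_model.create_response()    # e.g. Response(reward=0.0)
    for step_response, discount in simulated_steps:   # hypothetical (response, gamma^t) pairs
        total = total + step_response * discount      # uses Response.__add__ / __mul__
    print(total.reward)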
2 changes: 1 addition & 1 deletion pomdp_py/algorithms/value_iteration.pyx
@@ -49,7 +49,7 @@ cdef class _PolicyTreeNode:
else:
subtree_value = 0.0
response = self._agent.response_model.sample(s, self.action, sp)
expected_future_value += trans_prob * obsrv_prob * (response["reward"] + discount_factor*subtree_value)
expected_future_value += trans_prob * obsrv_prob * (response.reward + discount_factor*subtree_value)
values[s] = expected_future_value
return values

10 changes: 6 additions & 4 deletions pomdp_py/framework/basics.pxd
@@ -9,8 +9,10 @@ cdef class TransitionModel:
cdef class PolicyModel:
pass

cdef class ResponseModel(dict):
pass
cdef class ResponseModel:
cdef dict _model_dict
cdef Response _response
cdef dict __dict__

cdef class BlackboxModel:
pass
@@ -34,8 +36,8 @@ cdef class Observation:
cdef class Vector(list):
pass

cdef class Response(dict):
pass
cdef class Response:
cdef float _reward

cdef class Agent:
cdef GenerativeDistribution _init_belief
139 changes: 102 additions & 37 deletions pomdp_py/framework/basics.pyx
@@ -186,19 +186,50 @@ cdef class RewardModel:
Returns the underlying distribution of the model"""
raise NotImplementedError

cdef class ResponseModel(dict):
cdef class ResponseModel:
"""A ResponseModel returns a real or simulated response
after the agent interacts with the real or a simulated environment.
The implementation of this model contains a collection of more
specific models such as reward and cost models."""

def __init__(self, models):
if not isinstance(models, dict):
raise TypeError("models must be a dictionary of models.")
for key, model in models.items():
def __init__(self, response):
self._model_dict = dict()
self._response = response

@staticmethod
def generate_response_model(model_dict, response=Response()):
# Do a sanity check to ensure the response model and response are compatible.
for name in model_dict.keys():
if not hasattr(response, name):
raise AttributeError(f"The response {type(response)} does not have the attribute {name}.")

# Create the response model and add the models.
model = ResponseModel(response)
model.add_models(model_dict)
return model

def add_attrs(self, attr_dict):
if not isinstance(attr_dict, dict):
raise TypeError(f"attr_dict must be type dict, but got {type(attr_dict)}.")

for ak, av in attr_dict.items():
if hasattr(self, ak):
raise KeyError(f"The attribute {ak} already exists.")
setattr(self, ak, None)

def add_models(self, model_dict):
if not isinstance(model_dict, dict):
raise TypeError(f"model_dict must be type dict, but got {type(model_dict)}.")

for model_name, model in model_dict.items():
# Perform a sanity check.
if not hasattr(model, "sample"):
raise NotImplementedError(f"Model named {key} must implement a sample function.")
self[key] = model
raise AttributeError(f"The model {model_name} does not have a sample(...) function.")

# Store the model name for quick access in sample(...) function.
self._model_dict[model_name] = model

# Add the models to the response model.
self.add_attrs(model_dict)

def sample(self, state, action, next_state, **kwargs):
"""sample(self, state, action, next_state)
@@ -212,12 +243,17 @@ cdef class ResponseModel(dict):
Returns:
Response: the response
"""
return Response(
dict([
(name, model.sample(state, action, next_state, **kwargs))
for name, model in self.items()
])
)
return self.create_response(**dict([
(name, model.sample(state, action, next_state, **kwargs))
for name, model in self._model_dict.items()
]))

def create_response(self, *args, **kwargs):
return self._response.new(*args, **kwargs)

# @property
# def response_type(self):
# return type(self._response_type)

cdef class BlackboxModel:
"""
@@ -371,51 +407,80 @@ cdef class Vector(list):
raise TypeError(f"other must be type Vector, float, or int, but got {type(other)}.")
return Vector([v0 + v1 for v0, v1 in zip(self, vec)])

def __radd__(self, other):
return self.__add__(other)

def __mul__(self, other):
if not isinstance(other, (float, int)):
raise TypeError(f"other must be type float or int, but got {type(other)}.")
return Vector([v * other for v in self])

cdef class Response(dict):
def __rmul__(self, other):
return self.__mul__(other)


cdef class Response:
"""
The Response class.
A Response class that only handles a scalar reward. Subclasses of Response can add
more (scalar or vector) variables. But the subclasses must implement how to handle
arithmetic and comparison operations.
"""
def __init__(self, variables=dict(reward=0.0)):
def __init__(self, reward=0.0):
super().__init__()
if not isinstance(variables, dict):
raise TypeError(f"reward must be type dict, but got {type(variables)}.")
for k, v in variables.items():
self[k] = v
self._reward = reward

@property
def reward(self):
return self._reward

@classmethod
def new(cls, reward=0.0):
return cls(reward=reward)

def _check_reward_compatibility(self, value):
if not isinstance(value, (float, int, Response)):
raise TypeError(f"other must be type Response, float, or int, but got {type(value)}.")

def _get_value(self, value):
self._check_reward_compatibility(value)
if isinstance(value, Response):
value = value.reward
return value

def __add__(self, other):
if not isinstance(other, Response):
raise TypeError("other must be type Response.")
return Response(
dict([
(name, value + other[name])
for name, value in self.items()
])
)
return Response(self._reward + self._get_value(other))

def __radd__(self, other):
return self.__add__(other)

def __mul__(self, other):
if not isinstance(other, (float, int)):
raise TypeError("other must be type float or int.")
return Response(
dict([
(name, value * other)
for name, value in self.items()
])
)
return Response(self._reward * other)

def __rmul__(self, other):
return self.__mul__(other)

def __str__(self):
return ", ".join([f"{k}={v}" for k, v in self.items()])
def __eq__(self, other):
return self._reward == self._get_value(other)

def __ne__(self, other):
return self._reward != self._get_value(other)

def __lt__(self, other):
return self._reward < self._get_value(other)

def __le__(self, other):
return self._reward <= self._get_value(other)

def __gt__(self, other):
return self._reward > self._get_value(other)

def __ge__(self, other):
return self._reward >= self._get_value(other)

def __str__(self):
return f"reward={self._reward}"

cdef class Agent:
""" An Agent operates in an environment by taking actions, receiving
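For problem code, the net effect of the basics.pyx changes is that a ResponseModel is now built through the generate_response_model factory from a dict of component models, and fresh responses come from create_response. A sketch of the new construction and sampling flow (MyRewardModel, state, action, and next_state are placeholders, not names from the diff):

    reward_model = MyRewardModel()   # any model implementing sample(state, action, next_state)
    response_model = pomdp_py.ResponseModel.generate_response_model({"reward": reward_model})

    response = response_model.sample(state, action, next_state)
    print(response.reward)                      # attribute access replaces response["reward"]
    zero = response_model.create_response()     # Response(reward=0.0), used by the planners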
9 changes: 6 additions & 3 deletions pomdp_py/problems/load_unload/load_unload.py
@@ -215,15 +215,18 @@ def get_all_actions(self, **kwargs):
class LoadUnloadProblem(pomdp_py.POMDP):
def __init__(self, init_state, init_belief):
"""init_belief is a Distribution."""
import copy

response_model = pomdp_py.ResponseModel.generate_response_model({"reward": LURewardModel()})
agent = pomdp_py.Agent(
init_belief,
LUPolicyModel(),
LUTransitionModel(),
LUObservationModel(),
pomdp_py.ResponseModel({"reward": LURewardModel()}),
copy.deepcopy(response_model),
)

env = pomdp_py.Environment(init_state, LUTransitionModel(), pomdp_py.ResponseModel({"reward": LURewardModel()}))
env = pomdp_py.Environment(init_state, LUTransitionModel(), copy.deepcopy(response_model))

super().__init__(agent, env, name="LoadUnloadProblem")

@@ -268,7 +271,7 @@ def update(t):
action = planner.plan(load_unload_problem.agent)

env_response = load_unload_problem.env.state_transition(action, execute=True)
env_reward = env_response["reward"]
env_reward = env_response.reward
true_state = copy.deepcopy(load_unload_problem.env.state)

real_observation = load_unload_problem.env.provide_observation(
2 changes: 1 addition & 1 deletion pomdp_py/problems/multi_object_search/agent/agent.py
@@ -60,7 +60,7 @@ def __init__(
policy_model,
transition_model=transition_model,
observation_model=observation_model,
response_model=pomdp_py.ResponseModel({"reward": reward_model}),
response_model=pomdp_py.ResponseModel.generate_response_model(dict(reward=reward_model)),
)

def clear_history(self):
4 changes: 2 additions & 2 deletions pomdp_py/problems/multi_object_search/env/env.py
@@ -33,7 +33,7 @@ def __init__(self, dim, init_state, sensors, obstacles=set({})):
if not isinstance(init_state.object_states[objid], RobotState)
}
reward_model = GoalRewardModel(self.target_objects)
super().__init__(init_state, transition_model, pomdp_py.ResponseModel({"reward": reward_model}))
super().__init__(init_state, transition_model, pomdp_py.ResponseModel.generate_response_model(dict(reward=reward_model)))

@property
def robot_ids(self):
@@ -69,7 +69,7 @@ def state_transition(self, action, execute=True, robot_id=None):
response = self.response_model.sample(
self.state, action, next_state, robot_id=robot_id
)
reward = response["reward"]
reward = response.reward
if execute:
self.apply_transition(next_state)
return reward