\chapter{Markov Decision Processes}
\label{chap:markov-decision-processes}
Systems characterized by discrete states and by a non-linear update equation are better described by a \emph{Markov decision process} (MDP) than by a continuous state-space representation. An MDP is a problem formulation for modeling sequential decisions guided by rewards under uncertainty in how the system transitions from state to state.
An MDP is characterized by
\begin{itemize}
\item a set $\mathcal{X}$ of $N$ states;
\item a set $\mathcal{U}$ of $M$ control actions;
\item the time-evolution described by a transition probability function\footnote{I increasingly like the notation from~\cite[Sect.~3.1]{reinforcement-learning-sutton-barto} for the model dynamics
\begin{equation*}
p:\mathcal{X}\times\mathcal{R}\times\mathcal{X}\times\mathcal{U} \rightarrow [0,1]
\end{equation*}
such that
\begin{equation*}
p(x^\prime,r|x,u) \doteq Pr\{S_{t+1}=x^\prime,R_{t+1}=r|S_t=x,A_t=u\}.
\end{equation*}
The three argument function used in class is defined as
\begin{equation*}
p(x^\prime|x,u) = \sum_{r\in\mathcal{R}} p(x^\prime,r|x,u);
\end{equation*}
and the expected reward becomes
\begin{equation*}
r(x^\prime,x,u) \doteq \mathbb{E}[R_{t+1}|S_t=x,A_t=u,S_{t+1}=x^\prime] = \sum_{r\in\mathcal{R}} r \frac{p(x^\prime,r|x,u)}{p(x^\prime|x,u)}.
\end{equation*}
The Bellman equation is
\begin{equation*}
V_k^\pi(x) = \sum_u \pi_k(x,u) \sum_{x^\prime,r} p(x^\prime,r|x,u) \left[r+V_{k+1}^\pi(x^\prime)\right]
\end{equation*}
or, for the infinite horizon,
\begin{equation*}
V^\pi(x) = \sum_u \pi(x,u) \sum_{x^\prime,r} p(x^\prime,r|x,u) \left[r+\gamma V^\pi(x^\prime)\right]
\end{equation*}
highlighting the quantity $r+\gamma V^\pi(x^\prime)$, similar to the one used in RL.}
\begin{equation*}
P : \mathcal{X}\times \mathcal{X}\times \mathcal{U} \rightarrow [0,1]
\end{equation*}
where
\begin{equation*}
P^u_{x,x^\prime} = \mathbb{P}[x^\prime |x,u]
\end{equation*}
is the \emph{probability} of transitioning to state $x^\prime$ when the MDP is in state $x$ and the input $u$ is applied. The Markov property for probabilistic systems extends the definition for deterministic systems: for every $x^\prime$, the probability $P^u_{x,x^\prime}$ depends only on $x$ and $u$, not on the past system history;
\item an immediate cost
\begin{equation*}
r(x,u,x^\prime)
\end{equation*}
incurred when the MDP, being in state $x$, transitions to state $x^\prime$ under action $u$.
\end{itemize}
An MDP is typically represented graphically, like a Markov chain, with states depicted as circles connected by arrows labeled with the corresponding transition probabilities.
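As a concrete, minimal sketch of these ingredients (the array names and shapes are my own choice, not part of the formal definition), the transition probabilities can be stored as a tensor \texttt{P[u, x, x']} and the immediate cost as \texttt{r\_cost[u, x, x']}; the later sketches in this chapter reuse these two tensors.
\begin{verbatim}
import numpy as np

N, M = 4, 2                # number of states and of control actions
rng = np.random.default_rng(0)

# Transition probabilities P[u, x, x'] = P^u_{x,x'}; random values, for illustration only
P = rng.random((M, N, N))
P /= P.sum(axis=2, keepdims=True)   # each P^u must be a stochastic matrix

# Immediate cost r(x, u, x') stored as r_cost[u, x, x']
r_cost = rng.random((M, N, N))

# Sanity check: for every action u and state x the next-state probabilities sum to 1
assert np.allclose(P.sum(axis=2), 1.0)
\end{verbatim}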
A control policy selects a suitable action $u\in \mathcal{U}$ based on the current state $x\in \mathcal{X}$ of the system. A policy can be deterministic
\begin{equation*}
\mu : \mathcal{X} \rightarrow \mathcal{U}
\end{equation*}
or more generally, stochastic
\begin{equation*}
\pi : \mathcal{X} \times \mathcal{U} \rightarrow [0,1],
\end{equation*}
where $\pi(x,u)$ is the conditional probability of selecting the input action $u$ given that the MDP is in state $x$. A deterministic policy is a special case of a stochastic one, taking on the probability value $1$ for the action $u$ that the deterministic policy would select and $0$ for all others. An optimal policy is deterministic\footnote{Explain why.}, but for intermediate steps it is useful to work with stochastic policies.
The stage cost at time $k$ for the state transition from $x_k$ to $x_{k+1}$ under the action $u_k$ is
\begin{equation*}
r_k = r(x_k,u_k,x_{k+1}).
\end{equation*}
The quantity we want to control is the accumulated cost $\sum_{k=0}^K r_k$: in particular, we look for the policy $\bs{\pi}=\{\pi_0,\pi_1,\ldots,\pi_K\}$ that minimizes
\begin{equation*}
%V^\pi(x) = \mathbb{E}\left[\sum_{i=0}^K r_i|x_0=x\right]
V^\pi(x) = \underset{x_0=x}{\mathbb{E}}\ \sum_{k=0}^K r_k
\end{equation*}
given the initial state $x_0=x$.
\section{Dynamic Programming for MDP}
\label{sec:MDP-dynamic-programming}
In order to solve this problem recursively, we define the accumulated cost
\begin{equation*}
%V_k^\pi(x) = \mathbb{E}\left[\sum_{i=k}^K r_i|x_k=x\right]
V_k^\pi(x) = \underset{x_k=x}{\mathbb{E}}\ \sum_{i=k}^K r_i,
\end{equation*}
where the expectation is taken over the state transitions and the policies $\{\pi_k, \pi_{k+1},\ldots,\pi_K\}$. Since the system is Markovian, backward induction can be used to evaluate $V_k^\pi(x)$:
\begin{equation}
\label{eq:MDP-recursive-value}
\begin{aligned}
V_k^\pi(x) &= \underset{x_k=x}{\mathbb{E}}\sum_{i=k}^K r_i \\
&= \underset{x_k=x}{\mathbb{E}}\left[r_k + \sum_{i=k+1}^K r_i\right] \\
&= \sum_u \pi_k(x,u)\sum_{x^\prime} P_{x,x^\prime}^u \left[r(x,u,x^\prime) + \underset{x_{k+1}=x^\prime}{\mathbb{E}}\sum_{i=k+1}^K r_i\right] & \quad (*) \\
&= \sum_u \pi_k(x,u)\left[R^u_x + \sum_{x^\prime}P_{x,x^\prime}^u V^\pi_{k+1}(x^\prime)\right]
% &= \sum_u \pi_k(x,u)\sum_{x^\prime}P_{x,x^\prime}û \left[r(x,u,x^\prime) + V^\pi_{k+1}(x^\prime)\right]
\end{aligned}
\end{equation}
where
\begin{equation*}
% R^u_x = \mathbb{E}[r_k|x_k=x,u_k=u]
R^u_x = \sum_{x^\prime} P^u_{x,x^\prime}r(x,u,x^\prime)
\end{equation*}
is the averaged immediate cost.
To compute the expectation in $(*)$ we sum over the possible control actions $u$, each selected with probability $\pi_k(x,u)$, and over the possible next states $x^\prime$, each reached with probability $P_{x,x^\prime}^u$. The term in square brackets is the immediate cost $r(x,u,x^\prime)$ incurred at time $k$ plus the cost accumulated from time $k+1$ starting from the state $x_{k+1}=x^\prime$.
By Bellman's optimality principle, the optimal cost can be computed recursively using eq.~\eqref{eq:MDP-recursive-value} as a minimization on the action $u$
\begin{equation*}
V_k^\star(x) = \min_{\pi_k} \sum_u \pi_k(x,u)\left[R^u_x + \sum_{x^\prime}P_{x,x^\prime}^u V^\star_{k+1}(x^\prime)\right].
\end{equation*}
For a finite-horizon problem with horizon $K$, starting from the base case $V_K$, a policy $\pi_k$ is found at each step $k$: under weak conditions\footnote{Which ones?}, this policy is deterministic because the objective is linear in $\pi_k(x,\cdot)$, so its minimum lies at one of the corners of the simplex defined by the constraints on the probabilities
\begin{equation*}
\pi(x,u) \ge 0,\qquad \sum_u \pi(x,u) = 1.
\end{equation*}
Not sure about this one: the plot in class was given without labeling the axes.
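Setting that aside, the backward induction itself is straightforward to implement. A minimal sketch, assuming the minimization can be restricted to deterministic actions (as argued above), a zero terminal cost, and the \texttt{P} and \texttt{r\_cost} tensors of the earlier sketch:
\begin{verbatim}
import numpy as np

def backward_induction(P, r_cost, K):
    """Finite-horizon DP: returns V[k, x] and the greedy deterministic policy mu[k, x]."""
    M, N, _ = P.shape
    R = (P * r_cost).sum(axis=2)          # averaged immediate cost R^u_x, shape (M, N)
    V = np.zeros((K + 1, N))              # terminal cost V_K = 0 (assumed)
    mu = np.zeros((K, N), dtype=int)      # deterministic policy mu_k(x)
    for k in range(K - 1, -1, -1):
        # Q[u, x] = R^u_x + sum_{x'} P^u_{x,x'} V_{k+1}(x')
        Q = R + P @ V[k + 1]
        V[k] = Q.min(axis=0)              # optimal cost-to-go
        mu[k] = Q.argmin(axis=0)          # minimizing (deterministic) action
    return V, mu
\end{verbatim}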
\subsection{Complexity for Finite Horizon}
\label{sec:MDP-complexity-finite-horizon}
We have reduced the original stochastic non-linear system to a linear one and solved it by backward induction with linear programming. The price to pay is the increased size of the problem: at each time step $k$ and for each of the $N$ states, the minimization must compare the $M$ control inputs; a general stochastic policy $\pi_k$ at step $k$ is a matrix of size $M\times N$, which reduces to a vector of $N$ actions since the optimal policy is expected to be deterministic.
%For instance if MDP is applied to an inverted pendulum, the discrete state space is obtained by slicing the pendulum angle and the torque applied in discrete values. Assuming that both angle and torque take on 1000 discrete values, at each time step $k$
\section{Infinite Horizon Problem}
\label{sec:MDP-infinite-horizon}
Most of the time when working with MDPs we consider an infinite horizon, which makes the Markov process stationary, with a time-invariant expected cost $R_x^u$ and a time-invariant policy $\pi$.
The infinite-horizon cost is defined to be the quantity
\begin{equation}
\label{eq:MDP-infinite-horizon-cost}
\sum_{k=0}^\infty \gamma^k r_k
\end{equation}
where $0 \le \gamma < 1$ is the discount factor, which ensures that the cost is finite\footnote{Without a discount factor the sum of the rewards can become infinite. Suppose that strategy $\pi_1$ results in a reward $1$ for each time step and $\pi_2$ in a reward 100. A rational agent should prefer $\pi_2$ but both strategies have the same infinite cost~\cite{decision-making-kochenderfer}.}. The value of $\gamma$ achieves a balance between immediate and future costs and accounts for the lower impact of future costs: $\gamma \approx 0$ generates a myopic policy, $\gamma \approx 1$ a long-sighted one; for the extreme case $\gamma = 0$, only the immediate cost is considered.
The cost of eq.~\eqref{eq:MDP-infinite-horizon-cost} mirrors real-world problems: for instance, it is economically advantageous to delay spending an amount of money that can meanwhile be invested at a guaranteed base rate. The discount also admits a Bernoulli-termination interpretation: at each step the process terminates with probability $1-\gamma$, so later costs are only incurred if the process is still running.
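As a sanity check on the finiteness claim, take a constant per-step cost $r_k \equiv \bar{r}$: the geometric series gives
\begin{equation*}
\sum_{k=0}^\infty \gamma^k \bar{r} = \frac{\bar{r}}{1-\gamma},
\end{equation*}
so the two strategies of the footnote above accumulate the finite, and now comparable, totals $1/(1-\gamma)$ and $100/(1-\gamma)$.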
\subsection{Closed-Loop MDP}
For a state-space system with linear dynamics $\dot{x} = Ax+Bu$, the feedback law $u=-Kx$ reduces the system dynamics to the closed loop response
\begin{equation*}
\dot{x} = (A-BK)x.
\end{equation*}
Similarly, for an MDP, given the policy $\pi$, the dynamics reduces to a Markov chain where the closed-loop transition probabilities can be expressed by
\begin{equation*}
P^\pi _{x,x^\prime} = \sum_u \mathbb{P}[u|x] \mathbb{P}[x^\prime|x,u] = \sum_u \pi(x,u) P_{x,x^\prime}^u
\end{equation*}
Let $\mathtt{P}^\pi \in \mathbb{R}^{N\times N}$ be the matrix with entries $\mathtt{P}^\pi_{x,x^\prime} = \sum_u \pi(x,u) P_{x,x^\prime}^u$. Since $\mathtt{P}^\pi$ represents transition probabilities, its rows must sum to $1$. Moreover, by the Perron-Frobenius theorem, its eigenvalues have modulus at most $1$, and $1$ is itself an eigenvalue.
At time $k$, the system is in state $x$ with probability $d_k(x)$; this distribution depends on $\pi$ and evolves according to the linear equation
\begin{equation*}
d_{k+1}^\top = d_k^\top \mathtt{P}^\pi.
\end{equation*}
When the horizon is infinite, the stationary distribution $d(x)$ satisfies
\begin{equation*}
d^\top = d^\top \mathtt{P}^\pi.
\end{equation*}
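Continuing the earlier sketch (with the stochastic policy stored as a matrix \texttt{pi[x, u]} $=\pi(x,u)$, an assumption of this illustration), the closed-loop matrix and the evolution of the distribution read:
\begin{verbatim}
import numpy as np

def closed_loop_matrix(P, pi):
    """P_pi[x, x'] = sum_u pi(x, u) P^u_{x,x'}; pi has shape (N, M)."""
    return np.einsum('xu,uxy->xy', pi, P)

def propagate(d0, P_pi, k):
    """Evolve the state distribution for k steps: d_{k+1}^T = d_k^T P_pi."""
    d = d0.copy()
    for _ in range(k):
        d = d @ P_pi
    return d
\end{verbatim}
Each row of \texttt{P\_pi} still sums to one, so \texttt{d} remains a probability distribution at every step.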
\subsection{Example: The Stationary Distribution for the Traffic Light Problem}
We consider the example problem of controlling a queue of vehicles at a traffic light. The state is the number of cars waiting; the control actions are the states of the traffic light.
At each time step, there is a probability $p$ that a new car arrives and a probability $1-p$ that the queue length does not grow. We select the deterministic policy $\mu(x)$ under which the traffic light turns green as soon as there are 3 cars waiting; states with more than 3 cars need not be considered because they are never reached. Under this policy, the transition probabilities become
\begin{equation*}
P^\pi =
\begin{bmatrix}
1-p & p & 0 & 0 \\
0 & 1-p & p & 0 \\
0 & 0 & 1-p & p \\
1-p & p & 0 & 0
\end{bmatrix}
\end{equation*}
The stationary distribution is found by solving the left eigenvector problem of $\mathtt{P}^\pi$ for the eigenvalue $1$ (equivalently, the right eigenvector problem for $(\mathtt{P}^\pi)^\top$):
\begin{equation*}
d =
%\begin{bmatrix}
% \frac{1-p}{3} & \frac{1}{3} & \frac{1}{3} & \frac{p}{3}
%\end{bmatrix}^\top.
\frac{1}{3}\begin{bmatrix}
1-p \\ 1 \\ 1 \\ p
\end{bmatrix}.
\end{equation*}
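As a quick numerical check of this result (a sketch; the value $p = 0.3$ is arbitrary), the stationary distribution can be computed as the eigenvector of $(\mathtt{P}^\pi)^\top$ for the eigenvalue $1$, normalized to sum to one:
\begin{verbatim}
import numpy as np

p = 0.3                                      # arbitrary arrival probability, for illustration
P_pi = np.array([[1-p, p,   0,   0  ],
                 [0,   1-p, p,   0  ],
                 [0,   0,   1-p, p  ],
                 [1-p, p,   0,   0  ]])

# Left eigenvector of P_pi for eigenvalue 1 = right eigenvector of P_pi transposed
w, V = np.linalg.eig(P_pi.T)
d = np.real(V[:, np.argmin(np.abs(w - 1.0))])
d /= d.sum()                                 # normalize to a probability distribution

print(d)                                     # matches (1/3) * [1-p, 1, 1, p]
assert np.allclose(d, np.array([1-p, 1, 1, p]) / 3)
\end{verbatim}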
\section{Computational Complexity of the Bellman Equation}
\label{sec:MDP-computation-complexity-bellman-eq}
The infinite-horizon value of a policy can be expressed recursively, similarly to eq.~\eqref{eq:MDP-recursive-value}, with no dependence on $k$, since the policy and the cost are stationary,
\begin{equation}
\label{eq:MDP-recursive-value-infinite-horizon}
V^\pi(x) = \sum_u \pi(x,u)\left[R^u_x + \gamma \sum_{x^\prime}P^u_{x,x^\prime} V^\pi(x^\prime)\right]
\end{equation}
and with the discount term $\gamma$. The proof is similar to that for eq.~\eqref{eq:MDP-recursive-value}.
If the minimum is achieved at a deterministic policy, eq.~\eqref{eq:MDP-recursive-value-infinite-horizon} becomes a system of $N$ \emph{non-linear} equations\footnote{But how can the system be deterministic if $P_{x,x^\prime}^u$ is still a probabilistic function? Should it not be
\begin{equation*}
V^\star(x) = \min_u \left[r(x,u,x^+) + \gamma V^\star(x^+)\right].
\end{equation*}
with $x^+ = f(x,u)$? Or is it because of the disturbances?}:
\begin{equation*}
V^\star(x) = \min_u \left[R^u_x + \gamma \sum_{x^\prime}P^u_{x,x^\prime} V^\star(x^\prime)\right].
\end{equation*}
We next consider two different approaches to solve this equation: policy iteration and value iteration.
\subsection{Policy Iteration}
\label{sec:MDP-policy-iteration}
Eq.~\eqref{eq:MDP-recursive-value-infinite-horizon} is a system of linear equations that can be written in terms of the vector $\mathtt{V}^\pi\in\mathbb{R}^N$ as
\begin{equation}
\label{eq:policy-iteration-policy-evaluation}
\mathtt{V}^\pi = \mathtt{R}^\pi + \gamma \mathtt{P}^\pi \mathtt{V}^\pi
\end{equation}
where $\mathtt{R}^\pi$ is the $N$-element vector with entries $\mathtt{R}^\pi_x = \sum_u \pi(x,u)R_x^u$.
Policy iteration starts by guessing a policy $\pi_0$ and then repeats the following two steps until the policy converges:
\begin{itemize}
\item policy evaluation: compute the value function $V^\pi$ associated to the policy $\pi$ by solving eq.~\eqref{eq:policy-iteration-policy-evaluation};
\item policy improvement: update the policy by evaluating
\begin{equation}
\label{eq:MDP-policy-improvement}
\pi^\prime = \arg \min_\nu \sum_u \nu(x,u)\left[R^u_x + \gamma \sum_{x^\prime}P^u_{x,x^\prime} V^\pi(x^\prime)\right]
\end{equation}
\end{itemize}
An optimal policy can indeed be found using the greedy approach of eq.~\eqref{eq:MDP-policy-improvement}, where the improved policy $\pi^\prime$ is applied for one step only and the previous policy $\pi$ is used for the computation of $V^\pi(x^\prime)$. This greedy improvement finds the optimal policy because, for every state $x$, it does not make the value function worse, and it strictly improves it for at least one state unless $\pi$ is already optimal.
\begin{theorem}
The policy update eq.~\eqref{eq:MDP-policy-improvement} satisfies
\begin{equation*}
V^{\pi^\prime}(x) \le V^\pi(x)\ \forall x,\quad \text{and }\exists x^\prime \text{ such that }V^{\pi^\prime}(x^\prime) < V^\pi(x^\prime)
\end{equation*}
unless $\pi$ is already the optimal policy.
\end{theorem}
\begin{proof}
If $\pi^\prime$ is the policy improving on $\pi$, then by the definition of the minimization in eq.~\eqref{eq:MDP-policy-improvement} one has
\begin{align*}
V^\pi(x) &\ge \sum_u \pi^\prime(x,u)\left[R^u_x + \gamma \sum_{x^\prime}P^u_{x,x^\prime} V^\pi(x^\prime)\right] \\
&= \mathbb{E} \left[r_0 + \gamma V^\pi(x_1)\right] \\
&\ge \mathbb{E} \left[r_0 + \gamma \left[r_1 + \gamma V^\pi(x_2)\right]\right] = \mathbb{E} \left[r_0 + \gamma r_1 + \gamma^2 V^\pi(x_2)\right] \\
&\ge \ldots \\
&\ge \mathbb{E} \left[r_0 + \gamma r_1 + \gamma^2 r_2 + \gamma^3 r_3 + \ldots \right] = V^{\pi^\prime}(x)
\end{align*}
Moreover, the optimum is reached in a finite number of improvements (at most $M^N$, the number of deterministic policies), since there is a finite number of actions and states.
\end{proof}
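The two steps translate into a short sketch (deterministic policies are represented as an integer vector \texttt{mu}; \texttt{P}, \texttt{r\_cost} and \texttt{gamma} are as in the earlier sketches):
\begin{verbatim}
import numpy as np

def policy_iteration(P, r_cost, gamma):
    M, N, _ = P.shape
    R = (P * r_cost).sum(axis=2)              # R^u_x, shape (M, N)
    mu = np.zeros(N, dtype=int)               # initial guess: first action everywhere
    while True:
        # Policy evaluation: solve V = R^mu + gamma * P^mu V (linear system)
        P_mu = P[mu, np.arange(N)]            # closed-loop transition matrix, shape (N, N)
        R_mu = R[mu, np.arange(N)]            # cost under mu, shape (N,)
        V = np.linalg.solve(np.eye(N) - gamma * P_mu, R_mu)
        # Policy improvement: greedy with respect to V
        Q = R + gamma * P @ V                 # shape (M, N)
        mu_new = Q.argmin(axis=0)
        if np.array_equal(mu_new, mu):
            return V, mu                      # the policy has converged
        mu = mu_new
\end{verbatim}
The call to \texttt{np.linalg.solve} is the policy-evaluation step of eq.~\eqref{eq:policy-iteration-policy-evaluation}; the \texttt{argmin} is the greedy improvement of eq.~\eqref{eq:MDP-policy-improvement}.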
\subsection{Value Iteration}
\label{sec:MDP-value-iteration}
An alternative approach is to look for a fixed point of the Bellman optimality equation by iterating
\begin{equation}
\label{eq:MDP-value-iteration}
V^{(t+1)}(x) = \min_\pi \sum_u \pi(x,u)\left[R^u_x + \gamma \sum_{x^\prime}P^u_{x,x^\prime} V^{(t)}(x^\prime)\right]
\end{equation}
until convergence, \textit{i.e.} when the \emph{Bellman residual} falls below a certain threshold $\delta$
\begin{equation*}
|\!| V^{(t+1)} - V^{(t)}|\!|_\infty \le \delta.
\end{equation*}
Convergence of the value iteration to $V^\star$ at rate $\gamma$ is guaranteed by the observation that
\begin{equation*}
||V^{(t+1)}-W^{(t+1)}||_\infty \le \gamma ||V^{(t)}-W^{(t)}||_\infty
\end{equation*}
where $V^{(t)}$ and $W^{(t)}$ are any two value functions and $V^{(t+1)}$, $W^{(t+1)}$ are obtained from them through the update of eq.~\eqref{eq:MDP-value-iteration}: the Bellman update is a $\gamma$-contraction in the $\infty$-norm. To see why this implies convergence, it is sufficient to take $W^{(t)}(x) = V^\star(x)$ identically, the fixed point of eq.~\eqref{eq:MDP-value-iteration}: then $W^{(t+1)} = V^\star$ as well, and $V^{(t)}(x)$ converges to $V^\star(x)$ at rate $\gamma$.
Value iteration starts by guessing a value function $V^{(0)}$ and then, at each iteration, updates the value function using eq.~\eqref{eq:MDP-value-iteration}. At convergence, the optimal policy $\pi^\star$ is computed as
\begin{equation*}
\pi^\star = \arg \min_\nu \sum_u \nu(x,u)\left[R^u_x + \gamma \sum_{x^\prime}P^u_{x,x^\prime} V^\star(x^\prime)\right]
\end{equation*}
The computational complexity of value iteration is lower than that of policy iteration per iteration, because policy iteration must also solve the linear system of eq.~\eqref{eq:policy-iteration-policy-evaluation} at every step; on the other hand, the convergence of value iteration is only asymptotic. For discount factors $\gamma$ close to $1$, which correspond to a long effective horizon, the convergence is typically very slow.
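For comparison, a minimal sketch of value iteration with the Bellman-residual stopping rule (\texttt{delta} is the tolerance; the other names match the earlier sketches):
\begin{verbatim}
import numpy as np

def value_iteration(P, r_cost, gamma, delta=1e-8):
    M, N, _ = P.shape
    R = (P * r_cost).sum(axis=2)                      # R^u_x, shape (M, N)
    V = np.zeros(N)                                   # initial guess V^(0)
    while True:
        V_new = (R + gamma * P @ V).min(axis=0)       # Bellman update
        if np.max(np.abs(V_new - V)) <= delta:        # Bellman residual check
            break
        V = V_new
    pi_star = (R + gamma * P @ V_new).argmin(axis=0)  # greedy optimal policy
    return V_new, pi_star
\end{verbatim}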
%%% Local Variables:
%%% mode: latex
%%% TeX-master: "notes"
%%% End: