Skip to content

Commit

Permalink
update ch8
Browse files Browse the repository at this point in the history
  • Loading branch information
johnjim0816 committed Aug 20, 2023
1 parent 481a482 commit 1e62262
Show file tree
Hide file tree
Showing 10 changed files with 1,510 additions and 202 deletions.
428 changes: 246 additions & 182 deletions docs/ch8/main.md

Large diffs are not rendered by default.

Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Binary file modified docs/figs/ch8/per_dqn_pseu.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
582 changes: 582 additions & 0 deletions notebooks/NoisyDQN.ipynb

Large diffs are not rendered by default.

644 changes: 644 additions & 0 deletions notebooks/PER_DQN.ipynb

Large diffs are not rendered by default.

2 changes: 1 addition & 1 deletion pseudocodes/pseudo.tex
Original file line number Diff line number Diff line change
Expand Up @@ -260,7 +260,7 @@ \section{PER-DQN算法}
\begin{algorithmic}[1]
% \REQUIRE $n \geq 0 \vee x \neq 0$ % 输入
% \ENSURE $y = x^n$ % 输出
\STATE 初始化策略网络参数$\theta$ % 初始化
\STATE 初始化当前网络参数$\theta$ % 初始化
\STATE 复制参数到目标网络$\hat{Q} \leftarrow Q$
\STATE 初始化经验回放$D$
\FOR {回合数 = $1,M$}
Expand Down
Binary file modified pseudocodes/pseudo_without_notes.pdf
Binary file not shown.
35 changes: 16 additions & 19 deletions pseudocodes/pseudo_without_notes.tex
Original file line number Diff line number Diff line change
Expand Up @@ -248,39 +248,36 @@ \section{DRQN算法}

\section{PER-DQN算法}
\begin{algorithm}[H] % [H]固定位置
\floatname{algorithm}{{PER-DQN算法}\footnotemark[1]}
\floatname{algorithm}{{PER-DQN算法}}
\renewcommand{\thealgorithm}{} % 去掉算法标号
\caption{}
\renewcommand{\algorithmicrequire}{\textbf{输入:}}
\renewcommand{\algorithmicensure}{\textbf{输出:}}
\begin{algorithmic}[1]
% \REQUIRE $n \geq 0 \vee x \neq 0$ % 输入
% \ENSURE $y = x^n$ % 输出
\STATE 初始化策略网络参数$\theta$ % 初始化
\STATE 复制参数到目标网络$\hat{Q} \leftarrow Q$
\STATE 初始化当前网络参数 $\theta$
\STATE 复制参数到目标网络$\hat{\theta} \leftarrow \theta$
\STATE 初始化经验回放$D$
\FOR {回合数 = $1,M$}
\STATE 重置环境,获得初始状态$s_t$
\FOR {时步 = $1,t$}
\FOR {回合数 $m = 1,2,\cdots,M$}
\STATE 重置环境,获得初始状态$s_0$
\FOR {时步 $t= 1,2,\cdots,T$}
\STATE {\bfseries 交互采样:}
\STATE 根据$\varepsilon$-greedy策略采样动作$a_t$
\STATE 环境根据$a_t$反馈奖励$r_t$和下一个状态$s_{t+1}$
\STATE 存储transition即$(s_t,a_t,r_t,s_{t+1})$到经验回放$D$,并根据TD-error损失确定其优先级$p_t$
\STATE 存储样本$(s_t,a_t,r_t,s_{t+1})$到经验回放$D$中,并根据TD误差损失确定其优先级$p_t$
\STATE 更新环境状态$s_t \leftarrow s_{t+1}$
\STATE {\bfseries 更新策略:}
\STATE 按照经验回放中的优先级别,每个样本采样概率为$P(j)=p_j^\alpha / \sum_i p_i^\alpha$,从$D$中采样一个大小为batch的transition
\STATE {\bfseries 模型更新:}
\STATE 根据每个样本的优先级计算采样概率$P(j)=p_j^\alpha / \sum_i p_i^\alpha$,从$D$中采样一个批量的样本
\STATE 计算各个样本重要性采样权重 $w_j=(N \cdot P(j))^{-\beta} / \max _i w_i$
\STATE 计算TD-error $\delta_j$ ; 并根据TD-error更新优先级$p_j$
\STATE 计算实际的$Q$值,即$y_{j}$\footnotemark[2]
\STATE 根据重要性采样权重调整损失 $L(\theta)=\left(y_{j}-Q\left(s_{j}, a_{j} ; \theta\right)\cdot w_j \right)^{2}$,并将其关于参数$\theta$做随机梯度下降\footnotemark[3]
\STATE 计算TD误差$\delta_j$,并根据TD误差更新优先级$p_j$
\STATE 计算$Q$的估计值,即$y_{j}$
\STATE 根据重要性采样权重调整损失 $L(\theta)=w_j\left(y_{j}-Q\left(s_{j}, a_{j} ; \theta\right)\right)^{2}$,并关于$\theta$做随机梯度下降
\STATE 每$C$步复制参数$\hat{Q}\leftarrow Q$
\ENDFOR
\STATE 每$C$个回合复制参数$\hat{\theta}\leftarrow\theta$\footnotemark[4]

\ENDFOR
\end{algorithmic}
\end{algorithm}
\footnotetext[1]{Playing Atari with Deep Reinforcement Learning}
\footnotetext[2]{$y_{i}= \begin{cases}r_{i} & \text {对于终止状态} s_{i+1} \\ r_{i}+\gamma \max _{a^{\prime}} Q\left(s_{i+1}, a^{\prime} ; \theta\right) & \text {对于非终止状态} s_{i+1}\end{cases}$}
\footnotetext[3]{$\theta_i \leftarrow \theta_i - \lambda \nabla_{\theta_{i}} L_{i}\left(\theta_{i}\right)$}
\footnotetext[4]{此处也可像原论文中放到小循环中改成每$C$步,但没有每$C$个回合稳定}

\clearpage


Expand Down
21 changes: 21 additions & 0 deletions pseudocodes/texput.log
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
This is XeTeX, Version 3.141592653-2.6-0.999994 (TeX Live 2022) (preloaded format=xelatex 2022.8.16) 20 AUG 2023 16:24
entering extended mode
restricted \write18 enabled.
%&-line parsing enabled.
**

! Emergency stop.
<*>

End of file on the terminal!


Here is how much of TeX's memory you used:
4 strings out of 476179
35 string characters out of 5813072
298507 words of memory out of 5000000
20858 multiletter control sequences out of 15000+600000
469259 words of font info for 28 fonts, out of 8000000 for 9000
1348 hyphenation exceptions out of 8191
0i,0n,0p,28b,6s stack positions out of 10000i,1000n,20000p,200000b,200000s
No pages of output.

0 comments on commit 1e62262

Please sign in to comment.