DQN_v2.py
856 lines (677 loc) · 35.3 KB
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 29 22:54:51 2018
@author: koushik
"""
import os
import numpy as np
import tensorflow as tf
import pylab as pl
from operator import itemgetter
from matplotlib import cm
import matplotlib.pyplot as plt
import pandas as pd
from math import cos, sin, pi, exp
import random
''' import network module '''
from algorithms import Station_map,STA_clustering
#################################################################################################
# input data dimensions for the state fed to the neural network
length = 30
width = 1
state_shape = [1, length] # we consider a single-array input
#########################################################################################
# environment class providing the state, reward and network interaction
class Wifi_NW():
"""
Wi-Fi network environment: given the chosen STA-to-AP mapping (the action), it runs an
NS-3 simulation, computes the reward from the resulting throughput, and returns the
new state together with the network parameters.
"""
def __init__(self,STA_xloc,STA_yloc,AP_loc,area,nSTAs,nAPs,sim_Time,visualize,MCS,actions):
self.STA_xloc =STA_xloc;
self.STA_yloc =STA_yloc;
self.AP_loc = AP_loc;
self.area = area;
self.nSTAs = nSTAs;
self.nAPs = nAPs;
self.sim_Time = sim_Time;
self.visualize=visualize;
self.MCS = MCS;
self.actions=actions;
def send_to_nw(self,action,DL,UL):
"""
Input: 1. action --> index into self.actions giving the STA-to-AP map
2. DL --> downlink rates of the STAs
3. UL --> uplink rates of the STAs
Output: new state and reward
"""
os.chdir('/home/koushik/ns-3/ns-3.28/') # set the NS-3 directory path (machine-specific)
STA_map = self.actions[action];
filenum=5;# NS3 file number
inputs = {
'AP_xloc' : pd.Series(self.AP_loc[:,0]),
'AP_yloc' : pd.Series(self.AP_loc[:,1]),
'STA_xloc': pd.Series(self.STA_xloc),
'STA_yloc': pd.Series(self.STA_yloc),
'STA_map' : pd.Series(STA_map),
'UL_Rate' : pd.Series(UL),
'DL_Rate' : pd.Series(DL)
}
df1 = pd.DataFrame(inputs)
df1.to_csv('inputs.csv')
### Pass network parameters to NS3 and run network simulation
if self.visualize:
os.system('./waf --pyrun "scratch/NW_%s.py --nAPs=%s --nSTAs=%s --area=%s --simulationTime=%s --MCS=%s" --vis '
%(str(filenum),str(self.nAPs),str(self.nSTAs),str(self.area),str(self.sim_Time),str(self.MCS)))
else:
os.system('./waf --pyrun "scratch/NW_%s.py --nAPs=%s --nSTAs=%s --area=%s --simulationTime=%s --MCS=%s" '
%(str(filenum),str(self.nAPs),str(self.nSTAs),str(self.area),str(self.sim_Time),str(self.MCS)))
return self._get_state(action),self._get_reward()
def _get_reward(self):
df = pd.read_csv('NW_th.csv') # read from the results stored in csv file
Umean = np.mean(df.values[:,-1]);
Dmean = np.mean(df.values[:,-2]);
Usum = np.sum(df.values[:,-1]);
Dsum = np.sum(df.values[:,-2]);
return (Dsum+Usum) # aggregated throughput: Uplink throughput + Downlink Throughput
def _get_state(self,action):
state = action;
return state;
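'''
Illustrative usage sketch of the environment above (not executed). The rates and the
action index below are placeholder assumptions, not values from this project:
env = Wifi_NW(STA_xloc, STA_yloc, AP_loc, area, nSTAs, nAPs, sim_Time, visualize, MCS, actions)
DL = np.ones(nSTAs) * 2.0 # hypothetical per-STA downlink rates (Mbps)
UL = np.ones(nSTAs) * 1.0 # hypothetical per-STA uplink rates (Mbps)
new_state, reward = env.send_to_nw(action=0, DL=DL, UL=UL)
'''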
################################################################################################
''' Experience Replay '''
'''
In state s the node takes action a, receives reward r and moves to state s'. This transition is
stored in a memory called the experience replay as the tuple <s, a, r, s'>. When required, the node
samples a batch of B previous experiences and uses it to train the deep neural network, which
improves the learning performance.
'''
class ReplayMemory:
def __init__(self, model_dir,num_actions,batch_size=128,memory_size=10000):
self.model_dir = model_dir
self.num_actions=num_actions
self.memory_size = memory_size
self.actions = np.empty(shape = [self.memory_size], dtype = np.uint8)
self.rewards = np.empty(shape = [self.memory_size], dtype = np.float64)
self.prestate = np.empty(shape = [self.memory_size]+state_shape, dtype = np.float16)
self.poststate = np.empty(shape = [self.memory_size]+state_shape, dtype = np.float16)
self.q_values = np.zeros(shape=[self.memory_size, self.num_actions], dtype=float)
self.batch_size = batch_size
self.count = 0
self.current = 0
''' add the current state, next state, Q-values of the next state, reward and action to the replay memory '''
def add(self, prestate, Q_st,poststate, reward, action):
self.actions[self.current] = action
self.rewards[self.current] = reward
self.prestate[self.current] = prestate
self.poststate[self.current] = poststate
self.q_values[self.current] = Q_st
self.count = max(self.count, self.current + 1)
self.current = (self.current + 1) % self.memory_size
''' draw a sample of (at most) batch_size transitions for updating the learning model '''
def sample(self):
B_range = min(self.count,self.batch_size)
indexes = np.random.choice(range(self.count),B_range,replace=False)
prestate = self.prestate[indexes]
poststate = self.poststate[indexes]
actions = self.actions[indexes]
rewards = self.rewards[indexes]
Qs = self.q_values[indexes]
return prestate, Qs,poststate, actions, rewards
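'''
Minimal sketch of the add/sample cycle above (values are assumptions for illustration only):
memory = ReplayMemory(model_dir='.', num_actions=4, batch_size=2, memory_size=100)
s_t = np.zeros(state_shape) # previous state
s_tp1 = np.zeros(state_shape) # next state
q_tp1 = np.zeros(4) # Q-values of the next state
memory.add(s_t, q_tp1, s_tp1, reward=1.0, action=2)
memory.add(s_t, q_tp1, s_tp1, reward=0.5, action=1)
prestates, Qs, poststates, acts, rews = memory.sample()
'''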
###################################################################################################
'''
LinearControlSignal linearly interpolates a tuning parameter (e.g. the learning rate) between a
start value and an end value, based on the current iteration and the total number of iterations.
'''
class LinearControlSignal:
"""
A control signal that changes linearly over time.
This is used to change e.g. the learning-rate for the optimizer
of the Neural Network, as well as other parameters.
TensorFlow has functionality for doing this, but it uses the
global_step counter inside the TensorFlow graph, while we
want the control signals to use a state-counter for the
game-environment. So it is easier to make this in Python.
"""
def __init__(self, start_value, end_value, num_iterations, repeat=False):
"""
Create a new object.
:param start_value:
Start-value for the control signal.
:param end_value:
End-value for the control signal.
:param num_iterations:
Number of iterations it takes to reach the end_value
from the start_value.
:param repeat:
Boolean whether to reset the control signal back to the start_value
after the end_value has been reached.
"""
# Store arguments in this object.
self.start_value = start_value
self.end_value = end_value
self.num_iterations = num_iterations
self.repeat = repeat
# Calculate the linear coefficient.
self._coefficient = (end_value - start_value) / num_iterations
def get_value(self, iteration):
"""Get the value of the control signal for the given iteration."""
if self.repeat:
iteration %= self.num_iterations
if iteration < self.num_iterations:
value = iteration * self._coefficient + self.start_value
else:
value = self.end_value
return value
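'''
Worked example of the linear schedule above: with start_value=1e-3, end_value=1e-5 and
num_iterations=100, the coefficient is (1e-5 - 1e-3)/100, so get_value(50) returns
1e-3 + 50*(1e-5 - 1e-3)/100 = 5.05e-4, and get_value(200) returns 1e-5 (unless repeat=True).
lr_schedule = LinearControlSignal(start_value=1e-3, end_value=1e-5, num_iterations=100)
lr_schedule.get_value(iteration=50) # -> 0.000505
lr_schedule.get_value(iteration=200) # -> 1e-5
'''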
##########################################################################################
class NeuralNetwork:
"""
Creates a Neural Network for Reinforcement Learning (Q-Learning).
Functions are provided for estimating Q-values from states of the
game-environment, and for optimizing the Neural Network so it becomes
better at estimating the Q-values.
"""
def __init__(self, num_actions, replay_memory, use_pretty_tensor=False):
"""
:param num_actions:
Number of discrete actions for the game-environment.
:param replay_memory:
Object-instance of the ReplayMemory-class.
:param use_pretty_tensor:
Boolean whether to use PrettyTensor (True) which must then be
installed, or use the tf.layers API (False) which is already
built into TensorFlow.
"""
self.num_actions=num_actions
# Whether to use the PrettyTensor API (True) or tf.layers (False).
self.use_pretty_tensor = use_pretty_tensor
# Replay-memory used for sampling random batches.
self.replay_memory = replay_memory
# Path for saving/restoring checkpoints.
#self.checkpoint_path = os.path.join(checkpoint_dir, "checkpoint")
# Placeholder variable for inputting states into the Neural Network.
# A state is a multi-dimensional array holding image-frames from
# the game-environment.
self.x = tf.placeholder(dtype=tf.float32, shape=[None] + state_shape, name='x')
# Placeholder variable for inputting the learning-rate to the optimizer.
self.learning_rate = tf.placeholder(dtype=tf.float32, shape=[])
# Placeholder variable for inputting the target Q-values
# that we want the Neural Network to be able to estimate.
self.q_values_new = tf.placeholder(tf.float32,
shape=[None, self.num_actions],
name='q_values_new')
# This is a hack that allows us to save/load the counter for
# the number of states processed in the game-environment.
# We will keep it as a variable in the TensorFlow-graph
# even though it will not actually be used by TensorFlow.
self.count_states = tf.Variable(initial_value=0,
trainable=False, dtype=tf.int64,
name='count_states')
# Similarly, this is the counter for the number of episodes.
self.count_episodes = tf.Variable(initial_value=0,
trainable=False, dtype=tf.int64,
name='count_episodes')
# TensorFlow operation for increasing count_states.
self.count_states_increase = tf.assign(self.count_states,
self.count_states + 1)
# TensorFlow operation for increasing count_episodes.
self.count_episodes_increase = tf.assign(self.count_episodes,
self.count_episodes + 1)
# The Neural Network will be constructed in the following.
# Note that the architecture of this Neural Network is very
# different from that used in the original DeepMind papers,
# which was something like this:
# Input image: 84 x 84 x 4 (4 gray-scale images of 84 x 84 pixels).
# Conv layer 1: 16 filters 8 x 8, stride 4, relu.
# Conv layer 2: 32 filters 4 x 4, stride 2, relu.
# Fully-conn. 1: 256 units, relu. (Sometimes 512 units).
# Fully-conn. 2: num-action units, linear.
# The DeepMind architecture does a very aggressive downsampling of
# the input images so they are about 10 x 10 pixels after the final
# convolutional layer. I found that this resulted in significantly
# distorted Q-values when using the training method further below.
# The reason DeepMind could get it working was perhaps that they
# used a very large replay memory (5x as big as here), and a single
# optimization iteration was performed after each step of the game,
# and some more tricks.
# Initializer for the layers in the Neural Network.
# If you change the architecture of the network, particularly
# if you add or remove layers, then you may have to change
# the stddev-parameter here. The initial weights must result
# in the Neural Network outputting Q-values that are very close
# to zero - but the network weights must not be too low either
# because it will make it hard to train the network.
# You can experiment with values between 1e-2 and 1e-3.
init = tf.truncated_normal_initializer(mean=0.0, stddev=2e-2)
if self.use_pretty_tensor:
# This builds the Neural Network using the PrettyTensor API,
# which is a very elegant builder API, but some people are
# having problems installing and using it.
import prettytensor as pt
# Wrap the input to the Neural Network in a PrettyTensor object.
x_pretty = pt.wrap(self.x)
# Create the convolutional Neural Network using Pretty Tensor.
with pt.defaults_scope(activation_fn=tf.nn.relu):
self.q_values = x_pretty. \
conv2d(kernel=3, depth=20, stride=1, name='layer_conv1', weights=init). \
conv2d(kernel=2, depth=40, stride=1, name='layer_conv2', weights=init). \
flatten().\
fully_connected(size=180, name='layer_fc3', weights=init). \
fully_connected(size=self.num_actions, name='layer_fc_out', weights=init,
activation_fn=None)
# Loss-function which must be optimized. This is the mean-squared
# error between the Q-values that are output by the Neural Network
# and the target Q-values.
self.loss = self.q_values.l2_regression(target=self.q_values_new)
else:
# This builds the Neural Network using the tf.layers API,
# which is very verbose and inelegant, but should work for everyone.
# Note that the checkpoints for Tutorial #16 which can be
# downloaded from the internet only support PrettyTensor.
# Although the Neural Networks appear to be identical when
# built using the PrettyTensor and tf.layers APIs,
# they actually create somewhat different TensorFlow graphs
# where the variables have different names, which means the
# checkpoints are incompatible for the two builder APIs.
# Activation function for all fully-connected layers, except the last.
activation = tf.nn.relu
# Reference to the most recently added layer of the Neural Network.
# This makes it easy to add or remove layers.
net = self.x
# Flatten the [batch, 1, length] input so it can be fed to a
# fully-connected (aka. dense) layer; without this, tf.layers.dense
# acts only on the last axis and the loss below broadcasts incorrectly.
net = tf.contrib.layers.flatten(net)
# First fully-connected layer.
net = tf.layers.dense(inputs=net, name='layer_fc4', units=180,
kernel_initializer=init, activation=activation)
# Final fully-connected layer.
net = tf.layers.dense(inputs=net, name='layer_fc_out', units=num_actions,
kernel_initializer=init, activation=None)
# The output of the Neural Network is the estimated Q-values
# for each possible action in the game-environment.
self.q_values = net
# TensorFlow has a built-in loss-function for doing regression:
# self.loss = tf.nn.l2_loss(self.q_values - self.q_values_new)
# But it uses tf.reduce_sum() rather than tf.reduce_mean()
# which is used by PrettyTensor. This means the scale of the
# gradient is different and hence the hyper-parameters
# would have to be re-tuned. So instead we calculate the
# L2-loss similarly to how it is done in PrettyTensor.
squared_error = tf.square(self.q_values - self.q_values_new)
sum_squared_error = tf.reduce_sum(squared_error, axis=1)
self.loss = tf.reduce_mean(sum_squared_error)
#self.loss = self.q_values.l2_regression(target=self.q_values_new)
# Optimizer used for minimizing the loss-function.
# Note the learning-rate is a placeholder variable so we can
# lower the learning-rate as optimization progresses.
self.optimizer = tf.train.RMSPropOptimizer(learning_rate=self.learning_rate).minimize(self.loss)
# Used for saving and loading checkpoints.
#self.saver = tf.train.Saver()
# Create a new TensorFlow session so we can run the Neural Network.
self.session = tf.Session()
# Load the most recent checkpoint if it exists,
# otherwise initialize all the variables in the TensorFlow graph.
self.load_checkpoint()
def close(self):
"""Close the TensorFlow session."""
self.session.close()
def load_checkpoint(self):
"""
Checkpoint loading is disabled in this version, so this simply
initializes all variables in the TensorFlow graph.
"""
self.session.run(tf.global_variables_initializer())
def save_checkpoint(self, current_iteration):
"""Save all variables of the TensorFlow graph to a checkpoint.
Note: this relies on self.saver and self.checkpoint_path, which are
commented out in __init__, so the method is currently unused."""
self.saver.save(self.session,
save_path=self.checkpoint_path,
global_step=current_iteration)
print("Saved checkpoint.")
def get_q_values(self, states):
"""
Calculate and return the estimated Q-values for the given states.
Here a state is a [1, length] array holding the recent action history
(see Agent.run), rather than the stacked image-frames used in the
image-based tutorial this code is adapted from.
The input is an array of such states, which allows batch-processing,
so its shape is [batch] + state_shape.
The output is an array of Q-value arrays, one Q-value for each possible
action, with shape [batch, num_actions].
"""
# Create a feed-dict for inputting the states to the Neural Network.
feed_dict = {self.x: states}
# Use TensorFlow to calculate the estimated Q-values for these states.
values = self.session.run(self.q_values, feed_dict=feed_dict)
return values
def optimize(self, min_epochs=1.0, max_epochs=10,
batch_size=128, loss_limit=0.015,
learning_rate=1e-2,epsilon=0.3):
"""
Optimize the Neural Network by sampling states and Q-values
from the replay-memory.
The original DeepMind paper performed one optimization iteration
after processing each new state of the game-environment. This is
an un-natural way of doing optimization of Neural Networks.
So instead we perform a full optimization run every time the
Replay Memory is full (or it is filled to the desired fraction).
This also gives more efficient use of a GPU for the optimization.
The problem is that this may over-fit the Neural Network to whatever
is in the replay-memory. So we use several tricks to try and adapt
the number of optimization iterations.
:param min_epochs:
Minimum number of optimization epochs. One epoch corresponds
to the replay-memory being used once. However, as the batches
are sampled randomly and biased somewhat, we may not use the
whole replay-memory. This number is just a convenient measure.
:param max_epochs:
Maximum number of optimization epochs.
:param batch_size:
Size of each random batch sampled from the replay-memory.
:param loss_limit:
Optimization continues until the average loss-value of the
last 100 batches is below this value (or max_epochs is reached).
:param learning_rate:
Learning-rate to use for the optimizer.
:param epsilon:
Used below as the discount factor when forming the Q-learning targets.
"""
# print("Optimizing Neural Network to better estimate Q-values ...")
# print("\tLearning-rate: {0:.1e}".format(learning_rate))
# print("\tLoss-limit: {0:.3f}".format(loss_limit))
# print("\tMax epochs: {0:.1f}".format(max_epochs))
# Prepare the probability distribution for sampling the replay-memory.
#self.replay_memory.prepare_sampling_prob(batch_size=batch_size)
# Number of optimization iterations corresponding to one epoch.
#iterations_per_epoch = self.replay_memory.num_used / batch_size
iterations_per_epoch = 1;
# Minimum number of iterations to perform.
min_iterations = int(iterations_per_epoch * min_epochs)
# Maximum number of iterations to perform.
max_iterations = 50#int(iterations_per_epoch * max_epochs)
# Buffer for storing the loss-values of the most recent batches.
loss_history = np.zeros(self.num_actions, dtype=float)
for i in range(max_iterations):
# Randomly sample a batch of transitions from the replay-memory:
# previous states, Q-values of the next states, next states,
# actions taken and rewards received.
state_batch, Q_tp1, STp1_batch, actions, rewards = self.replay_memory.sample()
# Build the target Q-values using the Q-learning update:
# target_q[b, a_b] = r_b + discount * max_a' Q(s'_b, a')
q_values_batch = np.zeros(shape=[len(state_batch), self.num_actions], dtype=float)
self.discount = epsilon
# Note: the inner loop index must not shadow the outer iteration
# counter i, which is also used in the stopping criterion below.
for b in range(len(state_batch)):
q_values_batch[b, actions[b]] = rewards[b] + self.discount*np.max(Q_tp1[b])
# Create a feed-dict for inputting the data to the TensorFlow graph.
# Note that the learning-rate is also in this feed-dict.
#loss_val = np.zeros(len(state_batch),dtype=float)
#for i in range(len(state_batch)):
feed_dict = {self.x: state_batch,
self.q_values_new: q_values_batch,
self.learning_rate: learning_rate}
# Perform one optimization step and get the loss-value.
loss_val, _ = self.session.run([self.loss, self.optimizer],
feed_dict=feed_dict)
# Shift the loss-history and assign the new value.
# This causes the loss-history to only hold the most recent values.
loss_history = np.roll(loss_history, 1)
loss_history[0] = loss_val
# Calculate the average loss for the previous batches.
loss_mean = np.mean(loss_history)
# Print status.
#pct_epoch = i / iterations_per_epoch
#msg = "\tBatch loss: {2:.4f}, Mean loss: {3:.4f}"
#msg = msg.format(loss_val, loss_mean)
# Stop the optimization if we have performed the required number
# of iterations and the loss-value is sufficiently low.
if i > min_iterations and loss_mean < loss_limit:
break
return loss_val, loss_mean
def get_weights_variable(self, layer_name):
"""
Return the variable inside the TensorFlow graph for the weights
in the layer with the given name.
Note that the actual values of the variables are not returned,
you must use the function get_variable_value() for that.
"""
if self.use_pretty_tensor:
# PrettyTensor uses this name for the weights in a conv-layer.
variable_name = 'weights'
else:
# The tf.layers API uses this name for the weights in a conv-layer.
variable_name = 'kernel'
with tf.variable_scope(layer_name, reuse=True):
variable = tf.get_variable(variable_name)
return variable
def get_variable_value(self, variable):
"""Return the value of a variable inside the TensorFlow graph."""
weights = self.session.run(variable)
return weights
def get_layer_tensor(self, layer_name):
"""
Return the tensor for the output of a layer.
Note that this does not return the actual values,
but instead returns a reference to the tensor
inside the TensorFlow graph. Use get_tensor_value()
to get the actual contents of the tensor.
"""
# The name of the last operation of a layer,
# assuming it uses Relu as the activation-function.
tensor_name = layer_name + "/Relu:0"
# Get the tensor with this name.
tensor = tf.get_default_graph().get_tensor_by_name(tensor_name)
return tensor
def get_tensor_value(self, tensor, state):
"""Get the value of a tensor in the Neural Network."""
# Create a feed-dict for inputting the state to the Neural Network.
feed_dict = {self.x: [state]}
# Run the TensorFlow session to calculate the value of the tensor.
output = self.session.run(tensor, feed_dict=feed_dict)
return output
def get_count_states(self):
"""
Get the number of states that have been processed in the game-environment.
This is not used by the TensorFlow graph. It is just a hack to save and
reload the counter along with the checkpoint-file.
"""
return self.session.run(self.count_states)
def get_count_episodes(self):
"""
Get the number of episodes that have been processed in the game-environment.
"""
return self.session.run(self.count_episodes)
def increase_count_states(self):
"""
Increase the number of states that have been processed
in the game-environment.
"""
return self.session.run(self.count_states_increase)
def increase_count_episodes(self):
"""
Increase the number of episodes that have been processed
in the game-environment.
"""
return self.session.run(self.count_episodes_increase)
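'''
Sketch of how the Q-network above is typically used (the values here are assumptions for
illustration, not project data): build it on top of a ReplayMemory, query Q-values for a
batch of states, and call optimize() once transitions have been stored.
memory = ReplayMemory(model_dir='.', num_actions=4, batch_size=128)
q_net = NeuralNetwork(num_actions=4, replay_memory=memory)
q_vals = q_net.get_q_values(states=[np.zeros(state_shape)]) # shape: [1, num_actions]
loss_val, loss_mean = q_net.optimize(learning_rate=1e-3, epsilon=0.3)
'''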
########################################################################
class EpsilonGreedy:
"""
The epsilon-greedy policy either takes a random action with
probability epsilon, or it takes the action for the highest
Q-value.
If epsilon is 1.0 then the actions are always random.
If epsilon is 0.0 then the actions are always argmax for the Q-values.
Epsilon is typically decreased linearly from 1.0 to 0.1
and this is also implemented in this class.
During testing, epsilon is usually chosen lower, e.g. 0.05 or 0.01
"""
def __init__(self, num_actions,
epsilon_testing=0.05,
num_iterations=1e6,
start_value=1.0, end_value=0.1,
repeat=False):
"""
:param num_actions:
Number of possible actions in the game-environment.
:param epsilon_testing:
Epsilon-value when testing.
:param num_iterations:
Number of training iterations required to linearly
decrease epsilon from start_value to end_value.
:param start_value:
Starting value for linearly decreasing epsilon.
:param end_value:
Ending value for linearly decreasing epsilon.
:param repeat:
Boolean whether to repeat and restart the linear decrease
when the end_value is reached, or only do it once and then
output the end_value forever after.
"""
# Store parameters.
self.num_actions = num_actions
self.epsilon_testing = epsilon_testing
# Create a control signal for linearly decreasing epsilon.
self.epsilon_linear = LinearControlSignal(num_iterations=num_iterations,
start_value=start_value,
end_value=end_value,
repeat=repeat)
def get_epsilon(self, iteration, training):
"""
Return the epsilon for the given iteration.
If training==True then epsilon is linearly decreased,
otherwise epsilon is a fixed number.
"""
if training:
epsilon = self.epsilon_linear.get_value(iteration=iteration)
else:
epsilon = self.epsilon_testing
return epsilon
def get_action(self, q_values, iteration, training):
"""
Use the epsilon-greedy policy to select an action.
:param q_values:
These are the Q-values that are estimated by the Neural Network
for the current state of the game-environment.
:param iteration:
This is an iteration counter. Here we use the number of states
that have been processed in the game-environment.
:param training:
Boolean whether we are training or testing the
Reinforcement Learning agent.
:return:
action (integer), epsilon (float)
"""
epsilon = self.get_epsilon(iteration=iteration, training=training)
# With probability epsilon.
if np.random.random() < epsilon:
# Select a random action.
action = np.random.randint(low=0, high=self.num_actions)
else:
# Otherwise select the action that has the highest Q-value.
action = np.argmax(q_values)
return action, epsilon
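'''
Brief example of the epsilon-greedy policy above (the numbers are illustrative only): early in
training epsilon is close to 1.0, so actions are almost always random; late in training it is
close to end_value, so the argmax action is chosen most of the time.
policy = EpsilonGreedy(num_actions=4, num_iterations=1000, start_value=1.0, end_value=0.1)
q = np.array([0.1, 0.9, 0.3, 0.2])
action, eps = policy.get_action(q_values=q, iteration=10, training=True) # eps ~ 0.99
action, eps = policy.get_action(q_values=q, iteration=900, training=True) # eps ~ 0.19, usually action 1
'''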
########################################################################
class Agent:
def __init__(self,UL_R,DL_R,STA_xloc,STA_yloc,AP_loc,area,nSTAs,nAPs,sim_Time,visualize,MCS,n_actions,actions,epoch,epoch1,model_dir,render):
# self.STA_xloc = STA_xloc;
# self.STA_yloc = STA_yloc;
# self.AP_loc = AP_loc;
# self.area= area;
# self.nSTAs = nSTAs;
# self.nAPs = nAPs;
# self.sim_Time=sim_Time;
# self.visualize = visualize;
# self.MCS = MCS;
self.num_actions = n_actions;
#self.actions = actions;
self.DL = DL_R; # download bit rate
self.UL = UL_R; # Upload bit rate
''' set up the Wi-Fi network environment '''
self.env = Wifi_NW(STA_xloc,STA_yloc,AP_loc,area,nSTAs,nAPs,sim_Time,visualize,MCS,actions)
self.render = render;
self.epoch=epoch
self.epoch1=epoch1
self.training=True
self.epsilon_greedy=EpsilonGreedy(start_value=1.0,
end_value=0.01,
num_iterations = self.epoch,
num_actions=self.num_actions,
epsilon_testing=0.01)
self.learning_rate_control = LinearControlSignal(start_value=1e-3,
end_value=1e-5,
num_iterations = self.epoch)
''' Replay memory initialization '''
self.ReplayMemory = ReplayMemory(model_dir,self.num_actions,batch_size=256,memory_size=10000)
''' Neural Network model initialization '''
self.model = NeuralNetwork(num_actions=self.num_actions,
replay_memory=self.ReplayMemory)
''' Run the DQN agent on the Wi-Fi network environment '''
def run(self,fig,ax):
self.update_count=0;
# initialize the reward
R1 = []#np.array([[0.0 for i in range(self.epoch)] for i in range(self.epoch1)])
N_pos=30;
state_hist=[0]*length # history of chosen actions, used as the state input
''' initialize the state history so the very first state input is complete '''
input_t = np.resize(state_hist,(1,length)) # each state input is of shape (1,length)
for e1 in range(self.epoch1):
for e in range(self.epoch-1):
''' input at t-1 time slot '''
input_tm1 = input_t
''' estimate the Q-values for the current state input with the learned model,
then select an action with the epsilon-greedy policy '''
q_values = self.model.get_q_values(states=[input_tm1])
action, epsilon = self.epsilon_greedy.get_action(q_values=q_values,
iteration=e,
training=self.training)
''' obtain the reward and the new state from the network for the chosen action '''
pos_t,reward = self.env.send_to_nw(action,self.DL[e,:],self.UL[e,:])#(action=action)
R1.append(reward);
''' update state history '''
state_hist = np.append(state_hist,pos_t)
''' construct the state input from the updated state history '''
input_t = np.resize(state_hist[-N_pos:],(1,length))
''' determine the Q value for input_t '''
Q_tp1 = self.model.get_q_values(states=[input_t])[0]
''' add the data to the replay memory for future batch training of the neural network '''
self.ReplayMemory.add(input_tm1,Q_tp1,input_t,reward,action)
''' update the learning rate every iteration: it starts high so the model adapts quickly,
then decreases linearly as training progresses '''
learning_rate = self.learning_rate_control.get_value(iteration=e)
''' Retrain the Neural Network learning model based on the past state, reward, actions '''
loss_val, loss_mean = self.model.optimize(learning_rate=learning_rate, epsilon=epsilon)
if self.epoch%1==0: # print progress every iteration
print("\rEpoch {:03d}/{:03d} | Batch Loss {:.4f} | Mean Loss {:.4f}".format(e+1,self.epoch,loss_val, loss_mean),end='\r')
msg = "Epsilon: {:.4f}| Reward: {:.4f} | action:{}"
#print(msg.format(epsilon,reward,action),end='\n')
''' plot the data at each epoch e1 '''
ax.clear()
ax.plot(R1)
fig.canvas.draw()
plt.grid();
plt.legend(['Throughput_aggr'],loc=3)
plt.ylabel('Mbps')
plt.xlabel('Timeslot')
plt.show()
return R1
''' custom printing code
x=['=']
for i in range(10):
print('\r{}'.format(''.join((x*(i+1))+['>'])),end='\r')
time.sleep(2)
'''
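'''
End-to-end usage sketch (every value and variable name below is a placeholder assumption,
not project data): construct the Agent and run it against the NS-3 backed environment.
fig, ax = plt.subplots()
agent = Agent(UL_R=UL_rates, DL_R=DL_rates,
STA_xloc=STA_x, STA_yloc=STA_y, AP_loc=AP_xy,
area=100, nSTAs=10, nAPs=2, sim_Time=5, visualize=False, MCS=7,
n_actions=len(actions), actions=actions,
epoch=200, epoch1=1, model_dir='.', render=False)
rewards = agent.run(fig, ax)
'''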