A question about DQN training being too slow

Tags: Statistics/Machine Learning · Python · Artificial Neural Networks · Reinforcement Learning

After going through the general idea of the DQN algorithm, I tried writing my own DQN to train on gym's 'SpaceInvaders-v0' environment, i.e. the Atari Space Invaders game.

Every action returns a reward, and I use that reward directly to update the network parameters. The environment's step function returns a (210, 160, 3) image, which I process with a CNN that has only convolutional layers and no pooling layers; the output layer uses softmax.

The network structure I used is shown in the code below.

Problem: training takes far too long and shows no progress at all. After a few dozen episodes the score still looks like a random number. Is my network structure wrong, or is my reward scheme wrong? Even after watching Morvan (莫烦)'s CNN videos I still feel lost and feel I understand CNNs too little. Any recommended learning material would be much appreciated.
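For reference, here is a quick sanity check of the spatial sizes that the two conv layers in the code below produce on a 210x160 grayscale frame. It is only a minimal sketch based on the kernel sizes, strides and VALID padding used in the code (the helper valid_out is just for illustration):

# VALID padding: out = (in - kernel) // stride + 1
def valid_out(size, kernel, stride):
    return (size - kernel) // stride + 1

h, w = 210, 160
h, w = valid_out(h, 8, 8), valid_out(w, 8, 8)   # conv_l1: 8x8 kernel, stride 8   -> 26 x 20
h, w = valid_out(h, 4, 2), valid_out(w, 1, 1)   # conv_l2: 4x1 kernel, stride 2x1 -> 12 x 20
print(h, w, h * w * 64)                          # 12 20 15360, matching the 12*20*64 reshape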


#run

# reward = 5, 10, 15, 30, 200
# observation = 210, 160, 3
# action '''0:stay          1:shoot
#           2:right_move    3:left_move
#           4:right_shoot   5:left_shoot'''

import gym
from time import sleep
import cv2
import numpy as np
from DeepQNetwork_wjk.SpaceInvader_cnn.SpaceInvaders_cnn import DeepQNetwork
from pykeyboard import *
import threading


env = gym.make('SpaceInvaders-v0')
env = env.unwrapped

print(env.action_space)             # check how many actions this environment provides
print(env.observation_space)        # check the observation space of this environment
# print(env.observation_space.high)   # maximum observation values
# print(env.observation_space.low)    # minimum observation values

# k = PyKeyboard()
#
#
# class TapRecord(PyKeyboardEvent):
#     def __init__(self):
#         PyKeyboardEvent.__init__(self)
#         self.list1 = [0, 1, 2, 3, 4, 5]
#
#     def tap(self, keycode, character, press):
#         global action
#         if press:
#             if keycode == 81:
#                 action = 5
#             elif keycode == 87:
#                 action = 1
#             elif keycode == 69:
#                 action = 4
#             elif keycode == 65:
#                 action = 3
#             elif keycode == 83:
#                 action = 0
#             elif keycode == 68:
#                 action = 2
#         print(action)
#
#
# t = TapRecord()

action = 4
#
# def game():
#     for episode in range(1):
#         observation = env.reset()
#         observation = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
#         # observation = observation.reshape(-1)
#         ep_r = 0
#
#         while True:
#             env.render()
#
#             observation_, reward, done, _ = env.step(action)
#             observation_ = cv2.cvtColor(observation_, cv2.COLOR_RGB2GRAY)
#
#             ep_r += reward
#
#             DQN.store_transition(observation, action, reward, observation_)
#
#             DQN.learn()
#
#             observation = observation_
#             # observation = observation.reshape(-1)
#
#             if done:
#                 DQN.Saver.save(DQN.sess, "model_hand/my-model", global_step=episode)
#                 print("episode{0},\nep_r:{1}".format(episode, ep_r))
#                 break
#             sleep(0.01)
#         t.stop()



observation_rgb = env.reset()
observation = cv2.cvtColor(observation_rgb, cv2.COLOR_RGB2GRAY)

DQN = DeepQNetwork(env.action_space.n,
                   observation.shape,
                   e_greedy_increment=1e-6,
                   memory_size=1000
                   )
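# Note (a rough back-of-the-envelope calculation, for reference): with
# e_greedy_increment=1e-6 the DeepQNetwork class below starts epsilon at 0 and adds
# 1e-6 per learn() call, so epsilon only reaches epsilon_max=0.9 after about
# 0.9 / 1e-6 = 900,000 learning steps.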

total_step = 0


# threads = []
# t1 = threading.Thread(target=t.run)
# threads.append(t1)
# t2 = threading.Thread(target=game)
# threads.append(t2)
#
# for i in range(len(threads)):
#     threads[i].start()
# for i in range(len(threads)):
#     threads[i].join()


# reinforcement learning training loop
for episode in range(100):
    observation = env.reset()
    observation = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
    # observation = observation.reshape(-1)
    ep_r = 0
    action = 4
    while True:
        env.render()
        total_step += 1

        action = DQN.choose_action(observation)

        observation_, reward, done, _ = env.step(action)
        observation_ = cv2.cvtColor(observation_, cv2.COLOR_RGB2GRAY)

        ep_r += reward

        DQN.store_transition(observation, action, reward, observation_)

        #if total_step > 1000:
        DQN.learn()



        observation = observation_
        # observation = observation.reshape(-1)

        if done:
            DQN.Saver.save(DQN.sess, "model_little/my-model",global_step=episode)
            print("episode{0},\nep_r:{1}".format(episode,ep_r))
            break

DQN.plot_cost()
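
# ---- DeepQNetwork class (presumably the SpaceInvaders_cnn module imported in the run script above) ----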

import numpy as np
import tensorflow as tf
import pandas as pd

np.random.seed(1)
tf.set_random_seed(1)

class DeepQNetwork:
    def __init__(self,
                 n_actions,
                 img_shape,
                 learning_rate=0.01,
                 reward_decay=0.9,
                 e_greedy=0.9,
                 replace_target_iter=300,
                 memory_size=500,
                 batch_size=32,
                 e_greedy_increment=None,
                 output_graph=False,
                 ):
        self.n_actions = n_actions
        self.img_shape = img_shape
        self.lr = learning_rate
        self.gamma = reward_decay
        self.epsilon_max = e_greedy
        self.replace_target_iter = replace_target_iter
        self.memory_size = memory_size
        self.batch_size = batch_size
        self.epsilon_increment = e_greedy_increment
        self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
        # if an increment is set, epsilon starts at 0 and grows toward epsilon_max,
        # so greedy actions are chosen more and more often as training goes on
        # total learning step
        self.learn_step_counter = 0
        self.__shape_len = 1  # flattened length of one observation, used to lay out the replay memory
        for i in img_shape:
            self.__shape_len *= i

        # initialize zero-filled memory: [observation, a, r, observation_]
        self.memory = np.zeros((self.memory_size, self.__shape_len * 2 + 2))  # observation + action + reward + observation_
        self.__built_net()
        model_file = tf.train.latest_checkpoint('model_hand/')
        self.Saver = tf.train.Saver(max_to_keep=2)
        self.sess = tf.Session()
        self.Saver.restore(self.sess, model_file)

        t_params = tf.get_collection('target_net_params')  # target-net parameters
        e_params = tf.get_collection('eval_net_params')    # eval-net parameters
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]



        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)

        self.sess.run(tf.global_variables_initializer())
        self.Saver = tf.train.Saver(max_to_keep=2)
        self.cost_his = []

    def __built_net(self):
        self.s = tf.placeholder(tf.float32, [None, self.__shape_len], name='s')
        self.ss = tf.reshape(self.s, [-1, self.img_shape[0], self.img_shape[1], 1])
        self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
        with tf.variable_scope('eval_net'):
            c_names, n_l1, w_initializer, b_initializer = \
                ['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
                tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)

            with tf.variable_scope('conv_l1'):
                w1 = tf.get_variable('w1', [8, 8, 1, 32], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [32], initializer=b_initializer, collections=c_names)
                conv1 = tf.nn.relu(
                    tf.nn.conv2d(self.ss, w1, strides=[1,8,8,1],padding="VALID") + b1
                )

            with tf.variable_scope('conv_l2'):
                w2 = tf.get_variable('w2', [4, 1, 32, 64], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [64], initializer=b_initializer, collections=c_names)
                conv2 = tf.nn.relu(
                    tf.nn.conv2d(conv1, w2, strides=[1,2,1,1], padding="VALID") + b2
                ) # output: 12 x 20 x 64
                conv2_flat = tf.reshape(conv2, [-1, 12*20*64])

            # with tf.variable_scope('conv_l3'):
            #     w3 = tf.get_variable('w2', [3, 3, 64, 64], initializer=w_initializer, collections=c_names)
            #     b3 = tf.get_variable('b2', [64], initializer=b_initializer, collections=c_names)
            #     conv3 = tf.nn.relu(
            #         tf.nn.conv2d(conv2, w3, strides=[1,1,1,1], padding="VALID") + b3
            #     )
            #     conv3_flat = tf.reshape(conv3, [-1, 4480])

            with tf.variable_scope('fc_l4'):
                w4 = tf.get_variable('w2', [12*20*64, 784], initializer=w_initializer, collections=c_names)
                b4 = tf.get_variable('b2', [784], initializer=b_initializer, collections=c_names)
                fc4 = tf.nn.relu(tf.matmul(conv2_flat, w4) + b4)

            with tf.variable_scope('fc_l5'):
                w5 = tf.get_variable('w2', [784, self.n_actions], initializer=w_initializer, collections=c_names)
                b5 = tf.get_variable('b2', [self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_eval = tf.nn.softmax(tf.matmul(fc4, w5) + b5)

        with tf.variable_scope('loss'): # q_target is the value that calculate by q_eval and the reward etc.
            self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))

        with tf.variable_scope('train'):
            self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)

        self.s_ = tf.placeholder(tf.float32, [None, self.__shape_len], name='s_')
        self.ss_ = tf.reshape(self.s_, [-1, self.img_shape[0], self.img_shape[1], 1])
        with tf.variable_scope('target_net'):
            c_names= ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]

            with tf.variable_scope('conv_l1'):
                w1 = tf.get_variable('w1', [8, 8, 1, 32], initializer=w_initializer, collections=c_names)
                b1 = tf.get_variable('b1', [32], initializer=b_initializer, collections=c_names)
                conv1 = tf.nn.relu(
                    tf.nn.conv2d(self.ss_, w1, strides=[1,8,8,1],padding="VALID") + b1
                )

            with tf.variable_scope('conv_l2'):
                w2 = tf.get_variable('w2', [4, 1, 32, 64], initializer=w_initializer, collections=c_names)
                b2 = tf.get_variable('b2', [64], initializer=b_initializer, collections=c_names)
                conv2 = tf.nn.relu(
                    tf.nn.conv2d(conv1, w2, strides=[1,2,1,1], padding="VALID") + b2
                )
                conv3_flat = tf.reshape(conv2, [-1, 12*20*64])

            # with tf.variable_scope('conv_l3'):
            #     w3 = tf.get_variable('w2', [3, 3, 64, 64], initializer=w_initializer, collections=c_names)
            #     b3 = tf.get_variable('b2', [64], initializer=b_initializer, collections=c_names)
            #     conv3 = tf.nn.relu(
            #         tf.nn.conv2d(conv2, w3, strides=[1,1,1,1], padding="VALID") + b3
            #     )
            #     conv3_flat = tf.reshape(conv3, [-1, 4480])

            with tf.variable_scope('fc_l4'):
                w4 = tf.get_variable('w2', [12*20*64, 784], initializer=w_initializer, collections=c_names)
                b4 = tf.get_variable('b2', [784], initializer=b_initializer, collections=c_names)
                fc4 = tf.nn.relu(tf.matmul(conv3_flat, w4) + b4)

            with tf.variable_scope('fc_l5'):
                w5 = tf.get_variable('w2', [784, self.n_actions], initializer=w_initializer, collections=c_names)
                b5 = tf.get_variable('b2', [self.n_actions], initializer=b_initializer, collections=c_names)
                self.q_next = tf.nn.softmax(tf.matmul(fc4, w5) + b5)

    def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):  # lazily create the counter on the first call
            self.memory_counter = 0

        s = s.reshape(-1)
        s_ = s_.reshape(-1)


        transition = np.hstack((s, [a, r], s_))
        # print(transition.shape)
        # memory_counter counts how many transitions have been stored;
        # memory_size is the capacity of the replay memory
        index = self.memory_counter % self.memory_size  # once full, overwrite old entries starting from index 0
        self.memory[index, :] = transition

        self.memory_counter += 1

    def choose_action(self, observation):
        observation = observation.reshape(-1)
        observation = observation[np.newaxis,:]
        # print("choose_action observation", observation)
        # observation = observation[np.newaxis,:,:,np.newaxis]
        # print("choose_action new_observation", observation)
        if np.random.uniform() < self.epsilon:
            # print("sess.run")
            action_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
            action = np.argmax(action_value)
            # print("this time my choose is", action)
        else:
            # print("else ")
            action = np.random.randint(0, self.n_actions)
        return action

    def _replace_target_params(self):
        t_parms = tf.get_collection('target_net_params')
        e_parms = tf.get_collection('eval_net_params')
        self.sess.run([tf.assign(t, e) for t, e in zip(t_parms, e_parms)])


    def learn(self):
        if self.learn_step_counter % self.replace_target_iter == 0:
            self.sess.run(self.replace_target_op)
            print('replaced the w,b in network')

        if self.memory_counter > self.memory_size:
            # the replay memory is full, so sample uniformly from the whole buffer
            sample_index = np.random.choice(self.memory_size, size=self.batch_size)
        else:
            # the memory is not full yet, so only sample from the transitions stored so far
            sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
        batch_memory = self.memory[sample_index, :]

        # print(batch_memory[:,0:self.n_features].shape)

        q_next, q_eval = self.sess.run(
            [self.q_next, self.q_eval],
            feed_dict={
                self.s_: batch_memory[:, -self.__shape_len:],
                self.s: batch_memory[:, :self.__shape_len],
            })

        q_target = q_eval.copy()

        batch_index = np.arange(self.batch_size, dtype=np.int32)  # row indices into the batch
        eval_act_index = batch_memory[:, self.__shape_len].astype(int)  # the action taken in each stored transition
        reward = batch_memory[:, self.__shape_len + 1]  # the reward of each stored transition

        q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
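        # Note: only the entries for the actions actually taken were changed above to the
        # Bellman target r + gamma * max_a' Q_target(s', a'); the other actions keep q_eval's
        # values, so they contribute zero error to the squared loss minimized below.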

        _, self.cost = self.sess.run([self._train_op, self.loss],
                                    feed_dict={self.s: batch_memory[:, :self.__shape_len],
                                               self.q_target: q_target})
        self.cost_his.append(self.cost)

        self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
        self.learn_step_counter += 1

    def plot_cost(self):
        import matplotlib.pyplot as plt
        plt.plot(np.arange(len(self.cost_his)), self.cost_his)
        plt.ylabel('Cost')
        plt.xlabel('training steps')
        plt.show()
 

