After going through the rough outline of the DQN algorithm, I tried writing my own DQN to train gym's 'SpaceInvaders-v0' (the game shown below).
Each action returns a reward, and I use that reward directly to update the network parameters. The environment's step function returns a (210, 160, 3) image, which I process with a CNN that has only convolutional layers and no pooling layers; the output layer uses softmax.
The network structure I use is shown in the figure below.
Problem: training takes far too long and shows no progress at all. After a few dozen episodes the score still looks like a random number. Is my network structure wrong, or is my reward scheme wrong? I've watched Morvan's (莫烦) CNN videos and still feel lost; I clearly know too little about CNNs. If anyone can recommend good learning material, that would be great.
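For reference, here is a minimal sketch (assuming the same gym 'SpaceInvaders-v0' API my code below relies on) that just confirms the observation shape, the per-step reward, and the grayscale preprocessing I describe:

import gym
import cv2

env = gym.make('SpaceInvaders-v0')
obs = env.reset()
print(obs.shape)  # (210, 160, 3) RGB frame
obs_, reward, done, info = env.step(env.action_space.sample())  # take a random action
gray = cv2.cvtColor(obs_, cv2.COLOR_RGB2GRAY)  # (210, 160) grayscale, same preprocessing as below
print(gray.shape, reward, done)
env.close()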
#run
# reward = 5, 10, 15, 30, 200
# observation = 210, 160, 3
# actions: 0: stay, 1: shoot, 2: move right,
#          3: move left, 4: move right + shoot, 5: move left + shoot
import gym
from time import sleep
import cv2
import numpy as np
from DeepQNetwork_wjk.SpaceInvader_cnn.SpaceInvaders_cnn import DeepQNetwork
from pykeyboard import *
import threading
env = gym.make('SpaceInvaders-v0')
env = env.unwrapped
print(env.action_space)       # inspect the actions available in this environment
print(env.observation_space)  # inspect the observation space of this environment
# print(env.observation_space.high)   # inspect the maximum observation values
# print(env.observation_space.low)    # inspect the minimum observation values
# k = PyKeyboard()
#
#
# class TapRecord(PyKeyboardEvent):
# def __init__(self):
# PyKeyboardEvent.__init__(self)
# self.list1 = [0, 1, 2, 3, 4, 5]
#
# def tap(self, keycode, character, press):
# global action
# if press:
# if keycode == 81:
# action = 5
# elif keycode == 87:
# action = 1
# elif keycode == 69:
# action = 4
# elif keycode == 65:
# action = 3
# elif keycode == 83:
# action = 0
# elif keycode == 68:
# action = 2
# print(action)
#
#
# t = TapRecord()
action = 4
#
# def game():
# for episode in range(1):
# observation = env.reset()
# observation = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
# # observation = observation.reshape(-1)
# ep_r = 0
#
# while True:
# env.render()
#
# observation_, reward, done, _ = env.step(action)
# observation_ = cv2.cvtColor(observation_, cv2.COLOR_RGB2GRAY)
#
# ep_r += reward
#
# DQN.store_transition(observation, action, reward, observation_)
#
# DQN.learn()
#
# observation = observation_
# # observation = observation.reshape(-1)
#
# if done:
# DQN.Saver.save(DQN.sess, "model_hand/my-model", global_step=episode)
# print("episode{0},\nep_r:{1}".format(episode, ep_r))
# break
# sleep(0.01)
# t.stop()
observation_rgb = env.reset()
observation = cv2.cvtColor(observation_rgb, cv2.COLOR_RGB2GRAY)
DQN = DeepQNetwork(env.action_space.n,
observation.shape,
e_greedy_increment=1e-6,
memory_size=1000
)
total_step = 0
# threads = []
# t1 = threading.Thread(target=t.run)
# threads.append(t1)
# t2 = threading.Thread(target=game)
# threads.append(t2)
#
# for i in range(len(threads)):
# threads[i].start()
# for i in range(len(threads)):
# threads[i].join()
# reinforcement learning training loop
for episode in range(100):
observation = env.reset()
observation = cv2.cvtColor(observation, cv2.COLOR_RGB2GRAY)
# observation = observation.reshape(-1)
ep_r = 0
action = 4
while True:
env.render()
total_step += 1
action = DQN.choose_action(observation)
observation_, reward, done, _ = env.step(action)
observation_ = cv2.cvtColor(observation_, cv2.COLOR_RGB2GRAY)
ep_r += reward
DQN.store_transition(observation, action, reward, observation_)
#if total_step > 1000:
DQN.learn()
observation = observation_
# observation = observation.reshape(-1)
if done:
DQN.Saver.save(DQN.sess, "model_little/my-model",global_step=episode)
print("episode{0},\nep_r:{1}".format(episode,ep_r))
break
DQN.plot_cost()
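Below is the DeepQNetwork class (SpaceInvaders_cnn.py) imported by the script above: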
import numpy as np
import tensorflow as tf
import pandas as pd
np.random.seed(1)
tf.set_random_seed(1)
class DeepQNetwork:
def __init__(self,
n_actions,
img_shape,
learning_rate=0.01,
reward_decay=0.9,
e_greedy=0.9,
replace_target_iter=300,
memory_size=500,
batch_size=32,
e_greedy_increment=None,
output_graph=False,
):
self.n_actions = n_actions
self.img_shape = img_shape
self.lr = learning_rate
self.gamma = reward_decay
self.epsilon_max = e_greedy
self.replace_target_iter = replace_target_iter
self.memory_size = memory_size
self.batch_size = batch_size
self.epsilon_increment = e_greedy_increment
self.epsilon = 0 if e_greedy_increment is not None else self.epsilon_max
        # epsilon grows toward epsilon_max during training, so the agent exploits the network more and explores less over time
# total learning step
self.learn_step_counter = 0
        self.__shape_len = 1  # flattened length of one observation, used to size the replay memory
for i in img_shape:
self.__shape_len *= i
# initialize zero memory[observation, a, r, observation_]
        self.memory = np.zeros((self.memory_size, self.__shape_len * 2 + 2))  # [observation, action, reward, observation_]
self.__built_net()
        t_params = tf.get_collection('target_net_params')
        e_params = tf.get_collection('eval_net_params')
        self.replace_target_op = [tf.assign(t, e) for t, e in zip(t_params, e_params)]
        self.sess = tf.Session()
        if output_graph:
            tf.summary.FileWriter("logs/", self.sess.graph)
        self.sess.run(tf.global_variables_initializer())
        self.Saver = tf.train.Saver(max_to_keep=2)
        # restore the hand-played pre-training checkpoint if one exists; restoring after
        # the initializer keeps the restored weights from being overwritten
        model_file = tf.train.latest_checkpoint('model_hand/')
        if model_file is not None:
            self.Saver.restore(self.sess, model_file)
        self.cost_his = []
def __built_net(self):
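        # eval_net input: the current state, stored flattened and reshaped below into an NHWC tensor (batch, 210, 160, 1)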
self.s = tf.placeholder(tf.float32, [None, self.__shape_len], name='s')
self.ss = tf.reshape(self.s, [-1, self.img_shape[0], self.img_shape[1], 1])
self.q_target = tf.placeholder(tf.float32, [None, self.n_actions], name='Q_target')
with tf.variable_scope('eval_net'):
c_names, n_l1, w_initializer, b_initializer = \
['eval_net_params', tf.GraphKeys.GLOBAL_VARIABLES], 10, \
tf.random_normal_initializer(0., 0.3), tf.constant_initializer(0.1)
with tf.variable_scope('conv_l1'):
w1 = tf.get_variable('w1', [8, 8, 1, 32], initializer=w_initializer, collections=c_names)
b1 = tf.get_variable('b1', [32], initializer=b_initializer, collections=c_names)
conv1 = tf.nn.relu(
tf.nn.conv2d(self.ss, w1, strides=[1,8,8,1],padding="VALID") + b1
)
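                # conv1 output: (batch, 26, 20, 32)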
with tf.variable_scope('conv_l2'):
w2 = tf.get_variable('w2', [4, 1, 32, 64], initializer=w_initializer, collections=c_names)
b2 = tf.get_variable('b2', [64], initializer=b_initializer, collections=c_names)
conv2 = tf.nn.relu(
tf.nn.conv2d(conv1, w2, strides=[1,2,1,1], padding="VALID") + b2
                )  # conv2 output: (batch, 12, 20, 64)
conv2_flat = tf.reshape(conv2, [-1, 12*20*64])
# with tf.variable_scope('conv_l3'):
# w3 = tf.get_variable('w2', [3, 3, 64, 64], initializer=w_initializer, collections=c_names)
# b3 = tf.get_variable('b2', [64], initializer=b_initializer, collections=c_names)
# conv3 = tf.nn.relu(
# tf.nn.conv2d(conv2, w3, strides=[1,1,1,1], padding="VALID") + b3
# )
# conv3_flat = tf.reshape(conv3, [-1, 4480])
with tf.variable_scope('fc_l4'):
w4 = tf.get_variable('w2', [12*20*64, 784], initializer=w_initializer, collections=c_names)
b4 = tf.get_variable('b2', [784], initializer=b_initializer, collections=c_names)
fc4 = tf.nn.relu(tf.matmul(conv2_flat, w4) + b4)
with tf.variable_scope('fc_l5'):
w5 = tf.get_variable('w2', [784, self.n_actions], initializer=w_initializer, collections=c_names)
b5 = tf.get_variable('b2', [self.n_actions], initializer=b_initializer, collections=c_names)
self.q_eval = tf.nn.softmax(tf.matmul(fc4, w5) + b5)
with tf.variable_scope('loss'): # q_target is the value that calculate by q_eval and the reward etc.
self.loss = tf.reduce_mean(tf.squared_difference(self.q_target, self.q_eval))
with tf.variable_scope('train'):
self._train_op = tf.train.RMSPropOptimizer(self.lr).minimize(self.loss)
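        # target_net: same architecture as eval_net, fed with the next state s_;
        # its parameters are copied from eval_net every replace_target_iter steps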
self.s_ = tf.placeholder(tf.float32, [None, self.__shape_len], name='s_')
self.ss_ = tf.reshape(self.s_, [-1, self.img_shape[0], self.img_shape[1], 1])
with tf.variable_scope('target_net'):
c_names= ['target_net_params', tf.GraphKeys.GLOBAL_VARIABLES]
with tf.variable_scope('conv_l1'):
w1 = tf.get_variable('w1', [8, 8, 1, 32], initializer=w_initializer, collections=c_names)
b1 = tf.get_variable('b1', [32], initializer=b_initializer, collections=c_names)
conv1 = tf.nn.relu(
tf.nn.conv2d(self.ss_, w1, strides=[1,8,8,1],padding="VALID") + b1
)
with tf.variable_scope('conv_l2'):
w2 = tf.get_variable('w2', [4, 1, 32, 64], initializer=w_initializer, collections=c_names)
b2 = tf.get_variable('b2', [64], initializer=b_initializer, collections=c_names)
conv2 = tf.nn.relu(
tf.nn.conv2d(conv1, w2, strides=[1,2,1,1], padding="VALID") + b2
)
conv3_flat = tf.reshape(conv2, [-1, 12*20*64])
# with tf.variable_scope('conv_l3'):
# w3 = tf.get_variable('w2', [3, 3, 64, 64], initializer=w_initializer, collections=c_names)
# b3 = tf.get_variable('b2', [64], initializer=b_initializer, collections=c_names)
# conv3 = tf.nn.relu(
# tf.nn.conv2d(conv2, w3, strides=[1,1,1,1], padding="VALID") + b3
# )
# conv3_flat = tf.reshape(conv3, [-1, 4480])
with tf.variable_scope('fc_l4'):
w4 = tf.get_variable('w2', [12*20*64, 784], initializer=w_initializer, collections=c_names)
b4 = tf.get_variable('b2', [784], initializer=b_initializer, collections=c_names)
fc4 = tf.nn.relu(tf.matmul(conv3_flat, w4) + b4)
with tf.variable_scope('fc_l5'):
w5 = tf.get_variable('w2', [784, self.n_actions], initializer=w_initializer, collections=c_names)
b5 = tf.get_variable('b2', [self.n_actions], initializer=b_initializer, collections=c_names)
self.q_next = tf.nn.softmax(tf.matmul(fc4, w5) + b5)
def store_transition(self, s, a, r, s_):
        if not hasattr(self, 'memory_counter'):  # create the counter on the first call
self.memory_counter = 0
s = s.reshape(-1)
s_ = s_.reshape(-1)
        transition = np.hstack((s, [a, r], s_))
        # print(transition.shape)
        # memory_counter records how many transitions have been stored in total;
        # memory_size is the capacity of the replay memory
        index = self.memory_counter % self.memory_size  # once full, overwrite the oldest entries
        self.memory[index, :] = transition
self.memory_counter += 1
def choose_action(self, observation):
observation = observation.reshape(-1)
observation = observation[np.newaxis,:]
# print("choose_action observation", observation)
# observation = observation[np.newaxis,:,:,np.newaxis]
# print("choose_action new_observation", observation)
if np.random.uniform() < self.epsilon:
# print("sess.run")
action_value = self.sess.run(self.q_eval, feed_dict={self.s: observation})
action = np.argmax(action_value)
# print("this time my choose is", action)
else:
# print("else ")
action = np.random.randint(0, self.n_actions)
return action
def _replace_target_params(self):
t_parms = tf.get_collection('target_net_params')
e_parms = tf.get_collection('eval_net_params')
self.sess.run([tf.assign(t, e) for t, e in zip(t_parms, e_parms)])
def learn(self):
if self.learn_step_counter % self.replace_target_iter == 0:
self.sess.run(self.replace_target_op)
print('replaced the w,b in network')
if self.memory_counter > self.memory_size:
            # only memory_size transitions are kept, so once memory_counter exceeds it,
            # sample indices over the whole (now full) memory
sample_index = np.random.choice(self.memory_size, size=self.batch_size)
else:
sample_index = np.random.choice(self.memory_counter, size=self.batch_size)
batch_memory = self.memory[sample_index, :]
# print(batch_memory[:,0:self.n_features].shape)
q_next, q_eval = self.sess.run(
[self.q_next, self.q_eval],
feed_dict={
self.s_: batch_memory[:, -self.__shape_len:],
self.s: batch_memory[:, :self.__shape_len],
})
q_target = q_eval.copy()
        batch_index = np.arange(self.batch_size, dtype=np.int32)  # row indices into the batch
        eval_act_index = batch_memory[:, self.__shape_len].astype(int)  # action taken in each stored transition
        reward = batch_memory[:, self.__shape_len + 1]  # reward received in each stored transition
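        # Bellman target for the action actually taken: r + gamma * max_a' Q_target(s', a')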
q_target[batch_index, eval_act_index] = reward + self.gamma * np.max(q_next, axis=1)
_, self.cost = self.sess.run([self._train_op, self.loss],
feed_dict={self.s: batch_memory[:, :self.__shape_len],
self.q_target: q_target})
self.cost_his.append(self.cost)
self.epsilon = self.epsilon + self.epsilon_increment if self.epsilon < self.epsilon_max else self.epsilon_max
self.learn_step_counter += 1
def plot_cost(self):
import matplotlib.pyplot as plt
plt.plot(np.arange(len(self.cost_his)), self.cost_his)
plt.ylabel('Cost')
plt.xlabel('training steps')
plt.show()