ML强化学习算法：使用Q学习的Python实现

2021年5月5日13:31:54 发表评论 1,681 次浏览

先决条件：Q学习技术。

强化学习是一种机器学习范式, 其中的学习算法不是基于预先给定的数据, 而是基于反馈机制来训练的。这些算法被誉为机器学习的未来, 因为它们省去了收集和清洗数据的成本。

在本文中, 我们将演示如何实现一种基本的强化学习算法, 即Q学习技术。在本演示中, 我们尝试使用Q学习技术教机器人到达它的目的地。

步骤1：导入所需的库

import numpy as np
import pylab as pl
import networkx as nx

步骤2：定义和可视化图形

# Edges of the graph the bot moves on; each pair joins two node ids.
edges = [
    (0, 1), (1, 5), (5, 6), (5, 4), (1, 2), (1, 3), (9, 10),
    (2, 4), (0, 6), (6, 7), (8, 9), (7, 8), (1, 7), (3, 9),
]

# Node id used as the goal by the reward setup.
goal = 10

# Build an undirected graph from the edges and draw it in three passes:
# nodes, then edges, then node labels, all sharing one spring layout.
G = nx.Graph()
G.add_edges_from(edges)
pos = nx.spring_layout(G)
nx.draw_networkx_nodes(G, pos)
nx.draw_networkx_edges(G, pos)
nx.draw_networkx_labels(G, pos)
pl.show()

注意：重新运行代码时, 得到的图看起来可能与上图不同, 因为Python的networkx库会根据给定的边以随机布局生成图形。

步骤3：为机器人定义奖励系统

MATRIX_SIZE = 11

# Reward matrix M[from, to]: every entry starts at -1, entries for listed
# edges become 0, and entries for moves that end in the goal become 100.
M = np.matrix(np.ones(shape=(MATRIX_SIZE, MATRIX_SIZE)))
M *= -1

for point in edges:
    print(point)
    # Forward direction of the edge.
    if point[1] == goal:
        M[point] = 100
    else:
        M[point] = 0

    # Reverse of point: the same edge travelled the other way.
    if point[0] == goal:
        M[point[::-1]] = 100
    else:
        M[point[::-1]] = 0

# add goal point round trip
M[goal, goal] = 100
print(M)

步骤4：定义训练中要用到的一些工具函数

# Q-matrix indexed as Q[state, action]; every entry starts at zero.
Q = np.matrix(np.zeros((MATRIX_SIZE, MATRIX_SIZE)))

# learning parameter: multiplies the best next-state Q-value inside update()
gamma = 0.75
initial_state = 1
  
# Determines the available actions for a given state
def available_actions(state):
    """Return the column indices of row ``state`` of M whose value is >= 0.

    Args:
        state: Row index into the module-level reward matrix ``M``.

    Returns:
        Array of column indices whose entry in that row is not negative.
    """
    current_state_row = M[state, ]
    # np.where on the 1 x N matrix row yields (row_idx, col_idx); keep columns.
    available_action = np.where(current_state_row >= 0)[1]
    return available_action
  
# Look up which actions are possible from the initial state.
available_action = available_actions(initial_state)
  
# Chooses one of the available actions at random
def sample_next_action(available_actions_range):
    """Return one entry of ``available_actions_range`` picked at random.

    Args:
        available_actions_range: 1-D array-like of candidate action indices.

    Returns:
        The chosen action as a plain Python int.
    """
    # Sample from the argument itself rather than a module-level variable, and
    # request a single scalar so int() is never applied to a one-element array.
    next_action = int(np.random.choice(available_actions_range))
    return next_action
  
  
# Pick a random action from the ones available in the initial state.
action = sample_next_action(available_action)
  
def update(current_state, action, gamma):
    """Update the Q-matrix according to the path chosen and return a score.

    Sets ``Q[current_state, action]`` to ``M[current_state, action]`` plus
    ``gamma`` times the largest value found in row ``action`` of Q.

    Args:
        current_state: Row index of the state the move starts from.
        action: Column index of the move; also used as the row of Q that is
            searched for its maximum.
        gamma: Weight applied to that maximum.

    Returns:
        ``np.sum(Q / np.max(Q) * 100)`` when Q has a positive entry, else 0.
    """
    # Every column that holds the maximum of row `action`.
    best_columns = np.where(Q[action, ] == np.max(Q[action, ]))[1]
    if best_columns.shape[0] > 1:
        # Several columns tie for the maximum: pick one of them at random.
        max_index = int(np.random.choice(best_columns))
    else:
        # Index the single element explicitly instead of int() on an array.
        max_index = int(best_columns[0])
    max_value = Q[action, max_index]
    Q[current_state, action] = M[current_state, action] + gamma * max_value
    if np.max(Q) > 0:
        return np.sum(Q / np.max(Q) * 100)
    return 0
  
# Apply one Q-matrix update for the sampled (state, action) pair.
update(initial_state, action, gamma)

步骤5：使用Q-Matrix训练和评估机器人

# Training: 1000 updates from randomly chosen states; keep the score of each.
scores = []
for i in range(1000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_action = available_actions(current_state)
    action = sample_next_action(available_action)
    score = update(current_state, action, gamma)
    scores.append(score)

# print("Trained Q matrix:")
# print(Q / np.max(Q) * 100)
# You can uncomment the above two lines to view the trained Q matrix

# Testing: start at node 0 and keep moving to the column with the largest
# Q-value in the current row until the goal node is reached.
current_state = 0
steps = [current_state]

while current_state != goal:
    next_step_index = np.where(Q[current_state, ] == np.max(Q[current_state, ]))[1]
    if next_step_index.shape[0] > 1:
        # Tie between several columns: choose one of them at random.
        next_step_index = int(np.random.choice(next_step_index))
    else:
        # Index the single element explicitly instead of int() on an array.
        next_step_index = int(next_step_index[0])
    steps.append(next_step_index)
    current_state = next_step_index

print("Most efficient path:")
print(steps)

pl.plot(scores)
pl.xlabel('No of iterations')
pl.ylabel('Reward gained')
pl.show()

现在, 让我们把这个机器人放到一个更贴近现实的场景中。假设该机器人是一名侦探, 正在试图找出一个大型贩毒团伙的所在位置。他很自然地得出这样的结论：毒贩不会在警察经常出没的地方出售毒品, 并且交易地点就在贩毒团伙所在位置的附近。此外, 毒贩会在交易地点留下毒品的痕迹, 这可以帮助侦探找到目标位置。我们希望训练机器人利用这些环境线索来找到该位置。

步骤6：使用环境线索定义和可视化新图形

# Defining the locations of the police and the drug traces
police = [2, 4, 5]
drug_traces = [3, 8, 9]

G = nx.Graph()
G.add_edges_from(edges)
# Display label for every node id; each label starts with the node id.
mapping = {
    0: '0 - Detective',
    1: '1',
    2: '2 - Police',
    3: '3 - Drug traces',
    4: '4 - Police',
    5: '5 - Police',
    6: '6',
    7: '7',
    8: '8 - Drug traces',
    9: '9 - Drug traces',
    10: '10 - Drug racket location',
}

H = nx.relabel_nodes(G, mapping)
pos = nx.spring_layout(H)
# A scalar node_size applies to every node; a list would need exactly one
# entry per node (the graph has 11 nodes).
nx.draw_networkx_nodes(H, pos, node_size=200)
nx.draw_networkx_edges(H, pos)
nx.draw_networkx_labels(H, pos)
pl.show()

注意：上面的图看起来可能与之前的图有些不同, 但实际上它们是同一个图。这是因为networkx库会随机放置节点。

步骤7：为训练过程定义一些工具函数

# Start again from an all-zero Q-matrix, plus one all-zero counter matrix per
# kind of environmental clue (police sightings, drug traces).
Q = np.matrix(np.zeros((MATRIX_SIZE, MATRIX_SIZE)))
env_police = np.matrix(np.zeros((MATRIX_SIZE, MATRIX_SIZE)))
env_drugs = np.matrix(np.zeros((MATRIX_SIZE, MATRIX_SIZE)))
initial_state = 1
  
# Same as above
def available_actions(state):
    """Return the column indices of row ``state`` of M whose value is >= 0.

    Args:
        state: Row index into the module-level reward matrix ``M``.

    Returns:
        Array of column indices whose entry in that row is not negative.
    """
    current_state_row = M[state, ]
    # np.where on the 1 x N matrix row yields (row_idx, col_idx); keep columns.
    av_action = np.where(current_state_row >= 0)[1]
    return av_action
  
# Same as above
def sample_next_action(available_actions_range):
    """Return one entry of ``available_actions_range`` picked at random.

    Args:
        available_actions_range: 1-D array-like of candidate action indices.

    Returns:
        The chosen action as a plain Python int.
    """
    # Sample from the argument itself rather than a module-level variable, and
    # request a single scalar so int() is never applied to a one-element array.
    next_action = int(np.random.choice(available_actions_range))
    return next_action
  
# Exploring the environment
def collect_environmental_data(action):
    """Report which clue lists contain ``action``.

    Args:
        action: Node id that is looked up in the module-level ``police`` and
            ``drug_traces`` lists.

    Returns:
        A list holding 'p' if the node is in ``police`` and 'd' if it is in
        ``drug_traces``, in that order; empty when it is in neither.
    """
    clue_sources = (('p', police), ('d', drug_traces))
    return [tag for tag, nodes in clue_sources if action in nodes]
  
   
# Look up the actions possible from the initial state and pick one at random.
available_action = available_actions(initial_state)
action = sample_next_action(available_action)
  
def update(current_state, action, gamma):
    """Update the Q-matrix for one move, record clues, and return a score.

    Sets ``Q[current_state, action]`` to ``M[current_state, action]`` plus
    ``gamma`` times the largest value found in row ``action`` of Q. It then
    increments ``env_police`` and/or ``env_drugs`` at
    ``[current_state, action]`` according to what
    ``collect_environmental_data(action)`` reports.

    Args:
        current_state: Row index of the state the move starts from.
        action: Column index of the move; also used as the row of Q that is
            searched for its maximum.
        gamma: Weight applied to that maximum.

    Returns:
        ``np.sum(Q / np.max(Q) * 100)`` when Q has a positive entry, else 0.
    """
    # Every column that holds the maximum of row `action`.
    best_columns = np.where(Q[action, ] == np.max(Q[action, ]))[1]
    if best_columns.shape[0] > 1:
        # Several columns tie for the maximum: pick one of them at random.
        max_index = int(np.random.choice(best_columns))
    else:
        # Index the single element explicitly instead of int() on an array.
        max_index = int(best_columns[0])
    max_value = Q[action, max_index]
    Q[current_state, action] = M[current_state, action] + gamma * max_value

    # Count what was found at the node this move leads to.
    environment = collect_environmental_data(action)
    if 'p' in environment:
        env_police[current_state, action] += 1
    if 'd' in environment:
        env_drugs[current_state, action] += 1

    if np.max(Q) > 0:
        return np.sum(Q / np.max(Q) * 100)
    return 0
# Same as above: apply one update for the sampled (state, action) pair.
update(initial_state, action, gamma)
  
def available_actions_with_env_help(state):
    """Determine the available actions according to the environment.

    Starts from the columns of row ``state`` of M that are >= 0 and, when the
    environment matrix has negative entries for some of them, drops those
    columns, unless that would leave nothing to choose from.

    Requires the module-level matrix ``env_matrix_snap`` to exist before the
    call. NOTE(review): presumably a combined snapshot of the clue counters in
    which police sightings count negative; confirm where it is built.

    Args:
        state: Row index into ``M`` and ``env_matrix_snap``.

    Returns:
        Array of column indices that remain available.
    """
    current_state_row = M[state, ]
    av_action = np.where(current_state_row >= 0)[1]

    # if there are multiple routes, dis-favor anything negative
    env_pos_row = env_matrix_snap[state, av_action]

    if np.sum(env_pos_row < 0):
        # Remove the negative directions, but only if something is left over.
        temp_av_action = av_action[np.array(env_pos_row)[0] >= 0]
        if len(temp_av_action) > 0:
            av_action = temp_av_action
    return av_action

步骤8：可视化环境矩阵

# Exploration pass: 1000 updates from randomly chosen states. The scores are
# not recorded here; the pass fills the clue counters through update().
scores = []
for i in range(1000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_action = available_actions(current_state)
    action = sample_next_action(available_action)
    score = update(current_state, action, gamma)

# print environmental matrices
print('Police Found')
print(env_police)
print('')
print('Drug traces Found')
print(env_drugs)

步骤9：训练和评估模型

# Snapshot of the clues gathered so far. Drug traces count for a move and
# police sightings against it, so moves seen mostly with police come out
# negative and are avoided by available_actions_with_env_help().
env_matrix_snap = env_drugs - env_police

# Training with environmental help: 1000 updates; keep the score of each.
scores = []
for i in range(1000):
    current_state = np.random.randint(0, int(Q.shape[0]))
    available_action = available_actions_with_env_help(current_state)
    action = sample_next_action(available_action)
    score = update(current_state, action, gamma)
    scores.append(score)

pl.plot(scores)
pl.xlabel('Number of iterations')
pl.ylabel('Reward gained')
pl.show()

上面的示例非常基础; 许多实际应用(例如自动驾驶汽车)还会涉及博弈论的概念。

发表评论取消回复

登录 注册 找回密码

登录注册找回密码