I am using Python on Q_learning to determine the best action on each episode of the 4x4 board. There are five special episodes on the board which are start, goal, forbidden and wall-square. The starting square is fixed on square 2. The two goals, forbidden and wall-squares are determined from the input. I have a
living_reward = -0.1
goal_reward = 100
forbidden_reward = 100
discount rate gamma = 0.1
learning rate alpha =0.5
greedy probability epsilon = 0.3
and max iteration = 10000.
When there are two similar max q-value for the up and right actions, a clockwise priority for printing the final policy will be used (up, right, down, left).
I feed the following input to my code 12 7 5 6 p
and it prints out the following output:
1 right
2 right
3 up
4 left
5 forbid
6 wall-square
7 goal
8 left
9 up
10 left
11 left
12 goal
13 right
14 left
15 left
16 left
However, I am looking for it to print out this output
1 right
2 right
3 up
4 up
5 forbid
6 wall-square
7 goal
8 up
9 up
10 up
11 up
12 goal
13 up
14 up
15 up
16 up
It goes wrong at index 4,8,10,13,14,15,16,17 For the second input 15 12 8 6 p
it prints out this
1 up
2 right
3 up
4 left
5 up
6 wall-square
7 up
8 forbid
9 up
10 up
11 up
12 goal
13 right
14 right
15 goal
16 left
while I am looking for this output
1 up
2 right
3 up
4 left
5 up
6 wall-square
7 up
8 forbid
9 up
10 up
11 up
12 goal
13 right
14 right
15 goal
16 up
It goes wrong at the index 16. Instead of going up, it goes left. I wonder if anyone could advise what goes wrong with my code? I am also providing my codes below. Any advice would be very appreciated! Thanks
What I have tried:
import random
import numpy as np
import enum
EACH_STEP_REWARD = -0.1
GOAL_SQUARE_REWARD = 100
FORBIDDEN_SQUARE_REWARD = -100
DISCOUNT_RATE_GAMMA = 0.1 # Discount Rate
LEARNING_RATE_ALPHA = 0.3 # Learning Rate
GREEDY_PROBABILITY_EPSILON = 0.5 # Greedy Probability
ITERATION_MAX_NUM = 10000 # Will be 10,000
START_LABEL = 2
LEVEL = 4
class Direction(enum.Enum):
up = 1
right = 2
down = 3
left = 0
exit = 4
class Node:
def __init__(self, title, next, Goal=False, Forbidden=False, Wall=False, qValues=None, actions=None):
self.title = title
self.next = next
self.qValues = [qValues] * 5
self.move = [actions] * 5
self.goal = Goal
self.forbidden = Forbidden
self.wall = Wall
def max_Q_value(self):
if self.wall:
return False
max_q = []
for q in self.qValues:
if q is not None:
max_q.append(q)
return max(max_q)
def find_best_move(self):
max_q = self.max_Q_value()
q_index = self.qValues.index(max_q)
return Direction(q_index)
class create_env:
def __init__(self, input_list, wall=None):
self.wall = wall
self.episode = [[16, 15, 14, 13], [12, 11, 10, 9], [8, 7, 6, 5], [4, 3, 2, 1]]
S = 2
Node_1 = Node(1, [self.wall, 5, S, self.wall])
Node_Start = Node(S, [1, 6, 3, self.wall])
Node_3 = Node(3, [S, 7, 4, self.wall])
Node_4 = Node(4, [3, 8, self.wall, self.wall])
Node_5 = Node(5, [self.wall, 9, 6, 1])
Node_6 = Node(6, [5, 10, 7, S])
Node_7 = Node(7, [6, 11, 8, 3])
Node_8 = Node(8, [7, 12, self.wall, 4])
Node_9 = Node(9, [self.wall, 13, 10, 5])
Node_10 = Node(10, [9, 14, 11, 6])
Node_11 = Node(11, [10, 15, 12, 7])
Node_12 = Node(12, [11, 16, self.wall, 8])
Node_13 = Node(13, [self.wall, self.wall, 14, 9])
Node_14 = Node(14, [13, self.wall, 15, 10])
Node_15 = Node(15, [14, self.wall, 16, 11])
Node_16 = Node(16, [15, self.wall, self.wall, 12])
self.episode[0][0] = Node_1
self.episode[0][1] = Node_Start
self.episode[0][S] = Node_3
self.episode[0][3] = Node_4
self.episode[1][0] = Node_5
self.episode[1][1] = Node_6
self.episode[1][S] = Node_7
self.episode[1][3] = Node_8
self.episode[S][0] = Node_9
self.episode[S][1] = Node_10
self.episode[S][S] = Node_11
self.episode[S][3] = Node_12
self.episode[3][0] = Node_13
self.episode[3][1] = Node_14
self.episode[3][S] = Node_15
self.episode[3][3] = Node_16
self.goal_labels = [int(input_list[0]), int(input_list[1])]
self.forbidden_label = int(input_list[2])
self.wall_label = int(input_list[3])
x = 0
while x < LEVEL:
y = 0
while y < LEVEL:
current_episode = self.episode[x][y]
if current_episode.title in self.goal_labels:
current_episode.goal = 1
current_episode.move.insert(4, Direction.up.name)
current_episode.qValues.insert(4, 0)
elif current_episode.title == self.forbidden_label:
current_episode.forbidden = 1
current_episode.move.insert(4, Direction.up.name)
current_episode.qValues.insert(4, 0)
elif current_episode.title == self.wall_label:
current_episode.wall = 1
else:
position = 0
while position < LEVEL:
if current_episode.next[position] is not None:
current_episode.move.insert(position,
Direction(position)), current_episode.qValues.insert(
position, False)
position += 1
y += 1
x += 1
def get_episode(self, name):
for x in self.episode:
for episode in x:
if episode.title == name:
return episode
def print_best_actions(self):
for row in self.episode:
for episode in row:
if episode.goal:
best_action_str = 'Direction.goal'
elif episode.forbidden:
best_action_str = "Direction.forbid"
elif episode.wall:
best_action_str = 'Direction.wall-square'
else:
best_action_str = str(episode.find_best_move())
print(str(episode.title) + " " + best_action_str[10:])
def print_four_Q_value(self, index):
episode = self.get_episode(index)
for q in episode.qValues:
if q is not None:
print(str(Direction(episode.qValues.index(q)))[10:] + ' ' + str(round(q, 2)))
def Q_learning(environment, print_best_actions, index):
for iteration in range(ITERATION_MAX_NUM):
current_episode = environment.get_episode(START_LABEL)
total_episode_reward = 0
for episode in range(100):
if np.random.uniform(0, 1) < GREEDY_PROBABILITY_EPSILON:
next_move = []
for score in current_episode.move:
if score is not None:
next_move.append(score)
next_move = random.choice(next_move)
else:
next_move = current_episode.find_best_move()
next_episode = environment.get_episode(current_episode.next[next_move.value])
if next_episode.goal:
reward = GOAL_SQUARE_REWARD
elif next_episode.forbidden:
reward = FORBIDDEN_SQUARE_REWARD
else:
reward = EACH_STEP_REWARD
total_episode_reward += reward
old_q = current_episode.qValues[next_move.value]
new_q = old_q + LEARNING_RATE_ALPHA * (reward + DISCOUNT_RATE_GAMMA * next_episode.max_Q_value() - old_q)
current_episode.qValues[next_move.value] = new_q
if next_episode.goal:
break
elif next_episode.forbidden:
break
else:
if next_episode.wall:
break
else:
current_episode = next_episode
def user_input():
try:
input_list = []
input_str = input()
input_list = input_str.split()
except:
print("The input should be like: 15 12 8 6 p")
environment = create_env(input_list)
if (len(input_list) == 5) and (input_list[-1] == 'p'):
Q_learning(environment, 1, 0)
environment.print_best_actions()
elif (len(input_list) == 6) and (input_list[-2] == 'q'):
Q_learning(environment, 0, int(input_list[5]))
environment.print_four_Q_value(int(input_list[5]))
user_input()