Q-learning to determine the best path to the goal state

I am using Q-learning in Python to determine the best action for each square of a 4x4 board. There are five special squares on the board: the start square, two goal squares, a forbidden square, and a wall square. The start is fixed at square 2; the two goals, the forbidden square, and the wall square are read from the input.
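
For reference, this is the square numbering my adjacency lists assume (the BOARD name and the comments here are just illustrative, not part of my program):

BOARD = [
    [13, 14, 15, 16],  # top row
    [ 9, 10, 11, 12],
    [ 5,  6,  7,  8],
    [ 1,  2,  3,  4],  # bottom row; the start is square 2
]
# "up" from square 1 reaches square 5, "right" from square 1 reaches square 2.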

I use the following parameters:

living_reward = -0.1
goal_reward = 100
forbidden_reward = -100
discount rate gamma = 0.1
learning rate alpha = 0.3
exploration probability epsilon = 0.5
max iterations = 10000
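
What I intend to apply is the standard tabular Q-learning update. As a minimal sketch, independent of my classes below (q here is a hypothetical dict mapping a square label to a list of four action values, just for illustration):

# Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
def q_update(q, s, a, reward, s_next, alpha=0.3, gamma=0.1):
    best_next = max(q[s_next])  # max over the next square's action values
    q[s][a] += alpha * (reward + gamma * best_next - q[s][a])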

When two actions tie for the maximum Q-value (for example, up and right), a clockwise priority (up, right, down, left) should be used when printing the final policy.
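
In other words, the intended selection behaves like this sketch (with a hypothetical Action enum whose definition order is the clockwise priority; max() keeps the first maximal member it encounters):

import enum

class Action(enum.Enum):  # defined in the clockwise printing priority
    up = 0
    right = 1
    down = 2
    left = 3

def best_action(q_values):
    # q_values maps each Action to a float; iterating the enum follows
    # definition order, so a tie between up and right resolves to up.
    return max(Action, key=lambda a: q_values[a])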

I feed the input 12 7 5 6 p to my code (two goals, the forbidden square, the wall square, and p to print the final policy), and it prints the following output:

1 right
2 right
3 up
4 left
5 forbid
6 wall-square
7 goal
8 left
9 up
10 left
11 left
12 goal
13 right
14 left
15 left
16 left

However, this is the output I expect:

1   right
2   right
3   up
4   up
5   forbid
6   wall-square
7   goal
8   up
9   up
10  up
11  up
12  goal
13  up
14  up
15  up
16  up

It goes wrong at squares 4, 8, 10, 11, 13, 14, 15, and 16.

For the second input, 15 12 8 6 p, it prints out this:

1 up
2 right
3 up
4 left
5 up
6 wall-square
7 up
8 forbid
9 up
10 up
11 up
12 goal
13 right
14 right
15 goal
16 left

while the output I expect is:

1   up
2   right
3   up
4   left
5   up
6   wall-square
7   up
8   forbid
9   up
10  up
11  up
12  goal
13  right
14  right
15  goal
16  up

Here only square 16 is wrong: instead of going up, it goes left. Could anyone advise on what goes wrong in my code? I am providing my code below. Any advice would be much appreciated. Thanks!

What I have tried:

import random
import numpy as np
import enum

EACH_STEP_REWARD = -0.1  # living reward for every non-terminal step
GOAL_SQUARE_REWARD = 100
FORBIDDEN_SQUARE_REWARD = -100
DISCOUNT_RATE_GAMMA = 0.1  # discount rate
LEARNING_RATE_ALPHA = 0.3  # learning rate
GREEDY_PROBABILITY_EPSILON = 0.5  # probability of taking a random (exploratory) action
ITERATION_MAX_NUM = 10000  # number of training episodes
START_LABEL = 2  # the start is fixed at square 2
LEVEL = 4  # the board is LEVEL x LEVEL


class Direction(enum.Enum):
    # The numeric values double as slot indices into a Node's
    # next/move/qValues lists: left = 0, up = 1, right = 2, down = 3.
    up = 1
    right = 2
    down = 3
    left = 0
    exit = 4


class Node:
    """One square of the board (called an "episode" elsewhere in the code).

    next lists the neighbouring square labels in slot order
    left/up/right/down, with None where a move would leave the board.
    qValues and move start as five None slots (four directions plus the
    exit slot) that create_env fills in.
    """

    def __init__(self, title, next, Goal=False, Forbidden=False, Wall=False, qValues=None, actions=None):
        self.title = title
        self.next = next
        self.qValues = [qValues] * 5
        self.move = [actions] * 5
        self.goal = Goal
        self.forbidden = Forbidden
        self.wall = Wall

    def max_Q_value(self):
        # The wall square carries no Q-values; False acts as 0 here.
        if self.wall:
            return False
        max_q = []
        for q in self.qValues:
            if q is not None:
                max_q.append(q)
        return max(max_q)

    def find_best_move(self):
        # index() returns the FIRST slot holding the maximum, so ties
        # resolve to the lowest slot number (left is slot 0).
        max_q = self.max_Q_value()
        q_index = self.qValues.index(max_q)
        return Direction(q_index)


class create_env:
    def __init__(self, input_list, wall=None):
        self.wall = wall  # None stands for "off the board" in the next lists
        # The literal numbers are placeholders; every cell is overwritten
        # with a Node object below (row 0 ends up holding squares 1-4).
        self.episode = [[16, 15, 14, 13], [12, 11, 10, 9], [8, 7, 6, 5], [4, 3, 2, 1]]
        S = 2  # the start label, also reused as a plain list index below
        # Each next list is ordered left/up/right/down to match Direction.
        Node_1 = Node(1, [self.wall, 5, S, self.wall])
        Node_Start = Node(S, [1, 6, 3, self.wall])
        Node_3 = Node(3, [S, 7, 4, self.wall])
        Node_4 = Node(4, [3, 8, self.wall, self.wall])
        Node_5 = Node(5, [self.wall, 9, 6, 1])
        Node_6 = Node(6, [5, 10, 7, S])
        Node_7 = Node(7, [6, 11, 8, 3])
        Node_8 = Node(8, [7, 12, self.wall, 4])
        Node_9 = Node(9, [self.wall, 13, 10, 5])
        Node_10 = Node(10, [9, 14, 11, 6])
        Node_11 = Node(11, [10, 15, 12, 7])
        Node_12 = Node(12, [11, 16, self.wall, 8])
        Node_13 = Node(13, [self.wall, self.wall, 14, 9])
        Node_14 = Node(14, [13, self.wall, 15, 10])
        Node_15 = Node(15, [14, self.wall, 16, 11])
        Node_16 = Node(16, [15, self.wall, self.wall, 12])

        self.episode[0][0] = Node_1
        self.episode[0][1] = Node_Start
        self.episode[0][S] = Node_3
        self.episode[0][3] = Node_4
        self.episode[1][0] = Node_5
        self.episode[1][1] = Node_6
        self.episode[1][S] = Node_7
        self.episode[1][3] = Node_8
        self.episode[S][0] = Node_9
        self.episode[S][1] = Node_10
        self.episode[S][S] = Node_11
        self.episode[S][3] = Node_12
        self.episode[3][0] = Node_13
        self.episode[3][1] = Node_14
        self.episode[3][S] = Node_15
        self.episode[3][3] = Node_16

        # Input format: goal goal forbidden wall, then "p" or "q <square>".
        self.goal_labels = [int(input_list[0]), int(input_list[1])]
        self.forbidden_label = int(input_list[2])
        self.wall_label = int(input_list[3])
        # Tag each square and set up its legal moves and initial Q-values.
        x = 0
        while x < LEVEL:
            y = 0
            while y < LEVEL:
                current_episode = self.episode[x][y]
                if current_episode.title in self.goal_labels:
                    # Terminal squares get a single pseudo-action in the
                    # exit slot (slot 4) with Q = 0.
                    current_episode.goal = 1
                    current_episode.move.insert(4, Direction.up.name)
                    current_episode.qValues.insert(4, 0)
                elif current_episode.title == self.forbidden_label:
                    current_episode.forbidden = 1
                    current_episode.move.insert(4, Direction.up.name)
                    current_episode.qValues.insert(4, 0)
                elif current_episode.title == self.wall_label:
                    current_episode.wall = 1
                else:
                    # Every legal move starts with Q = False (i.e. 0);
                    # insert() shifts the None placeholders to the right.
                    position = 0
                    while position < LEVEL:
                        if current_episode.next[position] is not None:
                            current_episode.move.insert(position, Direction(position))
                            current_episode.qValues.insert(position, False)
                        position += 1
                y += 1
            x += 1

    def get_episode(self, name):
        # Look a Node up by its square label.
        for x in self.episode:
            for episode in x:
                if episode.title == name:
                    return episode

    def print_best_actions(self):
        for row in self.episode:
            for episode in row:
                if episode.goal:
                    best_action_str = 'Direction.goal'
                elif episode.forbidden:
                    best_action_str = 'Direction.forbid'
                elif episode.wall:
                    best_action_str = 'Direction.wall-square'
                else:
                    best_action_str = str(episode.find_best_move())
                # [10:] strips the "Direction." prefix.
                print(str(episode.title) + " " + best_action_str[10:])

    def print_four_Q_value(self, index):
        episode = self.get_episode(index)
        for q in episode.qValues:
            if q is not None:
                # index() maps the value back to its slot/Direction; note it
                # finds the first slot if two actions share the same Q-value.
                print(str(Direction(episode.qValues.index(q)))[10:] + ' ' + str(round(q, 2)))


def Q_learning(environment, print_best_actions, index):
    # print_best_actions and index are accepted but currently unused.
    for iteration in range(ITERATION_MAX_NUM):
        current_episode = environment.get_episode(START_LABEL)
        total_episode_reward = 0
        for step in range(100):  # cap each episode at 100 steps
            # Epsilon-greedy selection: explore with probability epsilon,
            # otherwise exploit the current best move.
            if np.random.uniform(0, 1) < GREEDY_PROBABILITY_EPSILON:
                possible_moves = [m for m in current_episode.move if m is not None]
                next_move = random.choice(possible_moves)
            else:
                next_move = current_episode.find_best_move()
            next_episode = environment.get_episode(current_episode.next[next_move.value])
            if next_episode.goal:
                reward = GOAL_SQUARE_REWARD
            elif next_episode.forbidden:
                reward = FORBIDDEN_SQUARE_REWARD
            else:
                reward = EACH_STEP_REWARD
            total_episode_reward += reward

            # Tabular Q-learning update:
            # Q(s, a) <- Q(s, a) + alpha * (r + gamma * max Q(s', a') - Q(s, a))
            old_q = current_episode.qValues[next_move.value]
            new_q = old_q + LEARNING_RATE_ALPHA * (reward + DISCOUNT_RATE_GAMMA * next_episode.max_Q_value() - old_q)
            current_episode.qValues[next_move.value] = new_q

            # An episode ends on a goal, forbidden, or wall square.
            if next_episode.goal or next_episode.forbidden or next_episode.wall:
                break
            current_episode = next_episode


def user_input():
    try:
        input_str = input()
        input_list = input_str.split()
    except EOFError:
        print("The input should be like: 15 12 8 6 p")
        return

    environment = create_env(input_list)

    # "p" prints the final policy; "q <square>" prints one square's Q-values.
    if (len(input_list) == 5) and (input_list[-1] == 'p'):
        Q_learning(environment, 1, 0)
        environment.print_best_actions()
    elif (len(input_list) == 6) and (input_list[-2] == 'q'):
        Q_learning(environment, 0, int(input_list[5]))
        environment.print_four_Q_value(int(input_list[5]))


user_input()
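
For reference, the script reads a single line from standard input, so (assuming it is saved as q_learning.py, a name I chose here) it can be exercised with e.g. echo "12 7 5 6 p" | python q_learning.py for the policy, or with an input like 15 12 8 6 q 11 to print the Q-values of square 11.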