~melmon/chizuru-old

8beec5ae9caaff19ec65449427fbb0ad7b1e75be — Melmon 9 months ago a5f2f45
Don't push this commit
6 files changed, 224 insertions(+), 60 deletions(-)

M .gitignore
M README.md
M chizuru.py
D logs/czr2023-04-26--00:18:20/events.out.tfevents.1682464704.CHIZURU-NOTE.198746.0.v2
D logs/czr2023-04-26--00:19:05/events.out.tfevents.1682464745.CHIZURU-NOTE.198827.0.v2
M writeup/Drescher-DGD-dissertation-2022-23.tex
M .gitignore => .gitignore +6 -1
@@ 534,4 534,9 @@ TSWLatexianTemp*
#*Notes.bib

training/
logs/
\ No newline at end of file
logs/
logs_dueling/
logs_dueling_per/
logs_rainbow/
training_dueling/
training_rainbow/

M README.md => README.md +1 -1
@@ 14,7 14,7 @@ docker run
```
After that, it should be smooth sailing.

If you want to run this locally, you need to use Python 3.7. Rogue-gym does not install in higher Python versions, since its an old project.
If you want to run this locally, you need to use Python 3.7. Rogue-gym does not install in higher Python versions, since it's an old project.

## Files
The model is located in `chizuru.py`. The training logic is written in `train.py`, and the code for previewing how the AI plays is located in `preview.py`. Seeing is believing, after all.

M chizuru.py => chizuru.py +173 -40
@@ 10,25 10,33 @@
#  ██╔══██╗██║   ██║██║   ██║██║   ██║██╔══╝
#  ██║  ██║╚██████╔╝╚██████╔╝╚██████╔╝███████╗
#  ╚═╝  ╚═╝ ╚═════╝  ╚═════╝  ╚═════╝ ╚══════╝
#
# Much of the code here is inspired by the work of https://github.com/sebtheiler/tutorials/blob/main/dqn/train_dqn.py,
# especially the code for agent learning.

# ******************************************************************************************************
# The following code was adapted from:
# Author: Sebastian Theiler
# Accessed from: https://github.com/sebtheiler/tutorials/blob/main/dqn/train_dqn.py
# Date of last retrieval: 26-04-2023
# ******************************************************************************************************

"""This file contains everything needed to run the chizuru-rogue AI."""

from rogue_gym.envs import RogueEnv
from random import random, sample, randint
from collections import deque
from random import random, randint
import tensorflow as tf
import datetime
import numpy as np
import itertools
import argparse
import os

# Constants
STEPS_PER_INTERVAL = 10000
CKPT_PATH = "training/czr-{interval:04d}-{label}.ckpt"
LOG_DIR = "logs/czr" + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S")
ACTIONS = ['h', 'j', 'k', 'l', 'u', 'n', 'b', 'y', 's', '.']  # Movement actions, search and wait.
PARSER = argparse.ArgumentParser(description="Trains the chizuru-rogue agent.")
PARSER.add_argument('-i', '--interval', action="store", help="Interval checkpoint to load from.")
HISTORY_LEN = 4

# Hyperparameters
GAMMA = 0.99


@@ 43,6 51,131 @@ EPSILON_DECAY = 150000
LEARNING_RATE = 0.00001
LEARNING_FREQUENCY = 75
TARGET_UPDATE_FREQUENCY = 750
PRIORITY_SCALE = 0.7


class ReplayBuffer:
    """ReplayBuffer for storing transitions.
    This implementation was heavily inspired by Fabio M. Graetz's replay buffer
    here: https://github.com/fg91/Deep-Q-Learning/blob/master/DQN.ipynb"""
    def __init__(self, size=BUFFER_SIZE, input_shape=(21, 79), history_length=4):
        """
        Arguments:
            size: Integer, Number of stored transitions
            input_shape: Shape of the preprocessed frame
            history_length: Integer, Number of frames stacked together to create a state for the agent
        """
        self.size = size
        self.input_shape = input_shape
        self.history_length = history_length
        self.count = 0  # number of experiences stored so far, capped at self.size
        self.current = 0  # index to write the next experience to

        # Pre-allocate memory
        self.actions = np.empty(self.size, dtype=np.int32)
        self.rewards = np.empty(self.size, dtype=np.float32)
        self.frames = np.empty((self.size, self.input_shape[0], self.input_shape[1]), dtype=np.uint8)
        self.terminal_flags = np.empty(self.size, dtype=bool)
        self.priorities = np.zeros(self.size, dtype=np.float32)

    def add_experience(self, action, frame, reward, terminal):
        """Saves a transition to the replay buffer
        Arguments:
            action: An integer between 0 and env.action_space.n - 1
                determining the action the agent performed
            frame: A (21, 79) frame of the game in grayscale
            reward: A float determining the reward the agent received for performing an action
            terminal: A bool stating whether the episode terminated
        """
        if frame.shape != self.input_shape:
            raise ValueError('Dimension of frame is wrong!')

        # Write memory
        self.actions[self.current] = action
        self.frames[self.current, ...] = frame
        self.rewards[self.current] = reward
        self.terminal_flags[self.current] = terminal
        self.priorities[self.current] = max(self.priorities.max(initial=0), 1)  # make the most recent experience important
        self.count = max(self.count, self.current+1)
        self.current = (self.current + 1) % self.size

    def get_minibatch(self, batch_size=32, priority_scale=0.0):
        """Returns a minibatch of self.batch_size = 32 transitions
        Arguments:
            batch_size: How many samples to return
            priority_scale: How much to weight priorities. 0 = completely random, 1 = completely based on priority
        Returns:
            A tuple of states, actions, rewards, new_states, and terminals
            If use_per is True:
                An array describing the importance of transition. Used for scaling gradient steps.
                An array of each index that was sampled
        """

        if self.count < self.history_length:
            raise ValueError('Not enough memories to get a minibatch')

        # Get sampling probabilities from priority list
        scaled_priorities = self.priorities[self.history_length:self.count-1] ** priority_scale
        sample_probabilities = scaled_priorities / sum(scaled_priorities)

        # Get a list of valid indices
        indices = []
        for i in range(batch_size):
            while True:
                # Get a random number from history_length to maximum frame written with probabilities based on priority weights
                index = np.random.choice(np.arange(self.history_length, self.count-1), p=sample_probabilities)

                # We check that all frames are from the same episode with the two following if statements. If either is True, the index is invalid.
                if index >= self.current >= index - self.history_length:
                    continue
                if self.terminal_flags[index - self.history_length:index].any():
                    continue
                break
            indices.append(index)

        # Retrieve states from memory
        states = []
        new_states = []
        for idx in indices:
            states.append(self.frames[idx-self.history_length:idx, ...])
            new_states.append(self.frames[idx-self.history_length+1:idx+1, ...])

        states = np.transpose(np.asarray(states), axes=(0, 2, 3, 1))
        new_states = np.transpose(np.asarray(new_states), axes=(0, 2, 3, 1))

        # Get importance weights from probabilities calculated earlier
        importance = 1/self.count * 1/sample_probabilities[[index - self.history_length for index in indices]]
        importance = importance / importance.max()

        return (states, self.actions[indices], self.rewards[indices], new_states, self.terminal_flags[indices]), importance, indices

    def set_priorities(self, indices, errors, offset=0.1):
        """Update priorities for PER
        Arguments:
            indices: Indices to update
            errors: For each index, the error between the target Q-vals and the predicted Q-vals
        """
        for i, e in zip(indices, errors):
            self.priorities[i] = abs(e) + offset
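
    # Typical PER cycle (sketch): sample a minibatch, compute the TD errors in the
    # learning step, then feed them back so high-error transitions are replayed
    # more often:
    #   batch, importance, idxs = buffer.get_minibatch(batch_size=32, priority_scale=0.7)
    #   ...compute td_error from batch...
    #   buffer.set_priorities(idxs, td_error)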

    def save(self, folder_name):
        """Save the replay buffer to a folder"""

        if not os.path.isdir(folder_name):
            os.mkdir(folder_name)

        np.save(folder_name + '/actions.npy', self.actions)
        np.save(folder_name + '/frames.npy', self.frames)
        np.save(folder_name + '/rewards.npy', self.rewards)
        np.save(folder_name + '/terminal_flags.npy', self.terminal_flags)

    def load(self, folder_name):
        """Loads the replay buffer from a folder"""
        self.actions = np.load(folder_name + '/actions.npy')
        self.frames = np.load(folder_name + '/frames.npy')
        self.rewards = np.load(folder_name + '/rewards.npy')
        self.terminal_flags = np.load(folder_name + '/terminal_flags.npy')


class Agent:


@@ 52,7 185,7 @@ class Agent:
        self.w = w
        self.online_net = create_dueling_dqn(h, w)
        self.target_net = create_dueling_dqn(h, w)
        self.replay_buffer = deque(maxlen=BUFFER_SIZE)
        self.replay_buffer = ReplayBuffer()

    def get_action(self, s, e):
        """Agent chooses an action."""


@@ 60,41 193,39 @@ class Agent:

        if rnd_sample <= e:
            return randint(0, len(ACTIONS)-1)
        return self.online_net.predict(tf.reshape(tf.convert_to_tensor(s), (-1, 21, 79, 1)))[0].argmax()
        return self.online_net.predict(s.reshape(-1, 21, 79, HISTORY_LEN))[0].argmax()

    def update_target_network(self):
        """Updates target network with the online network."""
        self.target_net.set_weights(self.online_net.get_weights())

    def learn(self, batch_size, gamma):  # god, I'm so tired.
    def learn(self, batch_size, gamma, e, priority_scale=1.0):  # god, I'm so tired.
        """Learns from replays."""
        minibatch = sample(self.replay_buffer, BATCH_SIZE)

        states = tf.constant([r[0] for r in minibatch])
        actions = tf.constant([r[1] for r in minibatch])
        rewards = tf.constant([r[2] for r in minibatch])
        dones = tf.constant([r[3] for r in minibatch])
        new_states = tf.constant([r[4] for r in minibatch])
        (states, actions, rewards, new_states, dones), importance, indices = self.replay_buffer.get_minibatch(batch_size=batch_size, priority_scale=priority_scale)
        importance = importance ** (1-e)  # importance sampling correction strengthens as epsilon decays

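        # Double DQN: the online network selects the best next action, while the
        # target network evaluates it, reducing Q-value overestimation.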
        arg_q_max = self.online_net.predict(tf.reshape(new_states, (batch_size, 21, 79, 1))).argmax(axis=1)
        arg_q_max = self.online_net.predict(new_states).argmax(axis=1)

        future_q_vals = self.target_net.predict(tf.reshape(new_states, (batch_size, 21, 79, 1)))
        future_q_vals = self.target_net.predict(new_states)
        double_q = future_q_vals[range(batch_size), arg_q_max]

        target_q = tf.cast(rewards, tf.float32) + (gamma * double_q * (1.0 - tf.cast(dones, tf.float32)))

        with tf.GradientTape() as tape:
            q_values = self.online_net(tf.reshape(states, (batch_size, 21, 79, 1)))
            q_values = self.online_net(states)

            one_hot_actions = tf.keras.utils.to_categorical(actions, len(ACTIONS), dtype=np.float32)
            q = tf.reduce_sum(tf.multiply(q_values, one_hot_actions), axis=1)

            error = q - target_q
            learn_loss = tf.keras.losses.MeanSquaredError()(target_q, q)
            learn_loss = tf.reduce_mean(learn_loss * importance)

            model_gradients = tape.gradient(learn_loss, self.online_net.trainable_variables)
            self.online_net.optimizer.apply_gradients(zip(model_gradients, self.online_net.trainable_variables))

            self.replay_buffer.set_priorities(indices, error)

        return float(learn_loss.numpy()), error

    def save(self, interval):


@@ 110,7 241,7 @@ class Agent:

def create_dueling_dqn(h, w) -> tf.keras.Model:
    """Creates a Dueling DQN."""
    net_input = tf.keras.Input(shape=(h, w, 1))
    net_input = tf.keras.Input(shape=(h, w, HISTORY_LEN))
    # net_input = tf.keras.layers.Lambda(lambda layer: layer / 255)(net_input)

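    # Convolutional feature extractor; further down, the dueling head splits into
    # separate value and advantage streams, recombined in the standard way
    # (Wang et al., 2016): Q(s, a) = V(s) + A(s, a) - mean_a' A(s, a')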
    conv1 = tf.keras.layers.Conv2D(32, (3, 3), strides=2, activation="relu", use_bias=False,


@@ 143,11 274,6 @@ def create_dueling_dqn(h, w) -> tf.keras.Model:
    return final_model


def create_rainbow_dqn(_h, _w):
    """Creates a Rainbow Deep Q-network."""
    pass


def save_checkpoint(model_sv: tf.keras.Model, interval, label) -> None:
    """Saves the model checkpoint with given interval."""
    model_sv.save_weights(CKPT_PATH.format(interval=interval, label=label))


@@ 164,6 290,11 @@ def load_checkpoint(model_ld: tf.keras.Model, interval, label) -> tf.keras.Model
if __name__ == "__main__":
    agent = Agent(21, 79)

    arg = PARSER.parse_args()

    if arg.interval:
        agent.load(arg.interval)

    writer = tf.summary.create_file_writer(LOG_DIR)

    CONFIG = {


@@ 179,38 310,40 @@ if __name__ == "__main__":
    intr = 0
    episode = 0
    all_rewards = []
    state = env.reset()
    new_state, reward, done, _ = env.step('.')
    for _ in range(4):
        agent.replay_buffer.append((state.gray_image(), 9, reward, done, new_state.gray_image()))
    state = new_state
    all_losses = []
    env.reset()
    new_state, rew, done, _ = env.step('.')
    agent.replay_buffer.add_experience(9, new_state.gray_image()[0], rew, done)
    current_game_state = np.repeat(new_state.gray_image().reshape(21, 79, 1), HISTORY_LEN, axis=2)  # with a history of 4

    # Main processing
    try:
        with writer.as_default():
            for step in itertools.count():
                epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])  # linearly anneal epsilon
                action = agent.get_action(state.gray_image(), epsilon)
                new_state, reward, done, _ = env.step(ACTIONS[action])
                episode_reward += reward
                all_rewards.append(reward)
                act = agent.get_action(current_game_state, epsilon)
                new_state, rew, done, _ = env.step(ACTIONS[act])
                episode_reward += rew
                all_rewards.append(rew)
                # Slide the frame stack: drop the oldest frame, append the newest.
                current_game_state = np.append(current_game_state[:, :, 1:], new_state.gray_image().reshape(21, 79, 1), axis=2)

                transition = (state.gray_image(), action, reward, done, new_state.gray_image())
                agent.replay_buffer.append(transition)
                state = new_state
                agent.replay_buffer.add_experience(act, new_state.gray_image()[0], rew, done)

                # Learning step
                if step % LEARNING_FREQUENCY == 0 and len(agent.replay_buffer) > MIN_REPLAY_SIZE:
                    loss, _ = agent.learn(BATCH_SIZE, GAMMA)
                if step % LEARNING_FREQUENCY == 0 and agent.replay_buffer.count > MIN_REPLAY_SIZE:
                    loss, _ = agent.learn(BATCH_SIZE, GAMMA, epsilon, PRIORITY_SCALE)
                    all_losses.append(loss)
                    tf.summary.scalar('Loss', loss, step)

                if step % TARGET_UPDATE_FREQUENCY == 0 and step > MIN_REPLAY_SIZE:
                    agent.update_target_network()

                tf.summary.scalar('Rewards per step', reward, step)
                if step % 10 == 0:
                    tf.summary.scalar('Reward', np.mean(all_rewards[-10:]), step)
                    tf.summary.scalar('Losses', np.mean(all_losses[-100:]), step)

                if done:
                    dlvl = state.dungeon_level
                    dlvl = new_state.dungeon_level
                    env.reset()
                    all_rewards.append(episode_reward)
                    tf.summary.scalar('Evaluation score', episode_reward, episode)

D logs/czr2023-04-26--00:18:20/events.out.tfevents.1682464704.CHIZURU-NOTE.198746.0.v2 => logs/czr2023-04-26--00:18:20/events.out.tfevents.1682464704.CHIZURU-NOTE.198746.0.v2 +0 -0
D logs/czr2023-04-26--00:19:05/events.out.tfevents.1682464745.CHIZURU-NOTE.198827.0.v2 => logs/czr2023-04-26--00:19:05/events.out.tfevents.1682464745.CHIZURU-NOTE.198827.0.v2 +0 -0
M writeup/Drescher-DGD-dissertation-2022-23.tex => writeup/Drescher-DGD-dissertation-2022-23.tex +44 -18
@@ 202,6 202,7 @@
    \subsection{Fundamentals of RL}\label{subsec:fundamentals}
    The fundamentals of reinforcement learning and many core algorithms for solving sequential decision problems are explained in detail by~\citet{sutton18}.
    The core idea behind reinforcement learning algorithms is that an agent performs \emph{actions} on an \emph{environment} by deriving what it should do from its \emph{policy}, which is a mapping from states to actions.
    This loop is visualised in Figure~\ref{fig:rlgraph}.
    Once the agent performs an action, it receives the new game state as well as a \emph{reward} signal, telling the agent how good its choice was.
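    This interaction can be sketched in pseudocode-style Python, where \texttt{env} and \texttt{agent} are illustrative stand-ins:
    \begin{lstlisting}[label={lst:rlloop}]
state = env.reset()
done = False
while not done:
    action = agent.policy(state)               # map state to action via the policy
    state, reward, done, _ = env.step(action)  # environment transitions
    agent.observe(reward)                      # reward signal drives learning
    \end{lstlisting}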

    \begin{figure}[ht]


@@ 226,12 227,15 @@
    This means the optimal value of taking action \(a\) in state \(s\) is the expected immediate reward plus the discounted value of following the optimal policy thereafter.

    \subsection{Deep Learning}\label{subsec:deep-learning}
    Deep learning is a method of artificial intelligence that involves the use of deep neural networks to process data in a way inspired by the human brain. % TODO improve introductory paragraph and make sure not plagiarising
    
    Deep learning is a method of artificial intelligence that involves the use of deep neural networks to process data in a way inspired by the human brain.
    A deep learning method can be supervised or unsupervised. % TODO look into this further
    Deep reinforcement learning is closer to unsupervised learning, as the input data - the game state - is not labelled.
    The agent must instead use the reward signal it receives to approximate an ideal policy.

    Representing Q-values for every state-action pairing becomes infeasible in large state spaces such as video
    games, as the tables required to store them grow without bound.
    Deep Q-learning, a technique by DeepMind~\citep{mnih15}, remedies this by using a convolutional neural network to
    approximate the optimal Q-function \(Q*(s, a)\) using a convolutional neural network instead of keeping track of
    approximate the optimal Q-function \(Q^*(s, a)\) instead of keeping track of
    a table.
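    Concretely, the network with parameters \(\theta\) is trained to minimise the loss from~\citet{mnih15},
    \[
        L(\theta) = \mathbb{E}\left[\left(r + \gamma \max_{a'} Q(s', a'; \theta^-) - Q(s, a; \theta)\right)^2\right],
    \]
    where \(\theta^-\) are the parameters of a periodically updated target network.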

    The Deep Q-network in their paper was shown to play several Atari games at a superhuman level, most notably Video Pinball and Boxing.


@@ 300,7 304,7 @@

    \begin{itemize}
        \item Monsters are disabled, so that combat is not part of the problem.
        \item The amount of actions available to the agent is reduced.
        \item The amount of actions available to the agent is reduced to the movement actions, search and wait.
        \item Initially we disabled hunger, so that the agent only needs to focus on descending the dungeon.
    \end{itemize}



@@ 335,7 339,9 @@
    \subsection{Agent Implementation}\label{subsec:implementation}
    The agent will be implemented in Python, one of the most popular languages for modelling neural networks due to the many AI-related libraries available for it, including TensorFlow, which is what we use.
    TensorFlow provides tools for working with linear algebra and is widely used in machine learning.
    We chose TensorFlow because it is popular, and it is bundled with Keras, a library that streamlines the creation and running of neural networks.
    We chose TensorFlow over alternatives such as PyTorch because it is one of the most popular frameworks for research,
    and it is bundled with Keras, a library that streamlines the creation and running of neural networks by providing
    easy-to-use tools for defining, tuning and training models.

    The agent will use Rogue-gym~\citep{kanagawa19} as an environment to interact with.
    Rogue-gym accurately replicates the gameplay of Rogue while also allowing the game to be customised.
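    The interface follows the OpenAI Gym conventions; the following minimal sketch uses only the calls that appear in \texttt{chizuru.py}, with environment construction options omitted:
    \begin{lstlisting}[label={lst:roguegym}]
from rogue_gym.envs import RogueEnv

env = RogueEnv()                        # configuration omitted for brevity
state = env.reset()
state, reward, done, _ = env.step('.')  # actions are Rogue key commands
frame = state.gray_image()[0]           # a (21, 79) grayscale array
    \end{lstlisting}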


@@ 356,8 362,10 @@
    \end{itemize}

    \subsection{Dueling DQN}\label{subsec:dueling-dqn}
    \subsubsection{First Run}

    \subsection{Rainbow DQN}\label{subsec:rainbow-dqn}
    \subsubsection{Second Run}

    \subsection{Summary}\label{subsec:summary}



@@ 421,21 429,39 @@
    \end{itemize}

    \subsection{Hyperparameters}\label{subsec:hyperparameters}
    \begin{lstlisting}[label={lst:hyperparameters}]
        GAMMA = 0.99
        NUM_ITERATIONS = 20000
        MAX_TURNS_IN_EPISODE = 1000
        BATCH_SIZE = 64
        BUFFER_SIZE = 200000
        MIN_REPLAY_SIZE = 400
        EPSILON_START = 1.0
        EPSILON_END = 0.01
        EPSILON_DECAY = 150000
        LEARNING_RATE = 0.00001
        UPDATE_FREQUENCY = 1000
    \subsubsection{Dueling DQN - First Run}
    \begin{lstlisting}[label={lst:ddqn1hyperparameters}]
GAMMA = 0.99
NUM_ITERATIONS = 20000
MAX_TURNS_IN_EPISODE = 1000
BATCH_SIZE = 32
BUFFER_SIZE = 200000
MIN_REPLAY_SIZE = 400
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 150000
LEARNING_RATE = 0.00001
LEARNING_FREQUENCY = 1000
TARGET_UPDATE_FREQUENCY = 1000
    \end{lstlisting}

    \subsubsection{Dueling DQN - Second Run}
    \begin{lstlisting}[label={lst:ddqn2hyperparameters}]
GAMMA = 0.99
NUM_ITERATIONS = 20000
MAX_TURNS_IN_EPISODE = 1250
BATCH_SIZE = 32
BUFFER_SIZE = 200000
MIN_REPLAY_SIZE = 1500
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 150000
LEARNING_RATE = 0.00001
LEARNING_FREQUENCY = 75
TARGET_UPDATE_FREQUENCY = 750
    \end{lstlisting}

    \subsection{Network Architecture}
    \subsection{Network Architecture}\label{subsec:network-architecture}
    \subsubsection{Dueling DQN}
    \begin{lstlisting}[label={lst:dueling}]
    net_input = tf.keras.Input(shape=(h, w, 1))