~melmon/chizuru-old

6628ad6080034517bf72f849d2ca58f0eca16d48 — Melmon 1 year, 5 months ago 37eca43
Pushing my problems to tomorrow.
M .gitignore => .gitignore +2 -0
@@ 540,3 540,5 @@ logs_dueling_per/
logs_rainbow/
training_dueling/
training_rainbow/
code-submission/
fil-result/
\ No newline at end of file

M chizuru.py => chizuru.py +44 -35
@@ 11,17 11,17 @@
#  ██║  ██║╚██████╔╝╚██████╔╝╚██████╔╝███████╗
#  ╚═╝  ╚═╝ ╚═════╝  ╚═════╝  ╚═════╝ ╚══════╝

# ******************************************************************************************************
# The following code was adapted from:
# Author: Sebastian Theiler
# Accessed from: https://github.com/sebtheiler/tutorials/blob/main/dqn/train_dqn.py
# Date of last retrieval: 26-04-2023
# ******************************************************************************************************
# ****************************************************************************************************** #
# The following code was adapted from:                                                                   #
# Author: Sebastian Theiler                                                                              #
# Accessed from: https://github.com/sebtheiler/tutorials/blob/main/dqn/train_dqn.py                      #
# Date of last retrieval: 26-04-2023                                                                     #
# ****************************************************************************************************** #

"""This file contains everything needed to run the chizuru-rogue AI."""

from rogue_gym.envs import RogueEnv
from random import random, randint
from random import randint
import tensorflow as tf
import datetime
import numpy as np


@@ 49,7 49,6 @@ EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 150000
LEARNING_RATE = 0.00001
LEARNING_FREQUENCY = 75
TARGET_UPDATE_FREQUENCY = 750
PRIORITY_SCALE = 0.7
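
# A hedged sketch (not present in this hunk): one common way to turn the
# EPSILON_* constants above into an annealing schedule is a simple linear
# interpolation over training steps. The helper name epsilon_at is
# illustrative and not taken from the repository.
def epsilon_at(step):
    """Linearly anneal epsilon from EPSILON_START to EPSILON_END over EPSILON_DECAY steps."""
    fraction = min(step / EPSILON_DECAY, 1.0)
    return EPSILON_START + fraction * (EPSILON_END - EPSILON_START)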



@@ 58,7 57,9 @@ class ReplayBuffer:
    """ReplayBuffer for storing transitions.
    This implementation was heavily inspired by Fabio M. Graetz's replay buffer
    here: https://github.com/fg91/Deep-Q-Learning/blob/master/DQN.ipynb"""
    def __init__(self, size=BUFFER_SIZE, input_shape=(21, 79), history_length=4):

    def __init__(self, size=BUFFER_SIZE, input_shape=(21, 79),
                 history_length=HISTORY_LEN):  # History length for n-step learning
        """
        Arguments:
            size: Integer, Number of stored transitions


@@ 72,9 73,9 @@ class ReplayBuffer:
        self.current = 0  # index to write to

        # Pre-allocate memory
        self.actions = np.empty(self.size, dtype=np.int32)
        self.actions = np.empty(self.size, dtype=np.uint8)
        self.rewards = np.empty(self.size, dtype=np.float32)
        self.frames = np.empty((self.size, self.input_shape[0], self.input_shape[1]), dtype=np.uint8)
        self.frames = np.empty((self.size, self.input_shape[0], self.input_shape[1]), dtype=np.float32)
        self.terminal_flags = np.empty(self.size, dtype=np.bool)
        self.priorities = np.zeros(self.size, dtype=np.float32)



@@ 95,8 96,9 @@ class ReplayBuffer:
        self.frames[self.current, ...] = frame
        self.rewards[self.current] = reward
        self.terminal_flags[self.current] = terminal
        self.priorities[self.current] = max(self.priorities.max(initial=0), 1)  # make the most recent experience important
        self.count = max(self.count, self.current+1)
        self.priorities[self.current] = max(self.priorities.max(initial=0),
                                            1)  # make the most recent experience important
        self.count = max(self.count, self.current + 1)
        self.current = (self.current + 1) % self.size

    def get_minibatch(self, batch_size=32, priority_scale=0.0):


@@ 115,18 117,19 @@ class ReplayBuffer:
            raise ValueError('Not enough memories to get a minibatch')

        # Get sampling probabilities from priority list
        scaled_priorities = self.priorities[self.history_length:self.count-1] ** priority_scale
        scaled_priorities = self.priorities[self.history_length:self.count - 1] ** priority_scale
        sample_probabilities = scaled_priorities / sum(scaled_priorities)

        # Get a list of valid indices
        indices = []
        for i in range(batch_size):
            while True:
                # Get a random number from history_length to maximum frame written with probabilities based on priority weights
                index = np.random.choice(np.arange(self.history_length, self.count-1), p=sample_probabilities)

                # Get a random number from history_length to maximum frame written with probabilities based on
                # priority weights
                index = np.random.choice(np.arange(self.history_length, self.count - 1), p=sample_probabilities)

                # We check that all frames are from same episode with the two following if statements.  If either are True, the index is invalid.
                # We check that all frames are from same episode with the two following if statements.  If either are
                # True, the index is invalid.
                if index >= self.current >= index - self.history_length:
                    continue
                if self.terminal_flags[index - self.history_length:index].any():


@@ 138,14 141,14 @@ class ReplayBuffer:
        states = []
        new_states = []
        for idx in indices:
            states.append(self.frames[idx-self.history_length:idx, ...])
            new_states.append(self.frames[idx-self.history_length+1:idx+1, ...])
            states.append(self.frames[idx - self.history_length:idx, ...])
            new_states.append(self.frames[idx - self.history_length + 1:idx + 1, ...])

        states = np.transpose(np.asarray(states), axes=(0, 2, 3, 1))
        new_states = np.transpose(np.asarray(new_states), axes=(0, 2, 3, 1))

        # Get importance weights from probabilities calculated earlier
        importance = 1/self.count * 1/sample_probabilities[[index - self.history_length for index in indices]]
        importance = 1 / self.count * 1 / sample_probabilities[[index - self.history_length for index in indices]]
        importance = importance / importance.max()

        return (states, self.actions[indices], self.rewards[indices], new_states, self.terminal_flags[indices]), importance, indices


@@ 155,6 158,7 @@ class ReplayBuffer:
        Arguments:
            indices: Indices to update
            errors: For each index, the error between the target Q-vals and the predicted Q-vals
            offset: Small constant added to each error so that no transition ends up with zero priority
        """
        for i, e in zip(indices, errors):
            self.priorities[i] = abs(e) + offset


@@ 180,6 184,7 @@ class ReplayBuffer:

class Agent:
    """Contains everything needed to manage the agent."""

    def __init__(self, h, w):
        self.h = h
        self.w = w


@@ 189,11 194,13 @@ class Agent:

    def get_action(self, s, e):
        """Agent chooses an action."""
        rnd_sample = random()
        rnd_sample = np.random.rand()

        if rnd_sample <= e:
            return randint(0, len(ACTIONS)-1)
        return self.online_net.predict(s.reshape(-1, 21, 79, HISTORY_LEN))[0].argmax()
            return randint(0, len(ACTIONS) - 1)
        reshaped = s.reshape((1, 21, 79, 4))
        q_vals = self.online_net.predict(reshaped)[0]
        return q_vals.argmax()

    def update_target_network(self):
        """Updates target network with the online network."""


@@ 201,15 208,16 @@ class Agent:

    def learn(self, batch_size, gamma, e, priority_scale=1.0):  # god, I'm so tired.
        """Learns from replays."""
        (states, actions, rewards, new_states, dones), importance, indices = self.replay_buffer.get_minibatch(batch_size=batch_size, priority_scale=priority_scale)
        importance = importance ** (1-e)
        (states, actions, rewards, new_states, dones), importance, indices = self.replay_buffer.get_minibatch(
            batch_size=batch_size, priority_scale=priority_scale)
        importance = importance ** (1 - e)

        arg_q_max = self.online_net.predict(new_states).argmax(axis=1)

        future_q_vals = self.target_net.predict(new_states)
        double_q = future_q_vals[range(batch_size), arg_q_max]

        target_q = tf.cast(rewards, tf.float32) + (gamma * double_q * (1.0 - tf.cast(dones, tf.float32)))
        target_q = rewards + (gamma * double_q * (1.0 - dones))

        with tf.GradientTape() as tape:
            q_values = self.online_net(states)
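            # The hunk is truncated here; as a hedged sketch (variable names below
            # are illustrative, not taken from the repository), a prioritised double
            # DQN loss step commonly continues by selecting the Q-values of the
            # actions actually taken, forming the TD error against target_q, and
            # weighting the squared error with the importance-sampling weights:
            one_hot_actions = tf.one_hot(tf.cast(actions, tf.int32), len(ACTIONS), dtype=tf.float32)
            q_taken = tf.reduce_sum(q_values * one_hot_actions, axis=1)
            error = q_taken - tf.cast(target_q, tf.float32)
            loss = tf.reduce_mean(tf.cast(importance, tf.float32) * tf.square(error))
            # The per-sample error would then be used to update the replay buffer's priorities.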


@@ 230,13 238,13 @@ class Agent:

    def save(self, interval):
        """Saves model at current interval."""
        save_checkpoint(self.online_net, intr, "online")
        save_checkpoint(self.online_net, interval, "online")
        save_checkpoint(self.target_net, interval, "target")

    def load(self, interval):
        """Loads model at given interval."""
        self.online_net = load_checkpoint(self.online_net, intr, "online")
        self.target_net = load_checkpoint(self.target_net, interval, "online")
        self.online_net = load_checkpoint(self.online_net, interval, "online")
        self.target_net = load_checkpoint(self.target_net, interval, "target")


def create_dueling_dqn(h, w) -> tf.keras.Model:


@@ 250,8 258,9 @@ def create_dueling_dqn(h, w) -> tf.keras.Model:
                                   kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2.))(conv1)
    conv3 = tf.keras.layers.Conv2D(64, (3, 3), strides=1, activation="relu", use_bias=False,
                                   kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2.))(conv2)
    noise = tf.keras.layers.GaussianNoise(0.1)(conv3)

    val, adv = tf.keras.layers.Lambda(lambda ww: tf.split(ww, 2, 3))(conv3)
    val, adv = tf.keras.layers.Lambda(lambda ww: tf.split(ww, 2, 3))(noise)

    val = tf.keras.layers.Flatten()(val)
    val = tf.keras.layers.Dense(1, kernel_initializer=tf.keras.initializers.VarianceScaling(scale=2.))(val)


@@ 319,7 328,7 @@ if __name__ == "__main__":
    env.reset()
    new_state, rew, done, _ = env.step('.')
    agent.replay_buffer.add_experience(9, new_state.gray_image()[0], rew, done)
    current_game_state = np.repeat(new_state.gray_image().reshape(21, 79, 1), HISTORY_LEN, axis=2)  # with a history of 4
    current_game_state = np.repeat(new_state.gray_image().reshape(21, 79, 1), HISTORY_LEN, axis=2)  # with a history of HISTORY_LEN frames

    # Main processing
    try:


@@ 332,12 341,13 @@ if __name__ == "__main__":
                all_rewards.append(rew)
                interval_rewards.append(rew)
                all_rewards = all_rewards[-10:]
                current_game_state = np.append(current_game_state[:, :, 1:], new_state.gray_image().reshape(21, 79, 1), axis=2)
                current_game_state = np.append(current_game_state[:, :, 1:], new_state.gray_image().reshape(21, 79, 1),
                                               axis=2)

                agent.replay_buffer.add_experience(act, new_state.gray_image()[0], rew, done)

                # Learning step
                if step % LEARNING_FREQUENCY == 0 and agent.replay_buffer.count > MIN_REPLAY_SIZE:
                if agent.replay_buffer.count > MIN_REPLAY_SIZE:
                    loss, _ = agent.learn(BATCH_SIZE, GAMMA, epsilon, PRIORITY_SCALE)
                    all_losses.append(loss)
                    all_losses = all_losses[-100:]


@@ 375,5 385,4 @@ if __name__ == "__main__":
        writer.close()
    env.close()


# †昇天†

M writeup/Drescher-DGD-dissertation-2022-23.tex => writeup/Drescher-DGD-dissertation-2022-23.tex +137 -62
@@ 9,6 9,7 @@
\usepackage[nottoc,notlof,notlot]{tocbibind}
\usepackage{amsfonts}
\usepackage[skip=10pt]{parskip}
\usepackage{float}

\definecolor{keyword}{rgb}{0,0,0.5}
\definecolor{number}{rgb}{0, 0, 1}


@@ 186,7 187,7 @@
        We will investigate improvements to the DQN algorithm and apply them to play Rogue.
        \item Experiment by using a Dueling DQN, then a Rainbow DQN, both improvements to the original DQN algorithm.
        We will conduct two experiments for this product - training the agent with a Dueling DQN and a Rainbow DQN.
        We will display, analyse and compare the results of the two experiments.
        We will analyse and compare the results of the two experiments.
    \end{itemize}

    \subsection{Summary}\label{subsec:summary1}


@@ 206,7 207,7 @@
    Once the agent performs an action, it receives the new game state as well as a \emph{reward} signal, telling the agent how good its choice was.

    \begin{figure}[ht]
        \caption[The reinforcement learning loop.]{The reinforcement learning loop. An agent is provided with a state, performs an action and is provided with a reward signal and the resulting state. Image credit~\citet{bhattrl}}
        \caption[The reinforcement learning loop.]{The reinforcement learning loop. An agent is provided with a state, performs an action and is provided with a reward signal and the resulting state. Image credit: \citet{bhattrl}}
        \centering
        \includegraphics[scale=0.4]{rlgraph}
        \label{fig:rlgraph}


@@ 221,16 222,14 @@
    In this algorithm, the agent keeps track of a table, mapping state-action pairs to its value.
    When the agent reaches a certain state, it consults its Q-table to determine the most valuable action to take.

    The goal of Q-learning is to find the optimal Q-function, which is defined by the Bellman optimality equation:
    \[Q^{*}(s, a) \quad = \quad \mathbb{E} [r + \gamma max_{a'} Q^{*}(s', a')]\]
    The goal of Q-learning is to find the optimal Q-function, which is defined by the Bellman equation:
    \[Q^{*}(s, a) = \mathbb{E} [r + \gamma \max_{a'} Q^{*}(s', a')]\]

    This means the optimal value of taking action \(a\) in state \(s\) is the expected immediate reward plus the discounted value of acting optimally from the resulting state.
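    In the tabular setting, this fixed point is approached with the standard Q-learning update rule
    \[Q(s, a) \leftarrow Q(s, a) + \alpha \left[r + \gamma \max_{a'} Q(s', a') - Q(s, a)\right]\]
    where \(\alpha\) is the learning rate.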

    \subsection{Deep Learning}\label{subsec:deep-learning}
    Deep learning is a method of artificial intelligence that involves the use of deep neural networks to process data in a way inspired by the human brain.
    A deep learning method can be supervised or unsupervised. % TODO look into this further
    Deep reinforcement learning is unsupervised, as the input data - the game state - is not labelled.
    The agent must use the reward signal it gains in order to approximate an ideal policy.
    This method can also be used in the field of reinforcement learning in order to approximate the value functions used by existing reinforcement learning methods.

    Representing Q-values for every state-action pairing becomes infeasible in large state spaces such as video
    games, requiring ever-expanding tables and the computational resources needed to store them.


@@ 257,12 256,38 @@
    This saves on computation time due to the intuition that it is not necessary to estimate action-values for each action.

    And finally, Rainbow DQN~\citep{hessel17}, which combines six different techniques to improve upon the Deep Q-network algorithm.
    These techniques are Double DQN~\citep{hasselt15}, Dueling DQN~\citep{wang16}, Prioritised Experience Replay~\citep{schaul16},
    Multi-step Learning~\citep[chap.~7.1]{sutton18}, Distributional RL~\citep{bellemare17} and Noisy Networks~\citep{fortunato19}.
    These techniques are Double DQN, Dueling DQN, Prioritised Experience Replay,
    Multi-step Learning, Distributional RL and Noisy Networks.

    Prioritised Experience Replay~\citep{schaul16} is a technique where experiences in the replay buffer are prioritised according to how valuable they are expected to be for training.
    This way, experiences with higher priority are sampled more often during training, allowing the agent to learn more efficiently.
    Priorities of experiences are adjusted over time so that the agent does not overfit to certain experiences.
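    Concretely, in~\citet{schaul16} a transition with priority \(p_i\) is sampled with probability
    \[P(i) = \frac{p_i^{\alpha}}{\sum_k p_k^{\alpha}}\]
    and its update is scaled by the importance-sampling weight \(w_i = \left(\frac{1}{N} \cdot \frac{1}{P(i)}\right)^{\beta}\), where \(\alpha\) controls how strongly priorities are used and \(\beta\) corrects for the bias introduced by non-uniform sampling.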

    Multistep Learning~\citep[chap.~7.1]{sutton18} is a technique in reinforcement learning that uses sequences of actions and rewards rather than just an individual transition for learning.
    This is in contrast to traditional Q-learning, which only takes an individual transition into account when training and calculating action values within the Markov Decision Process framework.
    In multistep learning, the learning target is instead built from the discounted rewards of the next \(n\) steps before bootstrapping from the value estimate \(n\) steps ahead.
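    Concretely, the \(n\)-step target takes the form
    \[G_t^{(n)} = \sum_{k=0}^{n-1} \gamma^{k} r_{t+k+1} + \gamma^{n} \max_{a'} Q(s_{t+n}, a')\]
    so that reward information is propagated \(n\) steps back in a single update rather than one step at a time.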

    Distributional reinforcement learning~\citep{bellemare17} differs from traditional RL by modelling the distribution of the random return rather than a single value for the expected return.
    The goal is to estimate the full probability distribution of the return instead of only its mean.
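    This idea is captured by the distributional Bellman equation \(Z(s, a) \stackrel{D}{=} R(s, a) + \gamma Z(S', A')\), where \(Z\) denotes the random return whose expectation is the familiar Q-value.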

    In standard neural networks, weights are deterministic, which means that a certain input will produce only one output.
    Noisy Networks~\citep{fortunato19} introduce a small amount of Gaussian noise within the weights.
    This noise is added because a purely deterministic network can cause the agent to get stuck in a suboptimal policy.
    Adding noise to a network can encourage the agent to explore in an efficient manner.
    According to the paper, the key insight is `a single change to the weight vector can induce a consistent and potentially very complex, state dependent change in policy over multiple time steps'.
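    In a noisy linear layer, the usual affine transformation \(y = wx + b\) is replaced by
    \[y = (\mu^{w} + \sigma^{w} \odot \varepsilon^{w})x + \mu^{b} + \sigma^{b} \odot \varepsilon^{b}\]
    where the \(\mu\) and \(\sigma\) parameters are learned and the \(\varepsilon\) terms are sampled noise, so the scale of the exploration noise is itself adjusted by gradient descent.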

    DeepMind found that combining the six improvements to the DQN algorithm into one vastly outperforms any of the individual improvements, as shown in Figure~\ref{fig:neuralnetperformance}.

    \begin{figure}[ht]
        \caption[Comparison of different neural network performance.]{Comparison of performance averaged over 57 Atari games for different neural networks. Image credit: \citet{hessel17}}
        \centering
        \includegraphics[scale=0.4]{neuralnetperformance}
        \label{fig:neuralnetperformance}
    \end{figure}

    When trying to create an agent that plays the online game Dota 2, \citet{berner19} used a Long Short-term Memory network.

    LSTMs\footnote{Long short-term memory network, type of "recurrent neural network" used in the field of deep learning capable of holding long-term dependencies.} were first defined by~\citet{hochreiter97} and improved upon in later works.
    LSTMs\footnote{Long short-term memory network, type of ``recurrent neural network'' used in the field of deep learning capable of holding long-term dependencies.} were first defined by~\citet{hochreiter97} and improved upon in later works.
    An LSTM is an extension of the ``recurrent'' neural network, where nodes use feedback connections to allow the network to ``remember'' information in the long term.
    This solves the problem that traditional neural networks have, where they can't store information that can be useful to them in the long term.



@@ 284,7 309,10 @@
    This allows the different agents that run simultaneously to learn from different situations to build a common cumulative reward.
    It also involves the work by~\citet{jaderberg16}.

    % TODO talk about rogue-gym here
    Another interface that has been created is Rogue-gym~\citep{kanagawa19}.
    Rogue-gym is a game that accurately replicates the gameplay of Rogue while also allowing customisation of the game.
    It also comes with a customised OpenAI Gym environment which allows AI agents written in Python to interact with the game.
    The paper also presents an agent trained with the Proximal Policy Optimisation algorithm~\citep{schulman17}.

    \subsection{Exploring Other Roguelikes}\label{subsec:exploring-other-roguelikes}
    Rogue is not the only roguelike that has been explored with machine learning.


@@ 293,7 321,6 @@
    The paper introduces a baseline model that they trained on the environment.
    The map and player status are processed separately, concatenated, and run through an LSTM and a regular layer to produce the policy.

    % SkillHack~\citep{matthews22}.
    An article by~\citet{izumiya21} explores how to involve the item inventory in the neural network system of a deep reinforcement learning agent with an attention-based approach.
    It is attention based as the system calculates a score for each item in an inventory using an ``attention function''.



@@ 320,20 347,9 @@

    \subsection{Neural Network}\label{subsec:neural-network}
    % TODO How are you going to compare Dueling DQN and Rainbow DQN? Is there a metric or multiple metrics? The tradeoff between metrics? Are they standard or ad hoc?
    The agent in our experiments will first utilise a Dueling DQN, then utilise a Rainbow DQN as described in the report by~\citet{hessel17}.
    The following techniques make up the Rainbow DQN:
    \begin{itemize}
        \item \textbf{Double Q-learning}: a technique applied to Deep Q-learning where double estimation is used
        with the goal to remedy a problem that the original DQN had where it would be biased to overestimate Q-values.
        \item \textbf{Prioritised Experience Replay}: a technique where experiences in the replay buffer are
        prioritised based on their expected learning progress.
        \item \textbf{Dueling networks}~\citep{schaul16}: an extension of Deep Q-learning that utilises the concept of Advantage.
        Dueling DQN splits the network into two streams - one to approximate state-value and one to approximate advantage.
        These streams are then aggregated to calculate the Q-values.
        \item \textbf{Multi-step learning}~\citep[chap.~7.1]{sutton18}: a technique where the agent learns from the cumulative reward over several steps as individual actions may not provide an immediate reward.
        \item \textbf{Distributional RL}: approximating distributions of returns rather than expected returns.
        \item \textbf{Noisy Networks}: applying random noise to the neural network's parameters in order to avoid overfitting.
    \end{itemize}
    The agent in our experiments will first utilise a base Dueling DQN, which will then be extended with Prioritised Experience Replay and finally extended with Noisy Networks and Multi-step Learning.
    Should time allow we will also introduce Distributional RL in order to create a full Rainbow DQN algorithm.
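    For reference, the dueling architecture combines its two streams as
    \[Q(s, a) = V(s) + \Big(A(s, a) - \frac{1}{|\mathcal{A}|}\sum_{a'} A(s, a')\Big)\]
    where subtracting the mean advantage keeps the value and advantage streams identifiable.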

    % TODO write about the benefits and drawbacks of both dueling and rainbow

    \subsection{Agent Implementation}\label{subsec:implementation}


@@ 344,13 360,19 @@
    easy-to-use tools for defining, tuning and training neural network models.

    The agent will use Rogue-gym~\citep{kanagawa19} as an environment to interact with.
    Rogue-gym is a game that accurately replicates the gameplay of Rogue while also allowing customisation of the game.
    It also comes with a customised OpenAI Gym environment which allows AI agents written in Python to interact with the game.
    
    \subsection{Experiments}\label{subsec:experiments} % TODO experiment discussion


    \subsection{Summary}\label{subsec:summary2} % TODO add
    \subsection{Summary}\label{subsec:summary2}
    In this section we have outlined the algorithms and techniques we will use to create our agent, what we will use to implement them and how we will conduct our experiments.
    We have outlined our reasoning as to why we will use a Deep Q-network, mainly because it is a well-known algorithm that is proven to work well on a variety of game environments.
    The algorithms we will use are improvements to the base Deep Q-network algorithm: Dueling DQN, DDQN with Prioritised Experience Replay and DDQN with PER and Noisy Networks.
    If time allows, we will implement Distributional RL to introduce a full Rainbow DQN to the environment.
    We will compare these algorithms to see how they perform when learning Rogue.

    \section{Agent Training and Results}\label{sec:agent-training-and-results}  % TODO things here after data collection
    The agent was trained and evaluated on multiple Nvidia GeForce RTX 2080 graphics cards using CUDA.
    \section{Agent Training and Results}\label{sec:agent-training-and-results}
    The agent was trained and evaluated on an Nvidia GeForce RTX 2080 graphics card using CUDA.

    Our training code was adapted from the work of~\citet{sebtheiler}.



@@ 362,39 384,96 @@
    \end{itemize}

    \subsection{Dueling DQN}\label{subsec:dueling-dqn}
    \subsubsection{First Run}
    In our first experiment, we ran our Dueling DQN for 13000 steps.
    From the results shown in Figure~\ref{fig:ddqn_interval_score}, we were unable to extract a satisfactory result.
    Our model's average reward per interval\footnote{One interval is 10000 steps.} stagnated around 0.02 and did not increase, apart from one outlier at Interval 3 with an average reward of 0.049.
    Since the model could not increase its average reward, we set out to improve it by tuning our hyperparameters and integrating Prioritised Experience Replay~\citep{schaul16} into our Dueling DQN for our second experiment.
    \begin{figure}[h]
        \caption[DDQN: Average reward per interval.]{Average reward per interval. One interval is 10000 steps.}
        \centering
        \includegraphics[scale=0.5]{interval_score_ddqn}
        \label{fig:ddqn_interval_score}
    \end{figure}

    \subsection{Dueling DQN with Prioritised Experience Replay}\label{subsec:dueling-dqn-with-prioritised-experience-replay}
    In our second experiment, we integrated Prioritised Experience Replay, another improvement to the DQN algorithm.
    As shown in Figure~\ref{fig:ddqn_per_interval_score}, we were also unable to extract a satisfactory result, with the average reward per interval stagnating over the entire training period.

    \begin{figure}[h]
        \caption[DDQN with PER: Average reward per interval.]{Average reward per interval. One interval is 10000 steps.}
        \centering
        \includegraphics[scale=0.5]{interval_score_ddqn_per}
        \label{fig:ddqn_per_interval_score}
    \end{figure}

    \subsection{Rainbow DQN}\label{subsec:rainbow-dqn}
    \subsubsection{Second Run}

    \subsection{Summary}\label{subsec:summary}

    \subsection{Dueling DQN with Prioritised Experience Replay, Noisy Networks and Multi-step Learning}\label{subsec:dueling-dqn-with-prioritised-experience-replay-and-noisy-networks}
    \textbf{Need to be added.}

    \section{Conclusion}\label{sec:conclusion}
    In this paper we have set out to improve upon to~\citet{asperti18}`s tests by utilising a Rainbow
    DQN to perform dungeon crawling in Rogue's randomly generated dungeons, while also using a Dueling DQN in order to
    provide a perspective on the performance on the Rainbow DQN.
    In this paper we have set out to improve upon the tests of~\citet{asperti18} and~\citet{kanagawa19} by utilising extensions to the DQN algorithm
    to perform dungeon crawling in Rogue's randomly generated dungeons, testing a combination of multiple improvements in order to
    provide a perspective on their performance.

    We have achieved the following in this article:
    \begin{itemize}
        \item Implementation of a Dueling DQN as well as two different improvements for learning the game Rogue
        \item Deployment of our neural network to run experiments
    \end{itemize}

    % TODO go into detail about the actual achievements

    However, while our goal was to achieve a successful improvement upon previous literature, our models did not achieve satisfactory results.
    The average reward per interval that our models attained did not increase as they learnt.
    This could be because more training was required, or because our model needs improvement, as we describe in depth in Section~\ref{subsec:future-work}.

    Our main challenge was the creation of the neural network.


    Another challenge we faced was the tuning of hyperparameters.
    We experimented with several configurations of hyperparameters, and the hyperparameters we used for our tests are noted in Section~\ref{subsec:hyperparameters}.
    Since we were unsuccessful in obtaining satisfactory results, we must improve upon how we tune our hyperparameters as described in Section~\ref{subsec:future-work}.

    Our work provides a framework to...

    This project was good to develop...

    % Explain where chizuru performed well, where it screwed up, and the most important aspect about it

    % Talk about the neural network here
    
    \subsection{Future work}\label{subsec:future-work}
    While

    % Talk about using a customised neural network to run on Nethack or Angband
%    \begin{figure}[ht]
%        \caption{The structure of the Chizuru neural network *OUTDATED.} % TODO outdated
%        \centering
%        \includegraphics[scale=0.5]{network_structure}
%        \label{fig:netwk}
%    \end{figure}

    \subsection{Summary}\label{subsec:summary4}
    In summary, this project was able to
    
    As we were unsuccessful in achieving a satisfactory result, the future work for this project aims to rectify this.
    Looking at what we have accomplished and read, we have identified four main areas of improvement for future work.

    The first is memory management of our program.
    During training of our agent, we ran into an issue where our program was gradually using up more and more memory on the system.
    This meant that training of our agent had to be interrupted periodically so that it would not impact other processes on the system, decreasing the efficiency of training.
    We ran the Fil memory profiler\footnote{\url{https://pythonspeed.com/fil/}} on our program and discovered that predicting an action to take for our agent was taking up the bulk of memory as can be seen in Figure~\ref{fig:fil}.
    In future work, an investigation should be performed into why these memory issues occur and how they can be mitigated.

    \begin{figure}[h]
        \caption[Fil Profiler result for DDQN with PER.]{Fil memory profiler result for our DDQN with PER. Larger bars with a deeper shade of red mean the line took up more memory.}
        \centering
        \includegraphics[scale=0.3]{fil}
        \label{fig:fil}
    \end{figure}

    The second is the reward function.
    The reward function currently provides a reward of 0 for moving in the map without collecting gold or descending stairs.
    In

    The third is the network architecture.

    The fourth is hyperparameter tweaking.
    As we did not obtain satisfactory results for our neural network, future work should include more research into which hyperparameters
    are used in which configurations of neural networks and environments.
    In addition, more experiments with different configurations of hyperparameters should also be performed.

    \subsection{Reflection}\label{subsec:reflection}

    % Write some bollocks on how RL works well on video games and how this can lead to real-world developments with this technology.
    % Write some bollocks on how RL works well on video games and how this can lead to real-world developments with this technology. End off on a positive note!


    %%%%% Everything after here is not counted in the word count. %%%%%


@@ 410,7 489,6 @@
    \addcontentsline{toc}{section}{Appendices}
    \section{Methods}\label{sec:methods}

    \subsection{Neural Network}\label{subsec:neural-network2}
    \subsection{State Representation}\label{subsec:state-representation}
    The state of the game is converted from a 21x79 grid of ASCII characters as displayed to a human player to a 21x79 grid of
    numbers each representing one character using rogue-gym's \texttt{state.gray\_image()} function.


@@ 420,7 498,7 @@
    \begin{itemize}
        \item Reward per movement: 0
        \item Reward for collecting gold: based on gold collected
        \item Reward for descending stairs: 50
        \item Reward for descending stairs: 100
    \end{itemize}

    \subsection{Hyperparameters}\label{subsec:hyperparameters}


@@ 473,10 551,12 @@ TARGET_UPDATE_FREQUENCY = 750
PRIORITY_SCALE = 0.7
    \end{lstlisting}

    \subsubsection{Dueling DQN/Noisy Networks}

    \subsection{Network Architecture}\label{subsec:network-architecture}
    \subsubsection{Dueling DQN}
    \begin{lstlisting}[label={lst:dueling}]
    net_input = tf.keras.Input(shape=(h, w, 1))
    net_input = tf.keras.Input(shape=(h, w, 4))
    net_input = tf.keras.layers.Lambda(lambda layer: layer / 255)(net_input)

    conv1 = tf.keras.layers.Conv2D(32, (3, 3), strides=2, activation="relu")(net_input)


@@ 504,14 584,9 @@ PRIORITY_SCALE = 0.7
    )

    return final_model
    \end{lstlisting} % XXX do i need to cite this?


    \subsubsection{Rainbow DQN}
    \end{lstlisting}

    \section{Results}\label{sec:results}
    \subsection{Dueling DQN}\label{subsec:dueling-dqn2}
    \subsubsection{Dueling DQN/Noisy Networks}

    \subsection{Rainbow DQN}\label{subsec:rainbow-dqn2}

\end{document}

M writeup/diss.bib => writeup/diss.bib +6 -0
@@ 156,3 156,9 @@
	title = "{train\_dqn.py}",
	year = {2020}
}

@article{schulman17,
	title = {Proximal Policy Optimization Algorithms},
	author = {John Schulman and Filip Wolski and Prafulla Dhariwal and Alec Radford and Oleg Klimov},
	year = {2017},
}
\ No newline at end of file

A writeup/img/fil.png => writeup/img/fil.png +0 -0
A writeup/img/interval_score_ddqn.png => writeup/img/interval_score_ddqn.png +0 -0
A writeup/img/interval_score_ddqn_per.png => writeup/img/interval_score_ddqn_per.png +0 -0
A writeup/img/losses_ddqn_per.png => writeup/img/losses_ddqn_per.png +0 -0
A writeup/img/neuralnetperformance.png => writeup/img/neuralnetperformance.png +0 -0