~melmon/chizuru-old

494b37a490be813b76363c1f9dc479adad2716d1 — Melmon 10 months ago fd3fda0 we_are_so_back
A fix I believe, not like I tested or anything.
8 files changed, 456 insertions(+), 316 deletions(-)

M Dockerfile
M README.md
M chizuru.py
M requirements.txt
D train.py
M writeup/Drescher-DGD-dissertation-2022-23.tex
M writeup/diss.bib
A writeup/img/rlgraph.png
M Dockerfile => Dockerfile +7 -2
@@ 4,6 4,11 @@ WORKDIR /chizuru
# Shell-form RUN commands are appended to this array as a single argument, so "-c" is
# required for bash to treat them as a command string rather than a script filename.
SHELL ["/bin/bash", "-c"]

COPY . .

RUN lscpi | grep -i nvidia
RUN pip3 -r requirements.txt
RUN python3 train.py
\ No newline at end of file
# -y keeps apt-get non-interactive; without it the build stops at the confirmation prompt.
# NOTE(review): on Debian/Ubuntu bases the venv module may ship separately as
# python3.7-venv -- confirm against the base image.
RUN apt-get update && apt-get install -y python3.7
RUN python3.7 -m venv ./venv
# Every RUN starts a fresh shell, so executing ./venv/bin/activate in its own layer has no
# lasting effect. Putting the venv's bin directory first on PATH "activates" it for all
# later RUN steps and for CMD (WORKDIR is /chizuru).
ENV PATH="/chizuru/venv/bin:$PATH"
# pip needs the "install" subcommand; "pip -r" is rejected as an unknown option.
RUN pip install -r requirements.txt

CMD ["python"]
\ No newline at end of file

M README.md => README.md +2 -0
@@ 14,6 14,8 @@ docker run
```
After that, it should be smooth sailing.

If you want to run this locally, you need to use Python 3.7. Rogue-gym does not install in higher Python versions, I think. It didn't work when I tried to install on Python 3.11.

## Files
The model is located in `chizuru.py`. The training file and logic is written in `train.py`, and the code for previewing how the AI plays is located in `preview.py`. Seeing is believing, after all.


M chizuru.py => chizuru.py +137 -88
@@ 13,96 13,93 @@
#  ██║  ██║╚██████╔╝╚██████╔╝╚██████╔╝███████╗
#  ╚═╝  ╚═╝ ╚═════╝  ╚═════╝  ╚═════╝ ╚══════╝
#
# All organic, free-range bits and bytes. Contains no artificial colours or flavourings. May contain bugs.
# Some of the code here is inspired by the work of https://github.com/sebtheiler/tutorials/blob/main/dqn/train_dqn.py

"""This file contains everything needed to run the Chizuru AI."""

import os
from rogue_gym.envs import RogueEnv
from random import choice, random
from collections import deque
import tensorflow as tf
import datetime
import numpy as np
import itertools

# Constants
ASCII_CHARNUM = 128
ENVIRONMENT = "rogueinabox"
LOG_INTERVAL = 200
CKPT_PATH = "training/czr-{epoch:04d}.ckpt"
CKPT_DIR = os.path.dirname(CKPT_PATH)


def create_model() -> tf.keras.Model:
    """Instantiates, compiles and returns the Chizuru model."""
    status_input = tf.keras.Input(shape=(64,))
    inv_input = tf.keras.Input(shape=(64,))
    equip_input = tf.keras.Input(shape=(64,))
    map_input = tf.keras.Input(shape=(21, 79), dtype=tf.int32)
    crop_input = tf.keras.Input(shape=(9, 9), dtype=tf.int32)

    status_net = tf.keras.layers.Dense(64, activation="relu")(status_input)

    inv_net = tf.keras.layers.Embedding(ASCII_CHARNUM, 64)(inv_input)  # replace this with attention maybe?
    inv_net = tf.keras.layers.Flatten()(inv_net)
    inv_net = tf.keras.layers.Dense(64, activation="relu")(inv_net)

    equip_net = tf.keras.layers.Embedding(ASCII_CHARNUM, 32)(equip_input)
    equip_net = tf.keras.layers.Flatten()(equip_net)
    equip_net = tf.keras.layers.Dense(32, activation="relu")(equip_net)

    map_net = tf.keras.layers.Embedding(ASCII_CHARNUM, 64, input_length=21 * 79)(map_input)
    map_net = tf.keras.layers.Conv2D(64, (3, 3), activation="relu", input_shape=(21, 79))(map_net)
    map_net = tf.keras.layers.MaxPooling2D((2, 2))(map_net)
    map_net = tf.keras.layers.Conv2D(64, (3, 3), activation="relu")(map_net)
    map_net = tf.keras.layers.Flatten()(map_net)

    crop_net = tf.keras.layers.Embedding(ASCII_CHARNUM, 64, input_length=9 * 9)(crop_input)
    crop_net = tf.keras.layers.Conv2D(48, (3, 3), activation="relu", input_shape=(9, 9))(crop_net)
    crop_net = tf.keras.layers.Flatten()(crop_net)

    collected = tf.keras.layers.Concatenate()([status_net, inv_net, equip_net, map_net, crop_net])

    # DNN after concat
    pre_dnn = tf.keras.layers.Dense(128, activation="relu")(collected)

    # LSTM
    pre_dnn = tf.keras.layers.Reshape((1, -1))(pre_dnn)
    lstm = tf.keras.layers.LSTM(128)(pre_dnn)

    # final DNN
    final_dnn = tf.keras.layers.Dense(128)(lstm)

    output = tf.keras.layers.Dense(21)(final_dnn)
    # COMMANDS
    # 0  : N MOVE (k)
    # 1  : E MOVE (l)
    # 2  : S MOVE (j)
    # 3  : W MOVE (h)
    # 4  : NE MOVE (u)
    # 5  : SE MOVE (n)
    # 6  : SW MOVE (b)
    # 7  : NW MOVE (y)
    # 8  : SEARCH (s)
    # 9  : WAIT (.)
    # 10 : EAT* (e)
    # 11 : QUAFF* (q)
    # 12 : READ* (r)
    # 13 : WIELD (WEAPON)* (w)
    # 14 : WEAR (ARMOUR)* (W)
    # 15 : TAKE OFF (ARMOUR) (T)
    # 16 : PUT ON (RING)* (p)
    # 17 : REMOVE (RING) (R)
    # 18 : THROW+* (t)
    # 19 : ZAP+* (z)
    # 20 : DROP* (d)
# NOTE(review): not referenced in the visible code; presumably the checkpoint cadence -- confirm.
EPISODES_PER_INTERVAL = 500
# Checkpoint filename template; the {interval} placeholder is filled in with str.format.
CKPT_PATH = "training/czr-{interval:04d}.ckpt"
# Timestamped (at import time) directory handed to tf.summary.create_file_writer.
LOG_DIR = "logs/czr" + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S")
# Key presses the agent may choose from; the advantage head has one output per entry.
ACTIONS = ['h', 'j', 'k', 'l', 'u', 'n', 'b', 'y', 's', '.']  # Movement actions, search and wait.

# Hyperparameters
# Forwarded to Agent.learn as `gamma`.
GAMMA = 0.99
# NOTE(review): not referenced in the visible code.
NUM_ITERATIONS = 20000
# NOTE(review): not referenced in the visible code; the environment is built with max_steps=500.
MAX_TURNS_IN_EPISODE = 3000
# Forwarded to Agent.learn as `batch_size`.
BATCH_SIZE = 64
# Presumably the intended replay-buffer capacity -- confirm against where the buffer is created.
BUFFER_SIZE = 200000
# Learning steps are skipped until the replay buffer holds more than this many transitions.
MIN_REPLAY_SIZE = 2000
# Epsilon is linearly interpolated from EPSILON_START to EPSILON_END over the first
# EPSILON_DECAY steps of the main loop and stays at EPSILON_END afterwards.
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 150000
# Learning rate given to the Adam optimizer in create_dueling_dqn.
LEARNING_RATE = 0.00001
# A learning step is attempted whenever the turn counter is a multiple of this value.
UPDATE_FREQUENCY = 10


class Agent:
    """Pairs an online Q-network with a target Q-network and picks the agent's actions."""

    def __init__(self, h, w):
        # Build order is kept: the online network is created first, then the target.
        online, target = create_dueling_dqn(h, w), create_dueling_dqn(h, w)
        self.online_net = online
        self.target_net = target

    def get_action(self, e, observation):
        """Choose the next action epsilon-greedily.

        A random draw no greater than ``e`` yields a random entry of ``ACTIONS``;
        otherwise the decision is delegated to the online network.
        """
        explore = random() <= e
        if not explore:
            # NOTE(review): relies on the online network exposing ``act``, which a plain
            # tf.keras.Model does not define -- confirm.
            return self.online_net.act(observation)
        return choice(ACTIONS)

    def update_target_network(self):
        """Copy the online network's current weights into the target network."""
        current_weights = self.online_net.get_weights()
        self.target_net.set_weights(current_weights)

    def learn(self, batch_size, gamma, turn_no):
        """Placeholder for the replay-based learning step; currently does nothing."""
        return None


def create_dueling_dqn(h, w) -> tf.keras.Model:
    """Creates and compiles a Dueling DQN over an ``h`` x ``w`` observation.

    The convolutional trunk's feature maps are split channel-wise into a value
    stream (a single scalar) and an advantage stream (one entry per element of
    ``ACTIONS``). The two are recombined as
    ``value + (advantage - mean(advantage))``.

    Args:
        h: height of the 2-D observation.
        w: width of the 2-D observation.

    Returns:
        A compiled ``tf.keras.Model`` taking a ``(batch, h, w)`` float tensor and
        producing ``(batch, len(ACTIONS))`` values.
    """
    net_input = tf.keras.Input(shape=(h, w), dtype=tf.float32)

    # Conv2D expects (batch, rows, cols, channels); the observation is single-channel,
    # so an explicit channel axis is added while the model's input stays (h, w).
    with_channel = tf.keras.layers.Reshape((h, w, 1))(net_input)

    conv1 = tf.keras.layers.Conv2D(32, (3, 3), activation="relu")(with_channel)
    mp1 = tf.keras.layers.MaxPooling2D((2, 2))(conv1)
    conv2 = tf.keras.layers.Conv2D(64, (3, 3), activation="relu")(mp1)
    mp2 = tf.keras.layers.MaxPooling2D((2, 2))(conv2)
    conv3 = tf.keras.layers.Conv2D(64, (3, 3), activation="relu")(mp2)

    # Halve the feature maps along the channel axis (axis 3): one half per stream.
    val, adv = tf.split(conv3, 2, 3)

    val = tf.keras.layers.Flatten()(val)
    val = tf.keras.layers.Dense(1)(val)

    adv = tf.keras.layers.Flatten()(adv)
    adv = tf.keras.layers.Dense(len(ACTIONS))(adv)

    # Mean advantage per sample; the lambda argument no longer shadows the `w` parameter.
    adv_mean = tf.keras.layers.Lambda(lambda t: tf.reduce_mean(t, axis=1, keepdims=True))

    output = tf.keras.layers.Add()([val, tf.keras.layers.Subtract()([adv, adv_mean(adv)])])

    # The model must be wired to the Input created above, not to the builtin `input`.
    final_model = tf.keras.Model(
        inputs=[net_input],
        outputs=[output]
    )

    final_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss=tf.keras.losses.MeanSquaredError(),
        # NOTE(review): a classification accuracy metric on regression-style outputs is
        # unlikely to be meaningful -- confirm whether it should stay.
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
    )

    return final_model


def get_crop(_map: list[list[int]]):  # TODO
    """Returns a 9x9 crop of the given Rogue map surrounding the player."""
def create_rainbow_dqn(_h, _w):
    """Placeholder for the Rainbow DQN builder; not implemented yet, so it yields ``None``."""
    return None


def save_checkpoint(model_sv: tf.keras.Model, epoch) -> None:
    """Saves the model checkpoint with given epoch."""
    model_sv.save_weights(CKPT_PATH.format(epoch=epoch))
    print("Epoch " + str(epoch) + " saved to " + CKPT_PATH.format(epoch=epoch) + "~")
def save_checkpoint(model_sv: tf.keras.Model, interval) -> None:
    """Writes the model's weights to the checkpoint file for ``interval`` and reports it."""
    target = CKPT_PATH.format(interval=interval)
    model_sv.save_weights(target)
    print("".join(["Episode ", str(interval), " saved to ", target, "~"]))


def load_checkpoint(model_ld: tf.keras.Model, interval) -> tf.keras.Model:
    """Loads the weights saved for a given interval into the model.

    Args:
        model_ld: model that receives the stored weights.
        interval: interval number used to fill in the ``CKPT_PATH`` template.

    Returns:
        ``model_ld`` itself, after ``load_weights`` has been applied to it.
    """
    # Resolve the template first: passing the raw CKPT_PATH would look for a file whose
    # name still contains the literal "{interval:04d}" placeholder.
    ckpt_file = CKPT_PATH.format(interval=interval)
    model_ld.load_weights(ckpt_file)
    print("File " + ckpt_file + " loaded to current model~")
    return model_ld


if __name__ == "__main__":
    # Training entry point: builds the agent and environment, then runs an endless
    # epsilon-greedy interaction loop until interrupted with Ctrl+C.
    agent = Agent(21, 79)

    tf.keras.utils.plot_model(agent.online_net, "stuff.png", show_shapes=True)

    writer = tf.summary.create_file_writer(LOG_DIR)

    # The buffer has to be able to hold more than MIN_REPLAY_SIZE transitions, otherwise
    # the learning condition in the loop below can never become true.
    replay_buffer = deque(maxlen=BUFFER_SIZE)

    CONFIG = {
        'width': 79, 'height': 21,
        'dungeon': {
            'style': 'rogue',
            'room_num_x': 3, 'room_num_y': 2
        },
        'enemies': []
    }
    env = RogueEnv(max_steps=500, stair_reward=50.0, config_dict=CONFIG)
    episode_reward = 0
    turn = 0
    all_rewards = []
    all_losses = []
    env.reset()
    done = False
    # A wait command ('.') is issued to obtain the first observation of the episode.
    state, _, _, _ = env.step('.')

    # Main processing
    try:
        with writer.as_default():
            for step in itertools.count():
                # Linear epsilon schedule; np.interp holds EPSILON_END once step
                # exceeds EPSILON_DECAY.
                epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])
                action = agent.get_action(epsilon, state)
                new_state, reward, done, _ = env.step(action)
                episode_reward += reward

                transition = (state, action, reward, done, new_state)
                replay_buffer.append(transition)
                state = new_state
                turn += 1

                # Learning step
                if turn % UPDATE_FREQUENCY == 0 and len(replay_buffer) > MIN_REPLAY_SIZE:
                    learn_result = agent.learn(BATCH_SIZE, GAMMA, turn)
                    # Agent.learn is still a stub returning None; only unpack a real result.
                    if learn_result is not None:
                        loss, _ = learn_result
                        all_losses.append(loss)

                if done:
                    env.reset()
                    # Refresh the observation the same way as at start-up, so the next
                    # transition does not begin from the finished episode's last state.
                    state, _, _, _ = env.step('.')
                    all_rewards.append(episode_reward)
                    episode_reward = 0
                    turn = 0

    except KeyboardInterrupt:
        print("Exiting~")
    finally:
        # Release the summary writer and the environment however the loop ends.
        writer.close()
        env.close()


# †昇天†

M requirements.txt => requirements.txt +4 -3
@@ 1,3 1,4 @@
tensorflow~=2.12.0rc1
numpy~=1.23.5
matplotlib~=3.7.1
\ No newline at end of file
tensorflow~=2.11.0
matplotlib~=3.5.3
numpy~=1.21.6
rogue-gym~=0.0.2
\ No newline at end of file

D train.py => train.py +0 -41
@@ 1,41 0,0 @@
#!/usr/bin/env python3

# This code is governed under the GNU General Public Licence v3.0.

"""Runs and trains the Chizuru agent again and again until the program is halted or until
epoch count reaches a provided number."""

from chizuru import create_model, save_checkpoint, load_checkpoint, get_crop
import tensorflow as tf
import datetime
import os

# Constants
LOG_DIR = "logs/czr" + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S")
TENSORBOARD_CALLBACK = tf.keras.callbacks.TensorBoard(log_dir=LOG_DIR, histogram_freq=1)
CKPT_PATH = "training/czr-{epoch:04d}.ckpt"
CKPT_DIR = os.path.dirname(CKPT_PATH)
BATCH_SIZE = 64
CKPT_CALLBACK = tf.keras.callbacks.ModelCheckpoint(
    filepath=CKPT_PATH,
    save_weights_only=True,
    verbose=1,
    save_freq=5*BATCH_SIZE
)

# Hyperparameters
NUM_ITERATIONS = 20000
BATCH_SIZE = 32
BUFFER_SIZE = 200000
MIN_REPLAY_SIZE = 2000
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 150000
TARGET_UPDATE_FREQUENCY = 10000
ALPHA = 1.0e-3
BETA1 = 0.9
BETA2 = 0.999


if __name__ == "__main__":
    pass

M writeup/Drescher-DGD-dissertation-2022-23.tex => writeup/Drescher-DGD-dissertation-2022-23.tex +253 -179
@@ 3,13 3,34 @@
\usepackage{natbib}
\usepackage{graphicx}
\usepackage{hyperref}
\usepackage{appendix}
\usepackage[page,toc,titletoc,title]{appendix}
\usepackage{listings}

\usepackage{color}
\usepackage[nottoc,notlof,notlot]{tocbibind}
\usepackage{amsfonts}
\usepackage[skip=10pt]{parskip}

\definecolor{keyword}{rgb}{0,0,0.5}
\definecolor{number}{rgb}{0, 0, 1}
\definecolor{string}{rgb}{0, 0.5, 0}
\definecolor{comment}{rgb}{0.5, 0.5, 0}
\definecolor{grey}{rgb}{0.5, 0.5, 0.5}
\graphicspath{ {./img/} }
\lstset{language=Python}
\setlength{\parskip}{3pt plus2pt minus2pt}
\setlength{\parindent}{12px}
\lstset{
    language=Python,
    basicstyle=\footnotesize\ttfamily,
    commentstyle=\color{comment},
    keywordstyle=\color{keyword},
    numberstyle=\color{number},
    stringstyle=\color{string},
    breaklines=true,
    frame=single,
    numbers=left,
    numbersep=5pt,
    numberstyle=\tiny\color{grey},
    showspaces=false,
    showstringspaces=false
}

\begin{document}
    \title{chizuru-rogue: Deep Learning for Dungeon Crawling}


@@ 47,83 68,196 @@

    \newpage

    \addcontentsline{toc}{subsection}{Abstract}
    \begin{abstract}
        Video games is one of the most popular problem domains to tackle with reinforcement learning due to the interesting complexity that can arise from simple sets of rules that many games provide.
        By training reinforcement learning models on video games and proving they are effective at solving problems, they can then be repurposed for other problems such as self-driving cars and healthcare.
        \noindent Video games are one of the most popular problem domains to tackle with reinforcement learning due to the interesting complexity that can arise from simple sets of rules that many games provide.
        By training reinforcement learning models on video games and proving they are effective at solving problems, they can then be repurposed for other problems such as self-driving cars to make decisions about speed and navigation,
        and healthcare to optimise and personalise treatment plans.

        In this article we introduce chizuru-rogue, which is a computer program designed to play the video game Rogue, a famous role-playing game that inspired the creation of the ``roguelike'' video game genre.
        \noindent In this article we introduce chizuru-rogue, which is a computer program designed to play Rogue, a well-known dungeon-crawling game regarded as the pioneer of the ``roguelike'' video game genre, which is characterised by randomly generated levels, turn-based gameplay and permanent character death.
        Rogue offers a unique problem to solve, requiring a player to solve partially observable, randomly generated levels.
        chizuru-rouge utilises a customised neural network that involves an LSTM for long-term and short-term memory to explore levels in Rogue, collect gold and reach the goal of collecting the Amulet of Yendor.

        TensorFlow will be used as a framework to implement the reinforcement learning agent.
        \noindent chizuru-rogue interfaces with Rogue-gym, a program that accurately mimics the gameplay of Rogue, and
        the agent utilises a Rainbow Deep Q-network to explore the dungeon, collect gold and reach the goal of collecting the Amulet of Yendor. % customised neural network that involves an LSTM for long-term and short-term memory to explore levels in Rogue

        \noindent TensorFlow will be used as a framework to implement the reinforcement learning agent.
        TensorFlow is a Python library that provides tools to streamline development of deep learning models.
    \end{abstract}

    {\footnotesize \textbf{Keywords - reinforcement learning, neural networks, deep q-learning, roguelikes, video games, machine learning}}

    \newpage

    \addcontentsline{toc}{subsection}{Acknowledgements}
    \begin{center}
        \section*{Acknowledgements}
        I would like to express my gratitude to my supervisor Dr. Jie Zhang for providing me with invaluable guidance
        and feedback on my dissertation throughout the year, my parents for their proofreading and the administrators
        of Hex, the university GPU cloud for providing me with computational power to run my model on.
    \end{center}

    \newpage

    \tableofcontents
    \listoffigures
    \listoftables
    % \listoftables

    \newpage

    \begin{center}
    \section*{Acknowledgements}
    To-do
    \pagenumbering{arabic}

        % I would like to express my sincere gratitude to the following people:
    \section{Introduction}\label{sec:introduction}
    Rogue is a 1980 computer game that belongs to a genre called ``roguelikes''.
    Roguelikes are characterised by challenging, turn based dungeon crawling gameplay, procedurally generated levels and permanent character death.
    They are inspired by Rogue.
    This genre of game offers a fascinating domain to apply reinforcement learning methods to due to the amount of strategies and gameplay styles that roguelike games allow.
    In these games, turns often resolve immediately, allowing for efficient training.

        % \vspace{3mm}
    The aim of chizuru-rogue is to apply deep reinforcement learning methods for an agent to learn and survive within Rogue.
    Reinforcement learning works by an agent using the current state to decide on an action to do, then observing a reward for that action, as well as a resulting new state.
    Reinforcement learning is suitable for this problem domain because games provide rewards for skilled play, allowing for training.
    The hope is chizuru-rogue will be able to play the game to a level where it can reach the Amulet of Yendor, the goal of the game.

        % \emph{Dr. Jie Zhang}
        
        % My project supervisor, for providing me with invaluable guidance and feedback on my dissertation throughout the year.
    Should the project prove successful, it can serve as a foundation for creating AI to play more complex
    roguelike games like NetHack\footnote{https://nethackwiki.com/wiki/NetHack}.

        % \vspace{3mm}
    \subsection{Rogue - the game}\label{subsec:rogue}
    \subsubsection{Objective}\label{subsubsec:objective}
    In Rogue, the player's main objective is to get a high score by descending the Dungeon of Doom to slay monsters, collect gold coins, retrieve the Amulet of Yendor and escape the dungeon with it alive.
    The game is turn based, which means the player can spend as long as they want thinking their next move before the game processes the environment.
    Figure~\ref{fig:rogsc} depicts an example screenshot of the game.

        % \emph{Mr. Raphael and Mrs. Christine Drescher}
        
        % My parents, for their proofreading and their encouragement.
    \begin{figure}[ht]
        \caption[A screenshot of an example Rogue game.]{A screenshot of an example Rogue game. The player is represented with \texttt{@}, stairs with \texttt{\%}, a kestrel enemy with \texttt{K} and gold with \texttt{*}.}
        \centering
        \includegraphics[scale=0.5]{rogue_screenshot}
        \label{fig:rogsc}
    \end{figure}

        % \vspace{5mm}
    \subsubsection{Environment}\label{subsubsec:environment}
    The dungeon's floors are randomly generated mazes made up of several rooms connected with corridors.
    Rooms may be empty or populated with several items or enemies, and one will have stairs to descend to the next floor,
    represented with the character \texttt{\%}.
    When the player starts a new run\footnote{A play-through of the game from start to finish.}, the player is placed in dungeon level 1 with basic equipment.

        % This project made use of Hex, the GPU Cloud in the Department of Computer Science at the University of Bath.
    \end{center}
    Rogue's environment is partially observable;
    the dungeon configuration is initially obscured to the player, revealing itself as the player moves around.

    \newpage
    The game tracks several stats that are always shown to the player:
    \begin{itemize}
        \item \textbf{Level} denotes the current dungeon level.
        \item \textbf{HP} (Hit Points) represents how much damage the player can take before death.
        The number in brackets is the player's maximum HP.
        \item \textbf{Str} (Strength) represents how strong the player is.
        The number in brackets is the player's maximum strength.
        \item \textbf{Gold} is how many gold coins the player has collected.
        Gold increases the player's final score.
        \item \textbf{Arm} (Armour) is the player's current armour rating.
        The higher the rating, the higher chance to avoid attacks.
        \item \textbf{Exp} shows the player's experience level and total experience points.
        When the player earns enough experience points, the player's experience level increases, increasing the player's maximum HP.
    \end{itemize}

    \pagenumbering{arabic}
    \subsubsection{Items}\label{subsubsec:items}
    There are a wide variety of items the player can use, such as potions, scrolls, weapons and armour.
    Some items need to be identified before the player knows what it will do.
    This can either be done by using a scroll of identify, or by blindly using or wearing the item, which may be risky as some items can have negative effects.

    \section{Introduction}\label{sec:introduction}
    TODO introduction goes here lmao
    \subsubsection{Combat}\label{subsubsec:combat}
    As the player navigates around the dungeon, they will encounter enemies of increasing difficulty.
    The player can attack enemies by moving into them, attacking them with the equipped weapon.
    Enemies in the game will try to harm the player by attacking and reducing the player's HP\@.
    If the player's HP falls to 0, the player dies and the game ends.

    \section{Literature, Technology and Data Review}\label{sec:literature-technology-and-data-review}
    Unlike many other role-playing games of the time, Rogue uses permanent character death as a mechanic, providing the player with
    the unique challenge of surviving till the end, as the player cannot load a previous save if they are defeated.
    Therefore, the player has to think through their future moves much more rigorously;
    the player's decisions have much more weight to them as a wrong move could mean game over.
    \emph{Michael Toy}, Rogue's co-creator, touched on the topic of permanent death in Roguelike Celebration 2016~\citep{gamasutra16} by saying `We were trying to make it more immersive by making things matter \ldots'.

    \subsection{Project Objectives}\label{subsec:objectives}
    The primary objectives of this project are as follows:
    \begin{itemize}
        \item Create a program that uses artificial intelligence to play Rogue.
        This will involve designing, developing and deploying the program to a GPU cloud for training an agent.
        \item Improve upon existing work for playing Rogue.
        As we will explain in Section~\ref{subsec:exploring-rogue}, existing literature has only applied the standard
        DQN\footnote{Deep Q-network: using neural networks to approximate Q-learning.} to Rogue.
        We will investigate improvements to the DQN algorithm and apply them to play Rogue.
        \item Experiment by using a Dueling DQN, then a Rainbow DQN, both improvements to the original DQN algorithm.
        We will conduct two experiments for this product - training the agent with a Dueling DQN and a Rainbow DQN.
        We will display, analyse and compare the results of the two experiments.
    \end{itemize}

    \subsection{Summary}\label{subsec:summary1}
    In this section we have introduced our problem domain Rogue, a dungeon crawling game that we will make our program explore.
    Beyond this section, Section~\ref{sec:literature-technology-and-data-review} is focused on the literature review of
    this project, collating and demonstrating previous work on the subjects that are covered on this project.
    Section~\ref{sec:design-and-methodology} will explain in detail the methodology we will use and how we will collect
    results from the upcoming experiments.
    Section~\ref{sec:agent-training-and-results} will focus on discussing the results of the experiments.

    \subsection{Fundamentals}\label{subsec:fundamentals}
    \section{Literature, Technology and Data Review}\label{sec:literature-technology-and-data-review}

    The fundamentals of reinforcement learning and many fundamental algorithms is explained in detail by~\citet{sutton18}.
    \subsection{Fundamentals of RL}\label{subsec:fundamentals}
    The fundamentals of reinforcement learning and many fundamental algorithms for solving sequential decision problems are explained in detail by~\citet{sutton18}.
    The core idea behind reinforcement learning algorithms is that an agent performs \emph{actions} on an \emph{environment} by deriving what it should do from its \emph{policy}, which is a mapping from states to actions.
    Once the agent performs an action, it receives the new game state as well as a \emph{reward} signal, telling the agent how good its choice was.

    \begin{figure}[ht]
        \caption[The reinforcement learning loop.]{The reinforcement learning loop. An agent is provided with a state, performs an action and is provided with a reward signal and the resulting state. Image credit: \citet{bhattrl}}
        \centering
        \includegraphics[scale=0.4]{rlgraph}
        \label{fig:rlgraph}
    \end{figure}

    The purpose of rewards is for the agent to constantly estimate a \emph{value function}.
    This function tells the agent either how profitable being in a certain state and following its policy is, or how profitable taking a certain action then following its policy is.
    The theory is that the agent should aim to maximise its reward over the long term.

    \subsubsection{Q-learning}
    One of the most well-known reinforcement learning algorithms is the Q-learning algorithm~\citep[chap.~6.5]{sutton18}.
    In this algorithm, the agent keeps track of a table, mapping state-action pairs to its value.
    When the agent reaches a certain state, it consults its Q-table to determine the most valuable action to take.

    The goal of Q-learning is to find the optimal Q-function, which is defined by the Bellman optimality equation:
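    \[Q^{*}(s, a) \quad = \quad \mathbb{E} [r + \gamma \max_{a'} Q^{*}(s', a')]\]

    This means the optimal value of taking action \(a\) in state \(s\) is the expected immediate reward \(r\) plus the value, discounted by \(\gamma\), of the best action \(a'\) available in the resulting state \(s'\); in other words, the value of taking the action and then acting optimally from there onwards.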

    \subsection{Deep Learning}\label{subsec:deep-learning}
    While the Q-learning algorithm can solve simple problem domains sufficiently, when it comes to more complex domains that don't have fully observable states such as Atari games, the amount of resources it takes to run the algorithm can become extremely large.
    The way that \citet{mnih15} chose to solve this is to use a convolutional neural network to approximate the Q-learning action-value functions, through an algorithm the authors call ``Deep Q-network''.
    Deep learning is a method of artificial intelligence that involves the use of deep neural networks to process data in a way inspired by the human brain. % TODO improve and make sure not plagiarising
    
    Representing Q-values containing every state-action pairing becomes infeasible in large state spaces such as video
    games, requiring ever expanding tables and computational space needed to store them.
    Deep Q-learning, a technique by DeepMind~\citep{mnih15}, remedies this by using a convolutional neural network to
    approximate the optimal Q-function \(Q^{*}(s, a)\) instead of keeping track of
    a table.

    The Deep Q-network in their writing was shown to play several Atari games to a superhuman level, most notably Video Pinball and Boxing.
    A similar algorithm involving neural networks was employed by~\citet{silver16} in their development of the AlphaGo system, an agent that was found to beat human grandmaster players in the game of Go. The authors used a convolutional neural network alongside ``policy gradient'' reinforcement learning, where
    A similar algorithm involving neural networks was employed by~\citet{silver16} in their development of the AlphaGo system, an agent that was found to beat human grandmaster players in the game of Go.
    The authors used a convolutional neural network alongside ``policy gradient'' reinforcement learning, which is where % TODO continue

    While the DQN algorithm by itself is serviceable for simpler problem domains such as Atari, there have been improvements to tackle more challenging domains.
    One of the first improvements to the DQN algorithm is the Double DQN~\citep{hasselt15}.
    Double DQN improves on the original DQN by using a current network for selecting actions, and then training a ``target network''
    to calculate the target Q-value of said action.
    This improves on the original DQN by solving an issue the original DQN had that the Double DQN paper explains in detail,
    where the original DQN suffered from ``substantial overestimations'' when playing Atari games.

    This is further improved with the Dueling DQN~\citep{wang16}.
    Dueling DQN works by splitting the existing DQN network into two streams: a state-value stream and an ``advantage'' stream.
    Advantage is a value describing how advantageous taking a given action would be given a state-value.
    These streams are then joined with an aggregation layer.
    This saves on computation time due to the intuition that it is not necessary to estimate action-values for each action.

    And finally, Rainbow DQN~\citep{hessel17}, which combines six different techniques to improve upon the Deep Q-network algorithm.
    These techniques are Double DQN~\citep{hasselt15}, Dueling DQN~\citep{wang16}, Prioritised Experience Replay~\citep{schaul16},
    Multi-step Learning~\citep[chap.~7.1]{sutton18}, Distributional RL~\citep{bellemare17} and Noisy Networks~\citep{fortunato19}.

    While the DQN algorithm by itself is serviceable for simpler problem domains such as Atari, there are better methods to tackle more challenging domains.
    When trying to create an agent that plays the online game Dota 2, \citet{berner19} used a Long Short-term Memory network.

    LSTMs were first defined by~\citet{hochreiter97} and improved upon in later works.
    LSTMs\footnote{Long short-term memory network, a type of ``recurrent neural network'' used in the field of deep learning that is capable of retaining long-term dependencies.} were first defined by~\citet{hochreiter97} and improved upon in later works.
    An LSTM is an extension of the ``recurrent'' neural network, where nodes use feedback connections to allow the network to ``remember'' information in the long term.
    This solves the problem that traditional neural networks have, where they can't store information that can be useful to them in the long term.



@@ 133,191 267,123 @@
    An expert system, as stated by~\citet{jackson86} in their book's introduction, ``is a computing system capable of representing and reasoning about some knowledge-rich domain''.
    Essentially, these systems aim to emulate a human expert in a particular domain and their decision-making.
    While expert systems are artificial intelligence, they make no use of machine learning to learn and adapt to situations; they follow the instructions that have been programmed into them and are designed to rigidly solve one problem domain.
    ROG-O-MATIC provides a good yardstick to measure the performance of the agent we will be creating.

    An interface for machine learning agents to play Rogue has been created, called Rogueinabox~\citep{asperti17}.
    Rogueinabox is a framework that allows developers to create agents that interface with the game Rogue.
    In the Rogueinabox article, the authors ran a Deep Q-learning agent on the game for testing.
    They simplified the problem domain to have the agent only consider exploring the dungeon to find the stairs, without fighting or collecting items.
    Their agent performed reasonably well when accounting for dungeon exploration alone; however, the aim of our agent is to reach the Amulet of Yendor and clear the game, which is difficult if the player does not fight monsters and get stronger.
    Their agent performed reasonably well when accounting for dungeon exploration alone; however, the aim of our agent is to reach the Amulet of Yendor on the final floor.

    The initial agent proposed in the original Rogueinabox paper was further improved upon~\citep{asperti18}.
    The problem domain was still simplified to only consider getting to the exit stairs alone.
    While the previous implementation employed a DQN, the agent in the improvement implemented an A3C algorithm as a base, rather than a DQN. The A3C algorithm in the improvement was partitioned, meaning the sample space is \emph{partitioned} into a set of situations.
    While the previous implementation employed a DQN, the agent in the improvement implemented an A3C~\citep{mnih15}\footnote{Asynchronous Advantage Actor Critic, an asynchronous algorithm that aims to optimise a policy and estimate a value function by training multiple actors in parallel.} algorithm as a base, rather than a DQN. The A3C algorithm in the improvement was partitioned, meaning the sample space is \emph{partitioned} into a set of situations.
    This allows the different agents that run simultaneously to learn from different situations to build a common cumulative reward.
    It also involves the work by~\citet{jaderberg16}.
    The A3C algorithm is first defined by~\citet{mnih15}.
    It is an asynchronous algorithm that aims to optimise a policy and estimate a value function by training multiple actors in parallel.

    \subsection{Exploring Other Roguelikes}\label{subsec:exploring-other-roguelikes}

    Rogue is not the only roguelike that has been explored with machine learning.
    NetHack is one of the most popular games that has been explored with neural networks.
    NetHack is a roguelike game created in 1987 and is still being updated to this day, with a small but dedicated player-base.
    NetHack, a popular roguelike game created in 1987, has been explored with machine learning during the development of the NetHack Learning Environment~\citep{kuttler20}.
    It is an environment that allows reinforcement learning agents to interact with NetHack easily.
    The paper introduces a baseline model that they trained on the environment.
    The map and player status are processed separately, concatenated, and run through an LSTM and a regular layer to produce the policy.

    SkillHack~\citep{matthews22}.
    NLE~\citep{kuttler20}.
    % SkillHack~\citep{matthews22}.
    An article by~\citet{izumiya21} explores how to involve the item inventory in the neural network system of a deep reinforcement learning agent with an attention-based approach.
    It is attention based as the system calculates a score for each item in an inventory using an ``attention function''.

    \section{Concepts}\label{sec:concepts}

    \subsection{Reinforcement Learning Concepts}\label{subsec:reinforcement-learning-concepts}

    \subsection{Rogue Concepts}\label{subsec:rogue-concepts}
    Rogue is a 1980 role-playing computer game inspired by text-based adventure games and tabletop role-playing games like Dungeons and Dragons that led to the creation of ``roguelikes'' - games that are based off of the core gameplay of Rogue.
    Roguelike games are mainly characterised by challenging, turn-based hack-and-slash gameplay, procedurally generated levels and permanent character death.

    \subsubsection{Objective}\label{subsubsec:objective}
    In Rogue, your main objective is to get a high score by descending the Dungeon of Doom to slay monsters, collect gold coins, retrieve the Amulet of Yendor and escape the dungeon with it alive.
    The game is turn-based, which means the player can spend as long as they want thinking about their next move before the game processes the environment.
    Figure~\ref{fig:rogsc} depicts an example screenshot of the game.

    \begin{figure}[t]
        \caption{A screenshot of an example Rogue game.}
        \centering
        \includegraphics[scale=0.5]{rogue_screenshot}
        \label{fig:rogsc}
    \end{figure}
    It is attention based as the system calculates a score for each item in an inventory using an ``attention function''.

    \subsubsection{Environment}\label{subsubsec:environment}
    \section{Design and Methodology}\label{sec:design-and-methodology}
    \subsection{Problem Simplification}\label{subsec:problem-simplification}
    Due to the complex nature of the game, we introduce some simplifications to the problem so that
    we may create more manageable solutions.

    Every floor of the dungeon is a randomly generated maze consisting of several rooms connected with corridors.
    Rooms sometimes generate empty, but they may also generate populated with several items or enemies.
    One of the rooms will contain the stairs that will let the player descend the dungeon, represented with the character \texttt{\%}.
    When the player starts a new run, the player is placed in dungeon level 1 with some food, a mace, basic armour, a bow and arrows.

    Rogue's environment is partially observable.
    The dungeon configuration is initially obscured to the player, revealing itself as the player moves around.
    In addition, enemies on the map will only be shown to the player if the enemy is within the player character's line of sight.

    The game tracks several stats that are always shown to the player:
    \begin{itemize}
        \item \textbf{Level} denotes the current dungeon level.
        \item \textbf{HP} (Hit Points) represents how much damage the player can take before death.
            The number in brackets is the player's maximum HP.
        \item \textbf{Str} (Strength) represents how strong the player is.
            The number in brackets is the player's maximum strength.
        \item \textbf{Gold} is how many gold coins the player has collected.
            Gold increases the player's final score.
        \item \textbf{Arm} (Armour) is the player's current armour rating.
            The higher the rating, the higher chance to avoid attacks.
        \item \textbf{Exp} shows the player's experience level and total experience points.
            When the player earns enough experience points, the player's experience level increases, increasing the player's maximum HP.
        \item Monsters are disabled, so that combat is not part of the problem.
        \item The number of actions available to the agent is reduced.
        \item Initially we disabled hunger, so that the agent only needs to focus on descending the dungeon.
    \end{itemize}

    \subsubsection{Items}\label{subsubsec:items}

    There are a wide variety of items the player can use, such as potions, scrolls, weapons and armour.
    Some items need to be identified before the player knows what they will do.
    This can either be done by using a scroll of identify, or by blindly using or wearing the item, which may be risky.
    Some potions have negative effects such as the potion of poison, and rings may be ``cursed''.
    Cursed rings may not be removed once equipped, and they reduce the player's stats.
    Curses can be removed with the scroll of remove curse.

    \subsubsection{Combat}\label{subsubsec:combat}
    As the player navigates around the dungeon, they will encounter enemies of increasing difficulty.
    Enemies in the game will try to harm the player by attacking and reducing the player's HP\@.

    The player can attack enemies by moving into them.
    This will make the player automatically hit the enemy with their equipped weapon.
    Each weapon in Rogue deals a different amount of damage, so it is important to find stronger weapons.

    If the player defeats an enemy, they are granted ``experience points''.
    When the player earns enough experience points to increase their player level, their HP increases, making them tougher.

    If the player's HP reaches 0, the player dies.
    The game will then provide the player with a scoreboard of the top 10 plays, each entry containing the name of the player, their score and their fate.
    The player will then have to start the game from the beginning, generating a new dungeon to explore.

    Unlike many other role-playing games of the time, Rogue uses permanent character death as a mechanic, providing the player with
    the unique challenge of surviving till the end, as the player cannot load a previous save if they are defeated.
    This makes the player think through their future moves much more rigorously as the player's decisions have much more weight to them.
    \emph{Michael Toy}, Rogue's co-creator, touched on the topic of permanent death in Roguelike Celebration 2016 in \citet{gamasutra16} by saying `We were trying to make it more immersive by making things matter \ldots ``this thing matters, so I'm going to think about this.'' '.


    % Rogue is a partially observable Markov Decision Process. To deal with this, we use a Long Short-term Memory system, an extension of a feedforward neural network, to process the sequence of observations. This is because LSTMs are capable of ``remembering'' information for longer periods of time. The LSTM algorithm was first defined by \citet{hochreiter97} and popularised much later, one example of an agent implementing a LSTM including AlphaStar by \citet{vinyals19}.

    % The goal of chizuru4rogue is to improve upon the work of \citet{asperti18} by introducing enemies and items into the Rogue world for the agent to deal with. The agent will endeavour to maximise the final score that it gets within one run. A run's final score is used as the reward for the reinforcement learning methods within the agent. A run's final score is determined by how much gold a player collects. The deeper a player ventures in the dungeon, the more gold they can collect. Additionally, the player gains a large score bonus if the game ends while the player possesses the Amulet of Yendor, an item found in dungeon level 26.

    % We use a combination of supervised learning and self-play. During the supervised learning portion of the learning process, we provide replays of Rog-o-Matic completing the game. During the self-play portion of the learning process, chizuru4rogue will play thousands of runs interfacing with Rogueinabox to receive game state and send actions.

    % Using only reinforcement learning is challenging, mainly due to the large action space that Rogue provides. Unlike most other video games where the actions you can perform is contextual, Rogue is a game where every single command is available to you at all times. This allows the player to develop a wide variety of strategies, but increases the overall complexity of the game. Additionally, some commands are combined with selecting an item from the player's inventory e.g. ``wear chain mail armour'', increasing the size of the action space in different contexts.



    \section{Network Architecture}\label{sec:network-architecture}

    The objective of the neural network for Chizuru is to take in the observed dungeon map, player status, recent message and inventory as inputs and return an action that will maximise the expected reward as output as if it were maximising an action-value function.

    \subsection{State}\label{subsec:state}

    We use the following information to represent game state:
    \begin{itemize}
        \item The player's status - HP, strength, EXP and other attributes.
        \item The game map, a 21 x 79 array of ASCII characters.
        \item A 9 x 9 crop of the map centred around the player
        \item The inventory of items.
        \item The items the player is equipped with
    \end{itemize}

    \subsection{Action}\label{subsec:action}
    The objective of the neural network for chizuru-rogue is to take in the observed dungeon state as inputs and return an action that will maximise the expected reward as output as if it were maximising an action-value function.

    \subsection{Player Action}\label{subsec:action}
    Every action in Rogue is available to the player from the start of the game.
    Actions can be divided into basic actions and actions that utilise an inventory item, depending on the action and the item type.
    For example,
    the ``eat'' action can be used to make the player eat something in the inventory.
    If the player attempts to use an action on an item where it wouldn't make sense, the action fails and a message is displayed to the player, such as `Ugh, you would get ill if you ate that'.

    When the player uses an action that utilises an item, the game will wait for the player to input a key.
    Every item in the player's inventory maps to one key on the keyboard.
    The player may input \texttt{*} in order to see what legal items they may choose and their corresponding key.
    Additionally, the player may see what the item to keyboard mapping is by viewing their inventory with the \texttt{i} key at any other point during the game.

    \subsection{Policy Optimisation}\label{subsec:policy-optimisation}
    The player may input \texttt{*} to see what legal items they may choose and their corresponding key.
    Additionally, the player may see what the item-key mapping is by viewing their inventory with the \texttt{i} key at any time.

    \subsection{Neural Network}\label{subsec:neural-network}
    The neural network processes the inputs in separate subnetworks, whose outputs are then concatenated and fed through an LSTM and a multilayer perceptron network to produce the final output.
    Figure~\ref{fig:netwk} visually shows the structure of the Chizuru neural network.

    \begin{figure}[t]
        \caption{The structure of the Chizuru neural network *OUTDATED.} % TODO outdated
        \centering
        \includegraphics[scale=0.5]{network_structure}
        \label{fig:netwk}
    \end{figure}

    \section{Implementation}\label{sec:implementation}
    The agent in our experiments will first utilise a Dueling DQN, then utilise a Rainbow DQN as described in the report by~\citet{hessel17}.
    The following techniques make up the Rainbow DQN:
    \begin{itemize}
        \item \textbf{Double Q-learning}: a technique applied to Deep Q-learning where double estimation is used
        with the goal to remedy a problem that the original DQN had where it would be biased to overestimate Q-values.
        \item \textbf{Prioritised Experience Replay}: a technique where experiences in the replay buffer are
        prioritised based on their expected learning progress.
        \item \textbf{Dueling networks}~\citep{wang16}: an extension of Deep Q-learning that utilises the concept of Advantage.
        Dueling DQN splits the network into two streams - one to approximate state-value and one to approximate advantage.
        These streams are then aggregated to calculate the Q-values.
        \item \textbf{Multi-step learning}~\citep[chap.~7.1]{sutton18}: a technique where the agent learns from the cumulative reward over several steps as individual actions may not provide an immediate reward.
        \item \textbf{Distributional RL}: approximating distributions of returns rather than expected returns.
        \item \textbf{Noisy Networks}: applying random noise to the neural network's parameters in order to drive exploration.
    \end{itemize}

    \subsection{Language}\label{subsec:language}
    \subsection{Agent Implementation}\label{subsec:implementation}
    The agent will be implemented in Python, which is one of the most popular languages used to model neural networks due to its many AI-related libraries that are available, including TensorFlow, which is what we use.
    TensorFlow provides tools for working with linear algebra and is widely used in machine learning.
    We chose TensorFlow because it is popular, and it is bundled with Keras, a library that streamlines the creation and running of neural networks.

    The agent will be implemented in Python.
    Python is one of the most popular languages used to model neural networks due to the large amount of artificial intelligence related libraries that are available for the language.
    The main library we will be using is TensorFlow with Keras.
    TensorFlow is a library that provides tools for working with linear algebra, mainly used in machine learning.
    Keras is a wrapper for TensorFlow that streamlines the creation of machine learning models in Python by providing the programmer with tools to construct machine learning models with ease.
    The agent will use Rogue-gym~\citep{kanagawa19} as an environment to interact with.
    Rogue-gym is a game that accurately replicates the gameplay of Rogue while also allowing customisation of the game.
    It also comes with a customised OpenAI Gym environment which allows AI agents written in Python to interact with the game.

    \subsection{Summary}\label{subsec:summary2} % TODO

    \section{Agent Training and Investigation}\label{sec:agent-training-and-investigation}
    \section{Agent Training and Results}\label{sec:agent-training-and-results}  % TODO things here after data collection
    \textbf{--- Everything below here is unfinished. ---}

    \subsection{Evaluation}\label{subsec:evaluation}
    The agent was trained and evaluated on multiple Nvidia GeForce RTX 2080 graphics cards using CUDA.

    During our training of the agent, we measured the agent's performance with the following criteria after every run:
    \begin{itemize}
        \item The final score the agent achieved
        \item The final score the agent achieved per episode (total reward)
        \item The deepest dungeon level the agent entered
    \end{itemize}

    \section{Conclusion and Future work}\label{sec:conclusion-and-future-work}
    \subsection{Dueling DQN}\label{subsec:dueling-dqn}

    \subsection{Rainbow DQN}\label{subsec:rainbow-dqn}

    \subsection{Summary}\label{subsec:summary}

    chizuru-rogue achieves its goal of being an improvement to~\citet{asperti18}'s simple navigation by being able to use items in game and fight monsters.
    \section{Conclusion}\label{sec:conclusion}
%    In this paper we have achieved the goal of being an improvement to~\citet{asperti18}`s tests by utilising a Rainbow
%    DQN to perform dungeon crawling in Rogue's randomly generated dungeons.

    For future developments of the model, we plan to use *** to *** because ***
    % Explain where chizuru performed well, where it screwed up, and the most important aspect about it

    \section{Reflection}\label{sec:reflection}
    \subsection{Results}\label{subsec:improvements}
    % Talk about the neural network here
    
    \subsection{Future work}\label{subsec:future-work}
    % Talk about using a customised neural network to run on Nethack or Angband
%    \begin{figure}[ht]
%        \caption{The structure of the Chizuru neural network *OUTDATED.} % TODO outdated
%        \centering
%        \includegraphics[scale=0.5]{network_structure}
%        \label{fig:netwk}
%    \end{figure}

    \subsection{Summary}\label{subsec:summary4}

    For future developments of the model, we plan to use\ldots
    
    \subsection{Reflection}\label{subsec:reflection}

    % Write some bollocks on how RL works well on video games and how this can lead to real-world developments with this technology.


    %%%%% Everything after here is not counted in the word count. %%%%%
    \medskip

    \bibliographystyle{agsm}


@@ 326,11 392,19 @@
    \medskip

    \appendix
    \section*{Appendices}
    \addcontentsline{toc}{section}{Appendices}
    \section{Methods}\label{sec:methods}

    \subsection{Neural Network}\label{subsec:neural-network2}
%    \begin{lstlisting}[label={lst:thing}]
%        if __name__ == "__main__":
%            print("Hello, world!")
%    \end{lstlisting}

    \subsection{State Representation}\label{subsec:state-representation}
    The state of the game is represented as a 21x79 grid of ASCII characters as displayed to a human player, an example
    of which is shown in Figure~\ref{fig:rogsc}.

    \subsection{Reward Representation}\label{subsec:reward-representation}


M writeup/diss.bib => writeup/diss.bib +53 -3
@@ 6,13 6,18 @@
}

@article{mauldin83,

	title="{ROG-O-MATIC}: A Belligerent Expert System",
	author="Mauldin, M. and Jacobson, G. and Appel, A. and Hamey, L.",
	year="1983",
	publisher="Carnegie Mellon University"
}

@article{hessel17,
	title={Rainbow: Combining Improvements in Deep Reinforcement Learning},
    author={Matteo Hessel and Joseph Modayil and Hado van Hasselt and Tom Schaul and Georg Ostrovski and Will Dabney and Dan Horgan and Bilal Piot and Mohammad Azar and David Silver},
    year={2017}
}

@article{asperti17,
	title="Rogueinabox: An Environment for Roguelike Learning",
	author="Asperti, A. and De Pieri, C. and Pedrini, G.",


@@ 84,12 89,13 @@
	author       = {Gamasutra},
	howpublished = {https://www.gamedeveloper.com/design/-i-rogue-i-co-creator-permadeath-was-never-supposed-to-be-about-pain-},
	title        = {Rogue co-creator: permadeath was never supposed to be 'about pain'},
	year         = {2016}
	year         = {2016},
	note = {[Online, Accessed 2023-04-24]},
}

@article{asperti18,
	author="Asperti, Andrea and Cortesi, Daniele and Sovrano, Francesco",
	title="Crawling in Rogue's dungeons with (partitioned) A3C",
	title="Crawling in Rogue's dungeons with (partitioned) {A3C}",
	year="2018",
}



@@ 132,3 138,47 @@
  pages={16--22},
  year={2017}
}

@article{hasselt15,
	title={Deep Reinforcement Learning with Double Q-learning},
	author={Hado van Hasselt and Arthur Guez and David Silver},
	year={2015},
}

@article{wang16,
	title={Dueling Network Architectures for Deep Reinforcement Learning},
    author={Ziyu Wang and Tom Schaul and Matteo Hessel and Hado van Hasselt and Marc Lanctot and Nando de Freitas},
    year={2016}
}

@article{kanagawa19,
      title={Rogue-Gym: A New Challenge for Generalization in Reinforcement Learning},
      author={Yuji Kanagawa and Tomoyuki Kaneko},
      year={2019}
}

@article{schaul16,
      title={Prioritized Experience Replay},
      author={Tom Schaul and John Quan and Ioannis Antonoglou and David Silver},
      year={2016}
}

@article{bellemare17,
      title={A Distributional Perspective on Reinforcement Learning},
      author={Marc G. Bellemare and Will Dabney and Rémi Munos},
      year={2017}
}

@article{fortunato19,
      title={Noisy Networks for Exploration},
      author={Meire Fortunato and Mohammad Gheshlaghi Azar and Bilal Piot and Jacob Menick and Ian Osband and Alex Graves and Vlad Mnih and Remi Munos and Demis Hassabis and Olivier Pietquin and Charles Blundell and Shane Legg},
      year={2019}
}

@misc{bhattrl,
	author = {Shweta Bhatt},
	title = {Reinforcement learning 101},
	note = {[Online, Accessed: 2023-04-24]},
	howpublished = {https://towardsdatascience.com/reinforcement-learning-101-e24b50e1d292},
	year = {2018},
}
\ No newline at end of file

A writeup/img/rlgraph.png => writeup/img/rlgraph.png +0 -0