~melmon/chizuru-old

e94aff43fb63ae5a0eba2f059e51be498d2844d0 — Melmon 1 year, 7 months ago b7d4cd7
apparently I did something...
3 files changed, 93 insertions(+), 40 deletions(-)

M Dockerfile
M chizuru.py
M writeup/Drescher-DGD-dissertation-2022-23.tex
M Dockerfile => Dockerfile +6 -5
@@ -1,14 +1,15 @@
# syntax=docker/dockerfile:1
FROM tensorflow/tensorflow:2.11.0-gpu
WORKDIR /chizuru
SHELL ["bash"]

COPY . .

RUN apt update && apt install -y software-properties-common && add-apt-repository -y ppa:deadsnakes/ppa && apt-get update && apt install -y python3.7
RUN python3.7 -m venv ./venv
RUN ./venv/bin/activate
RUN apt-get update \
    && apt-get install -y software-properties-common \
    && add-apt-repository -y ppa:deadsnakes/ppa \
    && apt-get update \
    && apt-get install -y python3.7
RUN pip install --upgrade pip
RUN pip install -r requirements.txt

CMD python3.7 chizuru.py
\ No newline at end of file
CMD ["python3.7", "chizuru.py"]
\ No newline at end of file

M chizuru.py => chizuru.py +10 -15
@@ -25,7 +25,7 @@ import numpy as np
import itertools

# Constants
EPISODES_PER_INTERVAL = 100
STEPS_PER_INTERVAL = 10000
CKPT_PATH = "training/czr-{interval:04d}-{label}.ckpt"
LOG_DIR = "logs/czr" + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S")
ACTIONS = ['h', 'j', 'k', 'l', 'u', 'n', 'b', 'y', 's', '.']  # Movement actions, search and wait.


@@ -34,9 +34,9 @@ ACTIONS = ['h', 'j', 'k', 'l', 'u', 'n', 'b', 'y', 's', '.']  # Movement actions
GAMMA = 0.99
NUM_ITERATIONS = 20000
MAX_TURNS_IN_EPISODE = 1000
BATCH_SIZE = 64
BATCH_SIZE = 32
BUFFER_SIZE = 200000
MIN_REPLAY_SIZE = 400
MIN_REPLAY_SIZE = 1500
EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 150000


@@ -160,8 +160,6 @@ def load_checkpoint(model_ld: tf.keras.Model, interval, label) -> tf.keras.Model
if __name__ == "__main__":
    agent = Agent(21, 79)

    tf.keras.utils.plot_model(agent.online_net, "stuff.png", show_shapes=True)

    writer = tf.summary.create_file_writer(LOG_DIR)

    CONFIG = {


@@ -172,10 +170,9 @@ if __name__ == "__main__":
        },
        'enemies': []
    }
    env = RogueEnv(max_steps=MAX_TURNS_IN_EPISODE, stair_reward=50.0, config_dict=CONFIG)
    env = RogueEnv(max_steps=MAX_TURNS_IN_EPISODE, stair_reward=100.0, config_dict=CONFIG)
    episode_reward = 0
    intr = 0
    saved = True
    episode = 0
    all_rewards = []
    all_losses = []


@@ -213,23 +210,21 @@ if __name__ == "__main__":
                    all_rewards.append(episode_reward)
                    tf.summary.scalar('Evaluation score', episode_reward, step)
                    tf.summary.scalar('Dungeon level', dlvl, step)
                    saved = False
                    episode_reward = 0
                    print('')
                    print('Episode', episode)
                    print('Average reward', np.mean(all_rewards))
                    print('\nEpisode', episode)
                    print('Reward this game', episode_reward)
                    print('Average reward this session', np.mean(all_rewards))
                    print('Epsilon', epsilon)
                    episode_reward = 0
                    episode += 1

                if episode % EPISODES_PER_INTERVAL == 0 and not saved:
                if step % STEPS_PER_INTERVAL == 0 and step > 0:
                    print('\nInterval', intr)
                    agent.save(intr)
                    intr += 1
                    saved = True

    except KeyboardInterrupt:
        print("Exiting~")
        writer.close()

    env.close()



M writeup/Drescher-DGD-dissertation-2022-23.tex => writeup/Drescher-DGD-dissertation-2022-23.tex +77 -20
@@ -78,7 +78,7 @@
        Rogue offers a unique problem to solve, requiring a player to solve partially observable, randomly generated levels.

        \noindent chizuru-rogue interfaces with Rogue-gym, a program that accurately mimics the gameplay of Rogue, and
        the agent utilises a Rainbow Deep Q-network to explore the dungeon collect gold and reach the goal of collecting the Amulet of Yendor. % customised neural network that involves an LSTM for long-term and short-term memory to explore levels in Rogue
        the agent utilises a Rainbow Deep Q-network to explore the dungeon, collect gold and reach the goal of collecting the Amulet of Yendor. % customised neural network that involves an LSTM for long-term and short-term memory to explore levels in Rogue

        \noindent TensorFlow will be used as a framework to implement the reinforcement learning agent.
        TensorFlow is a Python library that provides tools to streamline development of deep learning models.


@@ -119,7 +119,7 @@
    The hope is that chizuru-rogue will be able to play the game to a level where it can reach the Amulet of Yendor, the goal of the game.

    Should the project prove successful, it can serve as a foundation for creating AI to play more complex
    roguelike games like NetHack\footnote{https://nethackwiki.com/wiki/NetHack}.
    roguelike games like NetHack\footnote{\url{https://nethackwiki.com/wiki/NetHack}}.

    \subsection{Rogue - the game}\label{subsec:rogue}
    \subsubsection{Objective}\label{subsubsec:objective}


@@ -213,7 +213,7 @@

    The purpose of rewards is for the agent to constantly estimate a \emph{value function}.
    This function tells the agent either how profitable being in a certain state and following its policy is, or how profitable taking a certain action then following its policy is.
    The theory is that the agent should aim to maximise its reward over the long term.
    The theory is that the agent should aim to maximise its cumulative reward over the long term by tuning its policy accordingly.
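    One standard way to make this precise (the usual formulation from~\citet{sutton18}, not notation specific to chizuru-rogue) is the discounted return
    \[
        G_t = \sum_{k=0}^{\infty} \gamma^{k} R_{t+k+1},
    \]
    where \(\gamma \in [0, 1)\) is the discount factor (the \texttt{GAMMA} constant in \texttt{chizuru.py}) and \(R_{t+k+1}\) is the reward received \(k+1\) steps after time \(t\); the value function is then the expected value of \(G_t\) when following the policy.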

    \subsubsection{Q-learning}
    One of the most well-known reinforcement learning algorithms is the Q-learning algorithm~\citep[chap.~6.5]{sutton18}.


@@ -226,7 +226,7 @@
    This means the optimal value of taking action \(a\) in state \(s\) is the expected reward of taking the action and then following the policy from there.
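    Concretely, the tabular one-step update from~\citet{sutton18} (standard notation, reproduced here for reference rather than taken from chizuru-rogue) is
    \[
        Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \bigl[ r_{t+1} + \gamma \max_{a'} Q(s_{t+1}, a') - Q(s_t, a_t) \bigr],
    \]
    where \(\alpha\) is the learning rate and \(\gamma\) the discount factor.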

    \subsection{Deep Learning}\label{subsec:deep-learning}
    Deep learning is a method of artificial intelligence that involves the use of deep neural networks to process data in a way inspired by the human brain. % TODO improve and make sure not plagiarising
    Deep learning is a method of artificial intelligence that involves the use of deep neural networks to process data in a way inspired by the human brain. % TODO improve introductory paragraph and make sure not plagiarising
    
    Representing a Q-value for every state-action pairing becomes infeasible in large state spaces such as video
    games, as it requires ever-expanding tables and the computational space needed to store them.


@@ -236,14 +236,15 @@

    The Deep Q-network in their writing was shown to play several Atari games to a superhuman level, most notably Video Pinball and Boxing.
    A similar algorithm involving neural networks was employed by~\citet{silver16} in their development of the AlphaGo system, an agent that was found to beat human grandmaster players in the game of Go.
    The authors used a convolutional neural network alongside ``policy gradient'' reinforcement learning, which is where % TODO continue
    The authors used a convolutional neural network alongside ``policy gradient'' reinforcement learning, in which a parameterised policy is optimised directly using gradient methods.

    While the DQN algorithm by itself is serviceable for simpler problem domains such as Atari, there have been improvements to tackle more challenging domains.
    One of the first improvements to the DQN algorithm is the Double DQN~\citep{hasselt15}.
    Double DQN improves on the original DQN by using a current network for selecting actions, and then training a ``target network''
    to calculate the target Q-value of said action.
    One of the first improvements to the DQN algorithm is the Double DQN~\citep{hasselt15}, which improves on the original DQN by using a current network for selecting actions
    and then training a ``target network'' to calculate the target Q-value of said action.
    This improves on the original DQN by solving an issue the original DQN had that the Double DQN paper explains in detail,
    where the original DQN suffered from ``substantial overestimations'' when playing Atari games.
    where the original DQN suffered from ``substantial overestimations'' when playing Atari games, leading to poorer derived policies because
    DQN (and standard Q-learning) uses the same max value both to select and to evaluate an action.
    Using a target network decouples selection from evaluation, according to the paper.
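    Written out (this is the standard formulation from~\citet{hasselt15}, not a detail taken from the chizuru-rogue code), the Double DQN target for a transition \((s_t, a_t, r_{t+1}, s_{t+1})\) is
    \[
        y_t = r_{t+1} + \gamma\, Q_{\theta^-}\bigl(s_{t+1}, \arg\max_{a} Q_{\theta}(s_{t+1}, a)\bigr),
    \]
    where \(\theta\) denotes the online network's parameters and \(\theta^-\) the target network's: the online network selects the action and the target network evaluates it.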

    This is further improved with the Dueling DQN~\citep{wang16}.
    Dueling DQN works by splitting the existing DQN network into two streams: a state-value stream and an ``advantage'' stream.
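    The two streams are then recombined into Q-values; the aggregation used by~\citet{wang16}, and mirrored by the network listing in the appendix, subtracts the mean advantage before adding the value stream:
    \[
        Q(s, a) = V(s) + \Bigl( A(s, a) - \frac{1}{|\mathcal{A}|} \sum_{a'} A(s, a') \Bigr).
    \]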


@@ -262,7 +263,6 @@
    This solves the problem that traditional neural networks have, where they can't store information that can be useful to them in the long term.

    \subsection{Exploring Rogue}\label{subsec:exploring-rogue}

    The first notable instance of a program being developed to play Rogue was by~\citet{mauldin83}, where they created ``ROG-O-MATIC'', an expert system that plays Rogue.
    An expert system, as stated by~\citet{jackson86} in their book's introduction, ``is a computing system capable of representing and reasoning about some knowledge-rich domain''.
    Essentially, these systems aim to emulate a human expert in a particular domain and their decision-making.


@@ -280,6 +280,8 @@
    This allows the different agents that run simultaneously to learn from different situations to build a common cumulative reward.
    It also involves the work by~\citet{jaderberg16}.

    % TODO talk about rogue-gym here

    \subsection{Exploring Other Roguelikes}\label{subsec:exploring-other-roguelikes}
    Rogue is not the only roguelike that has been explored with machine learning.
    NetHack, a popular roguelike game created in 1987, has been explored with machine learning during the development of the NetHack Learning Environment~\citep{kuttler20}.


@@ -313,6 +315,7 @@
    Additionally, the player may see what the item-key mapping is by viewing their inventory with the \texttt{i} key at any time.

    \subsection{Neural Network}\label{subsec:neural-network}
    % TODO How are you going to compare Dueling DQN and Rainbow DQN? Is there a metric or multiple metrics? The tradeoff between metrics? Are they standard or ad hoc?
    The agent in our experiments will first utilise a Dueling DQN, then utilise a Rainbow DQN as described in the report by~\citet{hessel17}.
    The following techniques make up the Rainbow DQN:
    \begin{itemize}


@@ -327,6 +330,7 @@
        \item \textbf{Distributional RL}: approximating distributions of returns rather than expected returns.
        \item \textbf{Noisy Networks}: applying parametric noise to the neural network's weights in order to aid exploration (a sketch of the noisy linear layer follows this list).
    \end{itemize}
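    For reference, the noisy linear layer described by~\citet{hessel17} replaces a standard dense layer \(y = wx + b\) with
    \[
        y = (\mu^{w} + \sigma^{w} \odot \varepsilon^{w})\,x + (\mu^{b} + \sigma^{b} \odot \varepsilon^{b}),
    \]
    where the \(\mu\) and \(\sigma\) parameters are learned and the noise \(\varepsilon\) is resampled; this is the published formulation, shown here for context rather than taken from the chizuru-rogue code.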
    % TODO write about the benefits and drawbacks of both dueling and rainbow

    \subsection{Agent Implementation}\label{subsec:implementation}
    The agent will be implemented in Python, one of the most popular languages for modelling neural networks due to the many AI-related libraries available for it, including TensorFlow, which we use.


@@ -337,11 +341,9 @@
    Rogue-gym is a game that accurately replicates the gameplay of Rogue while also allowing customisation of the game.
    It also comes with a customised OpenAI Gym environment which allows AI agents written in Python to interact with the game.
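    As an illustration, interaction with the environment follows the familiar Gym reset/step loop.
    The sketch below reuses the \texttt{RogueEnv} constructor arguments from \texttt{chizuru.py}; the import path, the string action encoding and the exact return values of \texttt{reset} and \texttt{step} are assumptions about Rogue-gym's API rather than details taken from this repository.
    \begin{lstlisting}[label={lst:rogue-gym-sketch}]
    # Illustrative sketch only; exact Rogue-gym API details are assumed.
    import random
    from rogue_gym.envs import RogueEnv  # import path assumed

    ACTIONS = ['h', 'j', 'k', 'l', 'u', 'n', 'b', 'y', 's', '.']
    # CONFIG is the dungeon configuration dict built in chizuru.py (omitted here).
    env = RogueEnv(max_steps=1000, stair_reward=50.0, config_dict=CONFIG)

    state = env.reset()
    done = False
    while not done:
        action = random.choice(ACTIONS)                 # random policy, for illustration
        state, reward, done, info = env.step(action)    # Gym-style step (signature assumed)
    env.close()
    \end{lstlisting}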

    \subsection{Summary}\label{subsec:summary2} % TODO
    \subsection{Summary}\label{subsec:summary2} % TODO add

    \section{Agent Training and Results}\label{sec:agent-training-and-results}  % TODO things here after data collection
    \textbf{--- Everything below here is unfinished. ---}

    The agent was trained and evaluated on multiple Nvidia GeForce RTX 2080 graphics cards using CUDA.

    Much of our code was inspired by the work of~\citet{sebtheiler}.


@@ -360,15 +362,17 @@
    \subsection{Summary}\label{subsec:summary}

    \section{Conclusion}\label{sec:conclusion}
    In this paper we have achieved the goal of being an improvement to~\citet{asperti18}`s tests by utilising a Rainbow
    DQN to perform dungeon crawling in Rogue's randomly generated dungeons.
    In this paper we have set out to improve upon~\citet{asperti18}'s tests by utilising a Rainbow
    DQN to perform dungeon crawling in Rogue's randomly generated dungeons, while also using a Dueling DQN in order to
    provide a point of comparison for the Rainbow DQN's performance.

    % Explain where chizuru performed well, where it screwed up, and the most important aspect about it

    \subsection{Results}\label{subsec:improvements}
    % Talk about the neural network here
    
    \subsection{Future work}\label{subsec:future-work}
    While

    % Talk about using a customised neural network to run on Nethack or Angband
%    \begin{figure}[ht]
%        \caption{The structure of the Chizuru neural network *OUTDATED.} % TODO outdated


@@ -378,8 +382,7 @@
%    \end{figure}

    \subsection{Summary}\label{subsec:summary4}

    For future developments of the model, we plan to use\ldots
    In summary, this project was able to
    
    \subsection{Reflection}\label{subsec:reflection}



@@ -410,13 +413,67 @@
    of which is shown in Figure~\ref{fig:rogsc}.

    \subsection{Reward Representation}\label{subsec:reward-representation}
    The reward signals are as follows (the environment call that sets the stair reward is shown after the list):
    \begin{itemize}
        \item Reward per movement: 0
        \item Reward for collecting gold: based on gold collected
        \item Reward for descending stairs: 50
    \end{itemize}
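    The stair reward, for example, is passed straight to the environment constructor in \texttt{chizuru.py}:
    \begin{lstlisting}[label={lst:reward-config}]
    env = RogueEnv(max_steps=MAX_TURNS_IN_EPISODE, stair_reward=50.0, config_dict=CONFIG)
    \end{lstlisting}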

    \subsection{Hyperparameters}\label{subsec:hyperparameters}
    \begin{lstlisting}[label={lst:hyperparameters}]
        GAMMA = 0.99
        NUM_ITERATIONS = 20000
        MAX_TURNS_IN_EPISODE = 1000
        BATCH_SIZE = 64
        BUFFER_SIZE = 200000
        MIN_REPLAY_SIZE = 400
        EPSILON_START = 1.0
        EPSILON_END = 0.01
        EPSILON_DECAY = 150000
        LEARNING_RATE = 0.00001
        UPDATE_FREQUENCY = 1000
    \end{lstlisting}
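    The constants \texttt{EPSILON\_START}, \texttt{EPSILON\_END} and \texttt{EPSILON\_DECAY} describe an epsilon-greedy schedule that anneals exploration over the first \texttt{EPSILON\_DECAY} steps; a linear ramp is one common choice, and the helper below sketches that interpretation (the function name and the use of \texttt{np.interp} are illustrative, not taken from \texttt{chizuru.py}):
    \begin{lstlisting}[label={lst:epsilon-schedule}]
    import numpy as np

    def epsilon_by_step(step: int) -> float:
        # Linearly anneal epsilon from EPSILON_START down to EPSILON_END
        # over the first EPSILON_DECAY environment steps, then hold it there.
        return float(np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END]))
    \end{lstlisting}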

    \subsection{Network Architecture}
    \subsubsection{Dueling DQN}
    \begin{lstlisting}[label={lst:dueling}]
    net_input = tf.keras.Input(shape=(h, w, 1))
    scaled = tf.keras.layers.Lambda(lambda layer: layer / 255)(net_input)  # normalise inputs; keep net_input as the model's Input

    conv1 = tf.keras.layers.Conv2D(32, (3, 3), strides=2, activation="relu")(scaled)
    conv2 = tf.keras.layers.Conv2D(64, (3, 3), strides=1, activation="relu")(conv1)
    conv3 = tf.keras.layers.Conv2D(64, (3, 3), strides=1, activation="relu")(conv2)

    val, adv = tf.keras.layers.Lambda(lambda ww: tf.split(ww, 2, 3))(conv3)

    val = tf.keras.layers.Flatten()(val)
    val = tf.keras.layers.Dense(1)(val)

    adv = tf.keras.layers.Flatten()(adv)
    adv = tf.keras.layers.Dense(len(ACTIONS))(adv)

    reduced = tf.keras.layers.Lambda(lambda ww: tf.reduce_mean(ww, axis=1, keepdims=True))

    output = tf.keras.layers.Add()([val, tf.keras.layers.Subtract()([adv, reduced(adv)])])

    final_model = tf.keras.Model(net_input, output)

    final_model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE),
        loss=tf.keras.losses.MeanSquaredError(),
        metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
    )

    return final_model
    \end{lstlisting} % XXX do i need to cite this?


    \subsubsection{Rainbow DQN}

    \section{Results}\label{sec:results}
    \subsection{Dueling DQN}\label{subsec:dueling-dqn2}

    \section{Data}\label{sec:data}
    \subsection{Rainbow DQN}\label{subsec:rainbow-dqn2}

\end{document}