~melmon/chizuru-old

83ceed3dbe39e3f602253141387c93d4e9724908 — Melmon 10 months ago e3f595a
IT WORKS?
3 files changed, 47 insertions(+), 89 deletions(-)

M chizuru.py
M writeup/Drescher-DGD-dissertation-2022-23.tex
M writeup/diss.bib
M chizuru.py => chizuru.py +36 -47
@@ 1,5 1,3 @@
# This code is governed under the GNU General Public Licence v3.0.
#
#   ██████╗██╗  ██╗██╗███████╗██╗   ██╗██████╗ ██╗   ██╗
#  ██╔════╝██║  ██║██║╚══███╔╝██║   ██║██╔══██╗██║   ██║
#  ██║     ███████║██║  ███╔╝ ██║   ██║██████╔╝██║   ██║


@@ 13,12 11,13 @@
#  ██║  ██║╚██████╔╝╚██████╔╝╚██████╔╝███████╗
#  ╚═╝  ╚═╝ ╚═════╝  ╚═════╝  ╚═════╝ ╚══════╝
#
# Much of the code here is inspired by the work of https://github.com/sebtheiler/tutorials/blob/main/dqn/train_dqn.py
# Much of the code here is inspired by the work of https://github.com/sebtheiler/tutorials/blob/main/dqn/train_dqn.py,
# especially the code for agent learning.

"""This file contains everything needed to run the chizuru-rogue AI."""

from rogue_gym.envs import RogueEnv
from random import choice, random, sample
from random import random, sample, randint
from collections import deque
import tensorflow as tf
import datetime


@@ 26,7 25,7 @@ import numpy as np
import itertools

# Constants
EPISODES_PER_INTERVAL = 500
EPISODES_PER_INTERVAL = 100
CKPT_PATH = "training/czr-{interval:04d}-{label}.ckpt"
LOG_DIR = "logs/czr" + datetime.datetime.now().strftime("%Y-%m-%d--%H:%M:%S")
ACTIONS = ['h', 'j', 'k', 'l', 'u', 'n', 'b', 'y', 's', '.']  # Movement actions, search and wait.


@@ 34,7 33,7 @@ ACTIONS = ['h', 'j', 'k', 'l', 'u', 'n', 'b', 'y', 's', '.']  # Movement actions
# Hyperparameters
GAMMA = 0.99
NUM_ITERATIONS = 20000
MAX_TURNS_IN_EPISODE = 3000
MAX_TURNS_IN_EPISODE = 1000
BATCH_SIZE = 64
BUFFER_SIZE = 200000
MIN_REPLAY_SIZE = 400


@@ 42,7 41,7 @@ EPSILON_START = 1.0
EPSILON_END = 0.01
EPSILON_DECAY = 150000
LEARNING_RATE = 0.00001
UPDATE_FREQUENCY = 5
UPDATE_FREQUENCY = 1000


class Agent:


@@ 52,22 51,15 @@ class Agent:
        self.w = w
        self.online_net = create_dueling_dqn(h, w)
        self.target_net = create_dueling_dqn(h, w)
        self.replay_buffer = deque(maxlen=1000)
        self.replay_buffer = deque(maxlen=BUFFER_SIZE)

    def get_action(self, e):
    def get_action(self, s, e):
        """Agent chooses an action."""
        rnd_sample = random()

        history = [self.replay_buffer[-1][4].gray_image(),
                   self.replay_buffer[-2][4].gray_image(),
                   self.replay_buffer[-3][4].gray_image(),
                   self.replay_buffer[-4][4].gray_image()]

        history = tf.convert_to_tensor(history)

        if rnd_sample <= e:
            return choice(ACTIONS)
        return self.online_net.predict(tf.reshape(history, (-1, 21, 79, 4)))[0].argmax()
            return randint(0, len(ACTIONS)-1)
        return self.online_net.predict(tf.reshape(tf.convert_to_tensor(s), (-1, 21, 79, 1)))[0].argmax()

    def update_target_network(self):
        """Updates target network with the online network."""


@@ 75,59 67,56 @@ class Agent:

    def learn(self, batch_size, gamma):  # god, I'm so tired.
        """Learns from replays."""
        minibatch = sample(self.replay_buffer, batch_size)
        minibatch = sample(self.replay_buffer, BATCH_SIZE)

        states = tf.constant([r[0].gray_image() for r in minibatch])
        states = tf.constant([r[0] for r in minibatch])
        actions = tf.constant([r[1] for r in minibatch])
        rewards = tf.constant([r[2] for r in minibatch])
        dones = tf.constant([r[3] for r in minibatch])
        new_states = tf.constant([r[4].gray_image() for r in minibatch])
        new_states = tf.constant([r[4] for r in minibatch])

        arg_q_max = self.online_net.predict(tf.reshape(new_states, (batch_size, 21, 79, 4))).argmax(axis=1)
        arg_q_max = self.online_net.predict(tf.reshape(new_states, (batch_size, 21, 79, 1))).argmax(axis=1)

        future_q_vals = self.target_net.predict(tf.reshape(new_states, (batch_size, 21, 79, 4)))
        future_q_vals = self.target_net.predict(tf.reshape(new_states, (batch_size, 21, 79, 1)))
        double_q = future_q_vals[range(batch_size), arg_q_max]

        target_q = rewards + (gamma * double_q * (1 - dones))
        target_q = tf.cast(rewards, tf.float32) + (gamma * double_q * (1.0 - tf.cast(dones, tf.float32)))

        with tf.GradientTape() as tape:
            q_values = self.online_net(states)
            q_values = self.online_net(tf.reshape(states, (batch_size, 21, 79, 1)))

            one_hot_actions = tf.keras.utils.to_categorical(actions, len(ACTIONS), dtype=np.float32)
            Q = tf.reduce_sum(tf.multiply(q_values, one_hot_actions), axis=1)
            q = tf.reduce_sum(tf.multiply(q_values, one_hot_actions), axis=1)

            error = Q - target_q
            learn_loss = tf.keras.losses.Huber()(target_q, Q)
            error = q - target_q
            learn_loss = tf.keras.losses.Huber()(target_q, q)

            model_gradients = tape.gradient(learn_loss, self.online_net.trainable_variables)
            self.online_net.optimizer.apply_gradients(model_gradients, self.online_net.trainable_variables)
            self.online_net.optimizer.apply_gradients(zip(model_gradients, self.online_net.trainable_variables))

        return float(learn_loss.numpy()), error

    def save(self, intr):
    def save(self, interval):
        """Saves model at current interval."""
        save_checkpoint(self.online_net, intr, "online")
        save_checkpoint(self.target_net, interval, "target")

    def load(self, intr):
    def load(self, interval):
        """Loads model at given interval."""
        self.online_net = load_checkpoint(self.online_net, intr, "online")
        self.target_net = load_checkpoint(self.target_net, interval, "online")


def reshape_state(state):
    pass

def create_dueling_dqn(h, w, history_length=4) -> tf.keras.Model:
def create_dueling_dqn(h, w) -> tf.keras.Model:
    """Creates a Dueling DQN."""
    net_input = tf.keras.Input(shape=(h, w, history_length))
    net_input = tf.keras.Input(shape=(h, w, 1))
    net_input = tf.keras.layers.Lambda(lambda layer: layer / 255)(net_input)

    conv1 = tf.keras.layers.Conv2D(32, (3, 3), strides=2, activation="relu")(net_input)
    conv2 = tf.keras.layers.Conv2D(64, (3, 3), strides=1, activation="relu")(conv1)
    conv3 = tf.keras.layers.Conv2D(64, (3, 3), strides=1, activation="relu")(conv2)

    val, adv = tf.keras.layers.Lambda(lambda w: tf.split(w, 2, 3))(conv3)
    val, adv = tf.keras.layers.Lambda(lambda ww: tf.split(ww, 2, 3))(conv3)

    val = tf.keras.layers.Flatten()(val)
    val = tf.keras.layers.Dense(1)(val)


@@ 135,7 124,7 @@ def create_dueling_dqn(h, w, history_length=4) -> tf.keras.Model:
    adv = tf.keras.layers.Flatten()(adv)
    adv = tf.keras.layers.Dense(len(ACTIONS))(adv)

    reduced = tf.keras.layers.Lambda(lambda w: tf.reduce_mean(w, axis=1, keepdims=True))
    reduced = tf.keras.layers.Lambda(lambda ww: tf.reduce_mean(ww, axis=1, keepdims=True))

    output = tf.keras.layers.Add()([val, tf.keras.layers.Subtract()([adv, reduced(adv)])])



@@ 183,9 172,9 @@ if __name__ == "__main__":
        },
        'enemies': []
    }
    env = RogueEnv(max_steps=500, stair_reward=50.0, config_dict=CONFIG)
    env = RogueEnv(max_steps=MAX_TURNS_IN_EPISODE, stair_reward=50.0, config_dict=CONFIG)
    episode_reward = 0
    interval = 0
    intr = 0
    saved = True
    episode = 0
    all_rewards = []


@@ 193,19 182,20 @@ if __name__ == "__main__":
    state = env.reset()
    new_state, reward, done, _ = env.step('.')
    for _ in range(4):
        agent.replay_buffer.append((state, '.', reward, done, new_state))
        agent.replay_buffer.append((state.gray_image(), 9, reward, done, new_state.gray_image()))
    state = new_state

    # Main processing
    try:
        with writer.as_default():
            for step in itertools.count():
                epsilon = np.interp(step, [0, EPSILON_DECAY], [EPSILON_START, EPSILON_END])
                action = agent.get_action(epsilon)
                new_state, reward, done, _ = env.step(action)
                action = agent.get_action(state.gray_image(), epsilon)
                new_state, reward, done, _ = env.step(ACTIONS[action])
                episode_reward += reward
                all_rewards.append(reward)

                transition = (state, action, reward, done, new_state)
                transition = (state.gray_image(), action, reward, done, new_state.gray_image())
                agent.replay_buffer.append(transition)
                state = new_state



@@ 232,11 222,10 @@ if __name__ == "__main__":
                    episode += 1

                if episode % EPISODES_PER_INTERVAL == 0 and not saved:
                    agent.save(interval)
                    interval += 1
                    agent.save(intr)
                    intr += 1
                    saved = True


    except KeyboardInterrupt:
        print("Exiting~")
        writer.close()

M writeup/Drescher-DGD-dissertation-2022-23.tex => writeup/Drescher-DGD-dissertation-2022-23.tex +4 -2
@@ 205,7 205,7 @@
    Once the agent performs an action, it receives the new game state as well as a \emph{reward} signal, telling the agent how good its choice was.

    \begin{figure}[ht]
        \caption[The reinforcement learning loop.]{The reinforcement learning loop. An agent is provided with a state, performs an action and is provided with a reward signal and the resulting state. Image credit: \citet{bhattrl}}
        \caption[The reinforcement learning loop.]{The reinforcement learning loop. An agent is provided with a state, performs an action and is provided with a reward signal and the resulting state. Image credit:~\citet{bhattrl}}
        \centering
        \includegraphics[scale=0.4]{rlgraph}
        \label{fig:rlgraph}


@@ 344,8 344,10 @@

    The agent was trained and evaluated on multiple Nvidia GeForce RTX 2080 graphics cards using CUDA.

    Much of our code was inspired by the work of~\citet{sebtheiler}.

    During our training of the agent, we measured the agent's performance with the following criteria after every run:
    Much of our code was inspired by the work of~\citet{sebtheiler}

    \begin{itemize}
        \item The final score the agent achieved per episode (total reward)
        \item The deepest dungeon level the agent entered

M writeup/diss.bib => writeup/diss.bib +7 -40
@@ 1,10 1,3 @@
@online{deepmgo,
	title="AlphaGo",
	author="DeepMind",
	url="https://www.deepmind.com/research/highlighted-research/alphago",
	urldate="2022-11-04"
}

@article{mauldin83,
	title="{ROG-O-MATIC}: A Belligerent Expert System",
	author="Mauldin, M. and Jacobson, G. and Appel, A. and Hamey, L.",


@@ 38,15 31,6 @@
	publisher={MIT Press}
}

@article{vinyals19,
	title="Grandmaster level in StarCraft II using multi-agent reinforcement learning",
	author="Vinyals, O. and Babuschkin, I. and Czarnecki, W. M. and others",
	year="2019",
	journal="Nature",
	volume="575",
	pages="350--354"
}

@article{berner19,
	title={Dota 2 with large scale deep reinforcement learning},
	author={Berner, Christopher and Brockman, Greg and Chan, Brooke and Cheung, Vicki and others},


@@ 99,12 83,6 @@
	year="2018",
}

@article{matthews22,
	author="Matthews, M. and Samvelyan, M. and Parker-Holder, J. and Grefenstette, E. and Rockt{\"a}schel",
	title="Hierarchical Kickstarting for Skill Transfer in Reinforcement Learning",
	year="2022"
}

@article{jaderberg16,
  title={Reinforcement learning with unsupervised auxiliary tasks},
  author={Jaderberg, Max and Mnih, Volodymyr and Czarnecki, Wojciech Marian and Schaul, Tom and Leibo, Joel Z and Silver, David and Kavukcuoglu, Koray},


@@ 129,16 107,6 @@
  year={2020}
}

@inproceedings{campbell17,
  title={Learning combat in {NetHack}},
  author={Campbell, Jonathan and Verbrugge, Clark},
  booktitle={Proceedings of the AAAI Conference on Artificial Intelligence and Interactive Digital Entertainment},
  volume={13},
  number={1},
  pages={16--22},
  year={2017}
}

@article{hasselt15,
	title={Deep Reinforcement Learning with Double Q-learning},
	author={Hado van Hasselt and Arthur Guez and David Silver},


@@ 177,15 145,14 @@

@misc{bhattrl,
	author = {Shweta Bhatt},
	title = {Reinforcement learning 101},
	note = {[Online, Accessed: 2023-04-24]},
	howpublished = {https://towardsdatascience.com/reinforcement-learning-101-e24b50e1d292},
	year = {2018},
	title = {{Reinforcement learning 101}},
	howpublished = "\url{https://towardsdatascience.com/reinforcement-learning-101-e24b50e1d292}",
	year = {2018}
}

@misc{sebtheiler,
	author = {Sebastian Theiler},
	howpublished = "https://github.com/sebtheiler/tutorials/blob/main/dqn/train_dqn.py",
	title = "train_dqn.py",
	year = {2020},
}
\ No newline at end of file
	howpublished = "\url{https://github.com/sebtheiler/tutorials/blob/main/dqn/train\_dqn.py}",
	title = "{train\_dqn.py}",
	year = {2020}
}