Commit 9195ad7b authored by Jon

Add material and solution for week 9

parent 914b0ded
Showing with 1503 additions and 0 deletions
File added
import math
from dt_node import DTNode
from guess_who_samples import SamplesSet
training_set = SamplesSet()
def entropy(samples, feature):
global training_set
total_entropy = 0
# generating a list with all the values for the considered
# feature in the samples
# you can use the class method get_values_by_feature from the class SamplesSet
# computing entropy by summing up
# the partial entropy for each feature value
# You can retrieve the possible values of a feature
# by using the method get_feature_values from the training_set instance
# for each possible value v of the considered feature
# count the number of positive samples,
# i.e. the samples whose value for the considered feature
# equals the current value v
#
# The probability is then the number of positive samples divided
# by the total number of samples
#
# If the probability is 0 or 1, the partial entropy is 0, so skip
# Else, the current partial entropy is -P(positive)*log2(P(positive))
#
# sum the partial entropy to the total entropy
return total_entropy
def remainder(samples, feature):
global training_set
total_remainder = 0
# Retrieve the possible values for the considered feature
# you can use the method .get_feature_values from the training_set instance
# For each possible value v of the considered feature
# Retrieve the samples having value = v for the considered feature
# you can use the class method get_samples_by_feature_value from the class SamplesSet
#
# If the number of retrieved samples is > 0
# compute the partial remainder as [#(retrieved samples) / #(samples)] * entropy(retrieved samples, decision feature)
# (the decision feature can be gathered from training_set.get_decision_feature())
# sum the partial remainder to the total remainder
return total_remainder
def information_gain(samples, feature):
global training_set
# Information gain can be computed as
# entropy(samples, decision feature) - remainder(samples, feature)
# The decision feature can be gathered from training_set.get_decision_feature()
ig = 0
return ig
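# Usage sketch (illustrative only, assuming the three functions above are completed):
# learn_tree below calls
#   information_gain(samples, 'IsFemale')
# for every candidate feature, which in turn relies on
#   entropy(samples, training_set.get_decision_feature()) and
#   remainder(samples, 'IsFemale')
# and splits on the feature with the largest information gain (always >= 0).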
def learn_tree(samples, features, parent_samples, parent_node=None, edge=None):
global training_set
if len(samples) == 0:
leaf = training_set.plurality_value(parent_samples)
return DTNode(leaf, parent_node, edge)
labels = SamplesSet.get_values_by_feature(samples, training_set.get_decision_feature())
if len(set(labels)) == 1:
leaf = labels[0]
return DTNode(leaf, parent_node, edge)
if len(features) == 0:
leaf = training_set.plurality_value(samples)
return DTNode(leaf, parent_node, edge)
best_feature = None
best_ig = None
for feature in features:
if feature != training_set.get_decision_feature():
cur_ig = information_gain(samples, feature)
if best_ig is None or cur_ig > best_ig:
best_ig = cur_ig
best_feature = feature
tree = DTNode(best_feature, parent_node, edge)
for value in training_set.get_feature_values(best_feature):
subsamples = SamplesSet.get_samples_by_feature_value(samples, best_feature, value)
subfeatures = features.copy()
subfeatures.remove(best_feature)
subtree = learn_tree(subsamples, subfeatures, samples, tree, best_feature)
tree.add_successor(subtree, value)
return tree
def classify(sample, decision_tree):
print("\n--------------------------")
print("Classifying the sample {0}".format(sample))
cur_node = decision_tree
while len(cur_node.get_successors()) > 0:
successors = cur_node.get_successors()
cur_feat = cur_node.get_state()
print("Evaluating feature '{0}'".format(cur_feat))
feat_val = sample.get_feature_value(cur_feat)
print("Feature value for the sample is '{0}'".format(feat_val))
cur_node = successors[feat_val]
print("Classification: {0}".format(cur_node.get_state()))
return cur_node.get_state()
if __name__ == '__main__':
print("Learning decision tree...")
decision_tree = learn_tree(
training_set.get_samples(),
training_set.get_classification_features(),
[], None, None)
print("Decision tree learned!")
print("Classification...\n")
n_misclassifications = 0
for sample in training_set.get_samples():
ground_truth = sample.get_label()
prediction = classify(sample, decision_tree)
if ground_truth != prediction:
n_misclassifications += 1
print("Misclassification for sample {0}".format(sample))
print("Ground truth was '{0}' and prediction was '{1}'".format(ground_truth, prediction))
print("\nNumber of misclassifications: {0}".format(n_misclassifications))
from une_ai.models import GraphNode
class DTNode(GraphNode):
def __init__(self, state, parent_node, edge):
super().__init__(state, parent_node, edge, 0)
self._successors = {}
def add_successor(self, successor, edge):
self._successors[edge] = successor
return self._successors[edge]
def get_successors(self):
successors = {}
for key, val in self._successors.items():
successors[key] = val
return successors
\ No newline at end of file
Name,IsFemale,HasDarkColourEyes,HasBlackHair,HasBlondHair,HasRedHair,HasWhiteHair,HasLongHair,HasBigLips,HasMoustache,HasBeard,WearHat,IsBald,WearGlasses,WearEarrings
Alex,No,Yes,Yes,No,No,No,No,Yes,Yes,No,No,No,No,No
Alfred,No,No,No,No,Yes,No,No,No,Yes,No,No,No,No,No
Anita,Yes,No,No,No,No,Yes,No,No,No,No,No,No,No,No
Anne,Yes,Yes,Yes,No,No,No,No,No,No,No,No,No,No,Yes
Bernard,No,Yes,No,No,No,No,No,No,No,No,Yes,No,No,No
Bill,No,Yes,No,No,Yes,No,No,No,No,Yes,No,Yes,No,No
Charles,No,Yes,No,Yes,No,No,No,Yes,Yes,No,No,No,No,No
Claire,Yes,Yes,No,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No
David,No,Yes,No,Yes,No,No,Yes,No,No,Yes,No,No,No,No
Eric,No,Yes,No,Yes,No,No,No,No,No,No,Yes,No,No,No
Frans,No,Yes,No,No,Yes,No,No,No,No,No,No,No,No,No
George,No,Yes,No,No,No,Yes,No,No,No,No,Yes,No,No,No
Herman,No,Yes,No,No,Yes,No,No,No,No,No,No,Yes,No,No
Joe,No,Yes,No,Yes,No,No,No,No,No,No,No,No,Yes,No
Maria,Yes,Yes,No,No,No,No,Yes,No,No,No,Yes,No,No,Yes
Max,No,Yes,Yes,No,No,No,No,Yes,Yes,No,No,No,No,No
Paul,No,Yes,No,No,No,Yes,No,No,No,No,No,No,Yes,No
Peter,No,No,No,No,No,Yes,No,Yes,No,No,No,No,No,No
Philip,No,Yes,Yes,No,No,No,No,No,No,Yes,No,No,No,No
Richard,No,Yes,No,No,No,No,No,No,Yes,Yes,No,Yes,No,No
Robert,No,No,No,No,No,No,No,No,No,No,No,No,No,No
Sam,No,Yes,No,No,No,Yes,No,No,No,No,No,Yes,Yes,No
Susan,Yes,Yes,No,Yes,No,No,Yes,Yes,No,No,No,No,No,Yes
Tom,No,No,Yes,No,No,No,No,No,No,No,No,Yes,Yes,No
\ No newline at end of file
import csv
class Sample():
def __init__(self, sample_vector, features, decision_feature):
self._x = {}
self._y = {}
for i, feature in enumerate(features):
if feature != decision_feature:
self._x[feature] = sample_vector[i]
else:
self._y[feature] = sample_vector[i]
def get_sample_values(self):
sample_vector = []
for value in self._x.values():
sample_vector.append(value)
return sample_vector
def get_feature_value(self, feature):
assert feature in self._x.keys() or feature in self._y.keys(), "'{0}' is not a valid feature.".format(feature)
if feature in self._x.keys():
return self._x[feature]
else:
return self.get_label()
def get_label(self):
decision_feature = list(self._y.keys())[0]
return self._y[decision_feature]
def __str__(self):
return str(self._x) + ' -> ' + str(self._y)
class SamplesSet():
def __init__(self, guess_who_file='guess_who.csv', decision_feature='Name'):
self._features_values = {}
self._features = []
self._decision_feature = decision_feature
self._samples = []
with open(guess_who_file, newline='', mode='r', encoding='utf-8-sig') as f:
reader = csv.reader(f)
for i, row in enumerate(reader):
if i == 0:
# First row with the name of the features
self._features = row
for feature in self._features:
self._features_values[feature] = []
continue
# rows with names and values for the features
# the name is our label we want to predict from the features
cur_sample = row
self.add_sample(cur_sample)
for i, feature_val in enumerate(cur_sample):
cur_feature = self._features[i]
if feature_val not in self._features_values[cur_feature]:
self._features_values[cur_feature].append(feature_val)
def add_sample(self, sample_vector):
new_sample = Sample(sample_vector, self._features, self._decision_feature)
self._samples.append(new_sample)
return len(self._samples)-1
def get_classification_features(self):
classification_features = self._features.copy()
classification_features.remove(self._decision_feature)
return classification_features
def get_decision_feature(self):
return self._decision_feature
def get_feature_values(self, feature):
assert feature in self._features_values.keys(), "'{0}' is not a valid feature.".format(feature)
return self._features_values[feature]
def get_sample_at_index(self, index):
assert index >= 0 and index < len(self._samples), "The parameter index must be >= 0 and less than the number of samples"
return self._samples[index]
def get_samples(self):
return self._samples
def get_values_by_feature(samples, feature):
values = []
for sample in samples:
values.append(sample.get_feature_value(feature))
return values
def get_samples_by_feature_value(samples, feature, feature_value):
subsamples = []
for sample in samples:
if sample.get_feature_value(feature) == feature_value:
subsamples.append(sample)
return subsamples
def plurality_value(self, samples):
max_count = None
best_label = None
labels = SamplesSet.get_values_by_feature(samples, self._decision_feature)
for value in self._features_values[self._decision_feature]:
cur_count = labels.count(value)
if max_count is None or cur_count > max_count:
max_count = cur_count
best_label = value
return best_label
\ No newline at end of file
Name,Gender,EyeColour,HairColour,HairLength,LipsSize,HasMoustache,HasBeard,WearHat,IsBald,WearGlasses,WearEarrings
Alex,Male,Brown,Black,Short,Big,Yes,No,No,No,No,No
Alfred,Male,Blue,Red,Short,Small,Yes,No,No,No,No,No
Anita,Female,Blue,White,Short,Small,No,No,No,No,No,No
Anne,Female,Brown,Black,Short,Small,No,No,No,No,No,Yes
Bernard,Male,Brown,Brown,Short,Small,No,No,Yes,No,No,No
Bill,Male,Brown,Red,Short,Small,No,Yes,No,Yes,No,No
Charles,Male,Brown,Blond,Short,Big,Yes,No,No,No,No,No
Claire,Female,Brown,Red,Long,Small,No,No,Yes,No,Yes,No
David,Male,Brown,Blond,Long,Small,No,Yes,No,No,No,No
Eric,Male,Brown,Blond,Short,Small,No,No,Yes,No,No,No
Frans,Male,Brown,Red,Short,Small,No,No,No,No,No,No
George,Male,Brown,White,Short,Small,No,No,Yes,No,No,No
Herman,Male,Brown,Red,Short,Small,No,No,No,Yes,No,No
Joe,Male,Brown,Blond,Short,Small,No,No,No,No,Yes,No
Maria,Female,Brown,Brown,Long,Small,No,No,Yes,No,No,Yes
Max,Male,Brown,Black,Short,Big,Yes,No,No,No,No,No
Paul,Male,Brown,White,Short,Small,No,No,No,No,Yes,No
Peter,Male,Blue,White,Short,Big,No,No,No,No,No,No
Philip,Male,Brown,Black,Short,Small,No,Yes,No,No,No,No
Richard,Male,Brown,Brown,Short,Small,Yes,Yes,No,Yes,No,No
Robert,Male,Blue,Brown,Short,Small,No,No,No,No,No,No
Sam,Male,Brown,White,Short,Small,No,No,No,Yes,Yes,No
Susan,Female,Brown,Blond,Long,Big,No,No,No,No,No,Yes
Tom,Male,Blue,Black,Short,Small,No,No,No,Yes,Yes,No
\ No newline at end of file
import random
import json
from tictactoe_game_environment import TicTacToeGameEnvironment
from reinforcement_learning import state_to_str
def agent_program_random(percepts, actuators):
game_board = percepts['game-board-sensor']
player_turn = percepts['turn-taking-indicator']
game_state = {
'game-board': game_board.copy(),
'player-turn': player_turn
}
legal_moves = TicTacToeGameEnvironment.get_legal_actions(game_state)
if len(legal_moves) > 0:
return [random.choice(legal_moves)]
return []
def agent_program_RL(percepts, actuators):
game_board = percepts['game-board-sensor']
player_turn = percepts['turn-taking-indicator']
game_state = {
'game-board': game_board.copy(),
'player-turn': player_turn
}
with open('vfunction.json', 'r') as f:
vfunction = json.load(f)
opponent = 'X' if player_turn == 'O' else 'O'
v_player = vfunction['player-{0}'.format(player_turn)]
v_opponent = vfunction['player-{0}'.format(opponent)]
if not TicTacToeGameEnvironment.is_terminal(game_state):
best_action = None
max_advantage = None
for action in TicTacToeGameEnvironment.get_legal_actions(game_state):
new_state = TicTacToeGameEnvironment.transition_result(game_state, action)
future_state_str = state_to_str(new_state)
if future_state_str in v_player.keys() and future_state_str in v_opponent.keys():
advantage = v_player[future_state_str] - v_opponent[future_state_str]
if best_action is None or advantage > max_advantage:
best_action = action
max_advantage = advantage
if best_action is not None:
return [best_action]
else:
print("No best action found in v function for state {0}. Selecting it randomly".format(state_to_str(game_state)))
actions = TicTacToeGameEnvironment.get_legal_actions(game_state)
selected_action = random.choice(actions)
return [selected_action]
return []
import random
import json
import sys
from tictactoe_game_environment import TicTacToeGameEnvironment
# code taken from https://gist.github.com/vladignatyev/06860ec2040cb497f0f3
# A function to print a progress bar to keep track
# of the learning process
def progress(count, total, suffix=''):
bar_len = 60
filled_len = int(round(bar_len * count / float(total)))
percents = round(100.0 * count / float(total), 1)
bar = '=' * filled_len + '-' * (bar_len - filled_len)
sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', suffix))
sys.stdout.flush()
# A function transforming a game board state into a
# string representing it
def state_to_str(state):
board_str = ''
game_board = state['game-board']
for i in range(game_board.get_width()):
for j in range(game_board.get_height()):
value = game_board.get_item_value(i, j)
if value is None:
value = '*'
board_str += value
return board_str
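# For illustration (hypothetical states): an empty 3x3 board is encoded as
# '*********', while a board whose only mark is an 'X' at get_item_value(0, 0)
# becomes 'X********'. These strings are later used as dictionary keys for the
# learned v functions.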
# A function that picks the best next action for the
# current player given the current policies.
# The best action for the player is the one maximising the
# difference between the player's v-value for the
# transitioned state and the opponent's v-value for
# that same transitioned state.
# With probability off_policy_likelihood, a random action
# is selected instead of the best one.
def pick_best_action(vfunction, state, off_policy_likelihood):
player = 'player-{0}'.format(state['player-turn'])
opponent = 'player-X' if state['player-turn'] == 'O' else 'player-O'
v_player = vfunction[player]
v_opponent = vfunction[opponent]
# getting the legal actions
legal_actions = [] # replace this line with the correct list of legal actions
# setting the selected action as None
selected_action = None
# keeping track of the states not explored yet
# (for exploration mode)
unvisited_future_states = []
# selecting the action based on the current policy from the
# so far learned v function for the player
max_advantage = float('-Inf')
for action in legal_actions:
# Transition the state given the current action
# Transform the state into a string identifier with the
# function state_to_str
# If the state identifier is present in the v functions of both players
# compute the advantage by subtracting the v-value of the player for
# the transitioned state with the v-value of the opponent for the transitioned state
# update the max_advantage if advantage > max_advantage
# else, if the state identifier is not in the vfunction of the current player
# add the state identifier to the unvisited_future_states
pass
# checking if we should use exploration instead of exploitation
if random.random() < off_policy_likelihood or selected_action is None:
# exploration mode, selecting random action
if len(unvisited_future_states) > 0:
item = random.choice(unvisited_future_states)
selected_action = item[0]
else:
selected_action = random.choice(legal_actions)
return selected_action
# A function to update the v function of a player
# The v function is updated according to the TD(0) equation
def update_v_function(v, state, new_state, reward, alpha, gamma):
# Compute the string state identifier for state
# If the identifier is in the v function, take that value as the
# old v-value
# else, set the v-value for that state identifier as random.random()*0.1
# and set old v-value to that random value
# Compute the string state identifier for new_state
# If the identifier is in the v function, take that value as the
# v-value at state s'
# else, set the v-value for that state identifier as random.random()*0.1
# and set s' v-value to that random value
# updated v value = old_v-value + alpha*(reward + gamma*s'_v-value - old_v-value)
# It is not necessary to return any value:
# just update the v-value in the dictionary; since the dictionary is
# passed by reference, the change will persist after the function returns
pass
# The learning function
def learn(alpha=0.1, gamma=0.9, off_policy_likelihood=0.1, n_episodes=100000):
# setting the v functions for the players as empty dictionaries
vfunction = {'player-X': {}, 'player-O': {}}
# We start from the opening state of the game
starting_environment = TicTacToeGameEnvironment()
state = starting_environment.get_game_state()
# We need to keep track of the last state to update for the
# other player
past_states = {'player-X': None, 'player-O': None}
# Loop for n_episodes
i = 0
while i < n_episodes:
progress(i, n_episodes)
cur_player = state['player-turn']
last_player = 'X' if state['player-turn'] == 'O' else 'O'
v_cur_player = vfunction['player-{0}'.format(cur_player)]
v_last_player = vfunction['player-{0}'.format(last_player)]
# from the current state, we pick the best action
selected_action = pick_best_action(
vfunction,
state,
off_policy_likelihood
)
# Transitioning state
new_state = TicTacToeGameEnvironment.transition_result(state, selected_action)
# We store the transitioned state as the state to update for the current player
# during the next iteration
past_states['player-{0}'.format(state['player-turn'])] = new_state
# We compute the reward for the current player for performing
# the selected action and transitioning to new_state
# If new_state is not terminal, the reward will be 0
cur_player_reward = TicTacToeGameEnvironment.payoff(new_state, cur_player)
# Given the received reward, we update the v function of the current player
# Even if the reward = 0, the v function for the current state will be updated
# given the gamma*v-value of the new_state, thus propagating the rewards from
# terminal states back to early states of the game
update_v_function(v_cur_player, state, new_state, cur_player_reward, alpha, gamma)
# We also need to update the v function of the last player
# If we have a past state for the last player, we can do so
if past_states['player-{0}'.format(last_player)] is not None:
last_player_new_state = past_states['player-{0}'.format(last_player)]
# The reward is given by the payoff of the same new_state but for the last_player
# In this game it is simply -1*cur_player_reward
last_player_reward = TicTacToeGameEnvironment.payoff(new_state, last_player)
# We need to update the v-value for the past state of the last player
# when transitioning to the new_state with the achieved reward for new_state
update_v_function(v_last_player, last_player_new_state, new_state, last_player_reward, alpha, gamma)
# We check if the episode terminated
if TicTacToeGameEnvironment.is_terminal(new_state):
# Yes, we restart
state = starting_environment.get_game_state()
past_states = {'player-X': None, 'player-O': None}
i += 1
else:
# No, we continue
state = new_state
return vfunction
if __name__ == "__main__":
vfunction = learn()
with open('vfunction.json', 'w+') as f:
json.dump(vfunction, f)
\ No newline at end of file
from une_ai.tictactoe import TicTacToeGame
from une_ai.tictactoe import TicTacToePlayer
from tictactoe_game_environment import TicTacToeGameEnvironment
from agent_programs import agent_program_RL, agent_program_random
if __name__ == '__main__':
player_X = TicTacToePlayer('X', agent_program_RL)
player_O = TicTacToePlayer('O', agent_program_random)
# DO NOT EDIT THE FOLLOWING INSTRUCTIONS!
environment = TicTacToeGameEnvironment()
environment.add_player(player_X)
environment.add_player(player_O)
game = TicTacToeGame(player_X, player_O, environment)
import numpy as np
from scipy.signal import convolve2d
from une_ai.models import GameEnvironment, GridMap, Agent
class IllegalMove(Exception):
pass
class TicTacToeGameEnvironment(GameEnvironment):
def __init__(self, board_size=3):
super().__init__("Tic Tac Toe")
self._board_size = board_size
self._game_board = GridMap(board_size, board_size, None)
self._player_turn = 'X' # X always starts
# TODO
# implement the abstract method add_player
# the GameEnvironment superclass uses a dictionary self._players
# to store the players of the game.
# For this game, we must limit the number of players to 2:
# the first added player will be X and the second O
def add_player(self, player):
assert isinstance(player, Agent), "The parameter player must be an instance of a subclass of the class Agent"
assert len(self._players) < 2, "It is not possible to add more than 2 players for this game."
if len(self._players) == 0:
marker = 'X'
else:
marker = 'O'
self._players[marker] = player
return marker
# TODO
# implement the abstract method get_game_state
# the method must return the current state of the game
# as a dictionary with the following keys:
# 'game-board' -> a copy of the game board (as 3x3 GridMap)
# 'player-turn' -> 'X' or 'O' depending on the current player turn
# You may first create properties in the constructor function __init__
# to store the game board and the current turn
def get_game_state(self):
gs = {
'game-board': self._game_board.copy(),
'player-turn': self._player_turn
}
return gs
# TODO
# implement the abstract method get_percepts
# this method returns a dictionary with keys the sensors of the agent
# and values the percepts gathered for that sensor at time t
# the sensors are:
# 'game-board-sensor' -> the 'game-board' value from the current game state
# 'turn-taking-indicator' -> the 'player-turn' value from the current game state
def get_percepts(self):
gs = self.get_game_state()
return {
'game-board-sensor': gs['game-board'],
'turn-taking-indicator': gs['player-turn']
}
# TODO
# implement the abstract method get_legal_actions
# This method is a static method (i.e. we do not have access to self
# and it can only be accessed via the class TicTacToeGameEnvironment)
# It takes a game_state as input and it returns the list of
# legal actions in that game state
# An action is legal in a given game state if the game board cell
# for that action is free from marks
def get_legal_actions(game_state):
legal_actions = []
game_board = game_state['game-board']
empty_cells = game_board.find_value(None)
for empty_cell in empty_cells:
legal_actions.append('mark-{0}-{1}'.format(empty_cell[0], empty_cell[1]))
return legal_actions
# TODO
# implement the abstract method transition_result
# This method is a static method (i.e. we do not have access to self
# and it can only be accessed via the class TicTacToeGameEnvironment)
# It takes a game_state and an action to perform as input and it returns
# the new game state.
def transition_result(game_state, action):
legal_actions = TicTacToeGameEnvironment.get_legal_actions(game_state)
if action not in legal_actions:
raise(IllegalMove('The action {0} is not a legal move for the given game state {1}.'.format(action, game_state['game-board'].get_map())))
marker = TicTacToeGameEnvironment.turn(game_state)
tokens = action.split('-')
x, y = (int(tokens[1]), int(tokens[2]))
new_game_board = game_state['game-board'].copy()
new_game_board.set_item_value(x, y, marker)
new_game_state = {
'game-board': new_game_board,
'player-turn': 'O' if game_state['player-turn'] == 'X' else 'X'
}
return new_game_state
# TODO
# implement the abstract method state_transition
# this method takes as input the agent's actuators
# and it changes the game environment state based
# on the values of the agent's actuators
# This agent has only one actuator, 'marker'
# the value of this actuator is a tuple with the x and y
# coordinates where the agent will place its marker on the game board
# We can implement this method by re-using the static method
# transition_result we just implemented
def state_transition(self, agent_actuators):
assert agent_actuators['marker'] is not None, "During a turn, the player must have set the 'marker' actuator value to a coordinate (x, y) of the game board where to place the marker."
x, y = agent_actuators['marker']
gs = self.get_game_state()
action = 'mark-{0}-{1}'.format(x, y)
new_gs = TicTacToeGameEnvironment.transition_result(gs, action)
self._player_turn = new_gs['player-turn']
self._game_board = new_gs['game-board'].copy()
# This method is a static method (i.e. we do not have access to self
# and it can only be accessed via the class TicTacToeGameEnvironment)
# It returns the turn of the player given a game state.
def turn(game_state):
assert 'player-turn' in game_state.keys(), "Invalid game state. A game state must have the key 'player-turn'"
return game_state['player-turn']
# This method is a static method (i.e. we do not have access to self
# and it can only be accessed via the class TicTacToeGameEnvironment)
# It takes a game_state as input and it returns the winner ('X' or 'O') if there is any
# or None if there is no winner (a tie or a non-terminal state)
# This method is already provided to you. You should look at its implementation
# and try to understand how it is finding a winner with the convolution operation
def get_winner(game_state):
game_board = game_state['game-board']
horizontal_kernel = np.array([[ 1, 1, 1]])
vertical_kernel = np.transpose(horizontal_kernel)
diag_kernel = np.eye(3, dtype=np.uint8)
flipped_diag_kernel = np.fliplr(diag_kernel)
detection_kernels = [horizontal_kernel, vertical_kernel, diag_kernel, flipped_diag_kernel]
for marker in ['X', 'O']:
player_markers = game_board.get_map() == marker
for kernel in detection_kernels:
convolved_values = convolve2d(player_markers, kernel, mode="valid")
if (convolved_values == 3).any():
return marker
return None
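# A small worked example of the detection idea (hypothetical board, not part of
# the environment itself): if X holds a whole row, the boolean mask
# game_board.get_map() == 'X' is
#   [[ True,  True,  True],
#    [False,  True, False],
#    [False, False, False]]
# and convolving it with the horizontal kernel [[1, 1, 1]] in 'valid' mode gives
#   [[3], [1], [0]]
# The value 3 means three aligned markers were found, so 'X' is the winner.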
# TODO
# implement the abstract method is_terminal
# This method is a static method (i.e. we do not have access to self
# and it can only be accessed via the class TicTacToeGameEnvironment)
# It takes a game_state as input and it returns True if the game state
# is terminal and False otherwise.
# In this game, a state is terminal if there are no more legal actions
# or if there is a winner.
def is_terminal(game_state):
# the game is over if the board is full or if there is a winner
remaining_actions = TicTacToeGameEnvironment.get_legal_actions(game_state)
winner = TicTacToeGameEnvironment.get_winner(game_state)
return len(remaining_actions) == 0 or winner is not None
# TODO
# implement the abstract method payoff
# This method is a static method (i.e. we do not have access to self
# and it can only be accessed via the class TicTacToeGameEnvironment)
# It takes a game_state and the player name ('X' or 'O') as input and it returns
# the payoff value for that player in the given game state
# In this scenario, we only consider terminal states with a winner;
# if there is no winner yet (or the game is a tie) we return 0
# In other games the payoff function may be more complex
def payoff(game_state, player_name):
winner = TicTacToeGameEnvironment.get_winner(game_state)
if winner is None:
return 0
elif winner == player_name:
return 1
else:
return -1
\ No newline at end of file
File added
import math
from dt_node import DTNode
from guess_who_samples import SamplesSet
training_set = SamplesSet()
def entropy(samples, feature):
global training_set
# generating a list with all the values for the considered
# feature in the given samples
values = SamplesSet.get_values_by_feature(samples, feature)
# computing entropy by summing up
# the partial entropy for each feature value
feature_vals = training_set.get_feature_values(feature)
total_entropy = 0
for value in feature_vals:
n_positive = values.count(value)
prob = n_positive / len(values)
if prob == 0 or prob == 1:
# current partial entropy is 0, skip
continue
total_entropy += prob*math.log2(prob)
return -1*total_entropy
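# A quick numeric check of the formula (hypothetical counts, not tied to the CSV):
# with 8 samples taking value 'Yes' and 16 taking value 'No' for a feature,
# p('Yes') = 1/3 and p('No') = 2/3, so
#   entropy = -(1/3*math.log2(1/3) + 2/3*math.log2(2/3)) ≈ 0.918 bits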
def remainder(samples, feature):
global training_set
feature_vals = training_set.get_feature_values(feature)
total_remainder = 0
for value in feature_vals:
cur_samples = SamplesSet.get_samples_by_feature_value(samples, feature, value)
if len(cur_samples) > 0:
total_remainder += (len(cur_samples) / len(samples)) * entropy(cur_samples, training_set.get_decision_feature())
return total_remainder
def information_gain(samples, feature):
global training_set
return entropy(samples, training_set.get_decision_feature()) - remainder(samples, feature)
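# Worked example on the guess_who.csv data (values are approximate): the decision
# feature 'Name' takes 24 distinct values, one per sample, so
#   entropy(samples, 'Name') = log2(24) ≈ 4.585 bits
# Splitting on 'IsFemale' separates 5 female from 19 male samples, giving
#   remainder = (5/24)*log2(5) + (19/24)*log2(19) ≈ 3.847 bits
# and therefore information_gain(samples, 'IsFemale') ≈ 4.585 - 3.847 ≈ 0.74 bits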
def learn_tree(samples, features, parent_samples, parent_node=None, edge=None):
global training_set
if len(samples) == 0:
leaf = training_set.plurality_value(parent_samples)
return DTNode(leaf, parent_node, edge)
labels = SamplesSet.get_values_by_feature(samples, training_set.get_decision_feature())
if len(set(labels)) == 1:
leaf = labels[0]
return DTNode(leaf, parent_node, edge)
if len(features) == 0:
leaf = training_set.plurality_value(samples)
return DTNode(leaf, parent_node, edge)
best_feature = None
best_ig = None
for feature in features:
if feature != training_set.get_decision_feature():
cur_ig = information_gain(samples, feature)
if best_ig is None or cur_ig > best_ig:
best_ig = cur_ig
best_feature = feature
tree = DTNode(best_feature, parent_node, edge)
for value in training_set.get_feature_values(best_feature):
subsamples = SamplesSet.get_samples_by_feature_value(samples, best_feature, value)
subfeatures = features.copy()
subfeatures.remove(best_feature)
subtree = learn_tree(subsamples, subfeatures, samples, tree, best_feature)
tree.add_successor(subtree, value)
return tree
def classify(sample, decision_tree):
print("\n--------------------------")
print("Classifying the sample {0}".format(sample))
cur_node = decision_tree
while len(cur_node.get_successors()) > 0:
successors = cur_node.get_successors()
cur_feat = cur_node.get_state()
print("Evaluating feature '{0}'".format(cur_feat))
feat_val = sample.get_feature_value(cur_feat)
print("Feature value for the sample is '{0}'".format(feat_val))
cur_node = successors[feat_val]
print("Classification: {0}".format(cur_node.get_state()))
return cur_node.get_state()
if __name__ == '__main__':
print("Learning decision tree...")
decision_tree = learn_tree(
training_set.get_samples(),
training_set.get_classification_features(),
[], None, None)
print("Decision tree learned!")
print("Classification...\n")
n_misclassifications = 0
for sample in training_set.get_samples():
ground_truth = sample.get_label()
prediction = classify(sample, decision_tree)
if ground_truth != prediction:
n_misclassifications += 1
print("Misclassification for sample {0}".format(sample))
print("Ground truth was '{0}' and prediction was '{1}'".format(ground_truth, prediction))
print("\nNumber of misclassifications: {0}".format(n_misclassifications))
from une_ai.models import GraphNode
class DTNode(GraphNode):
def __init__(self, state, parent_node, edge):
super().__init__(state, parent_node, edge, 0)
self._successors = {}
def add_successor(self, successor, edge):
self._successors[edge] = successor
return self._successors[edge]
def get_successors(self):
successors = {}
for key, val in self._successors.items():
successors[key] = val
return successors
\ No newline at end of file
Name,IsFemale,HasDarkColourEyes,HasBlackHair,HasBlondHair,HasRedHair,HasWhiteHair,HasLongHair,HasBigLips,HasMoustache,HasBeard,WearHat,IsBald,WearGlasses,WearEarrings
Alex,No,Yes,Yes,No,No,No,No,Yes,Yes,No,No,No,No,No
Alfred,No,No,No,No,Yes,No,No,No,Yes,No,No,No,No,No
Anita,Yes,No,No,No,No,Yes,No,No,No,No,No,No,No,No
Anne,Yes,Yes,Yes,No,No,No,No,No,No,No,No,No,No,Yes
Bernard,No,Yes,No,No,No,No,No,No,No,No,Yes,No,No,No
Bill,No,Yes,No,No,Yes,No,No,No,No,Yes,No,Yes,No,No
Charles,No,Yes,No,Yes,No,No,No,Yes,Yes,No,No,No,No,No
Claire,Yes,Yes,No,No,Yes,No,Yes,No,No,No,Yes,No,Yes,No
David,No,Yes,No,Yes,No,No,Yes,No,No,Yes,No,No,No,No
Eric,No,Yes,No,Yes,No,No,No,No,No,No,Yes,No,No,No
Frans,No,Yes,No,No,Yes,No,No,No,No,No,No,No,No,No
George,No,Yes,No,No,No,Yes,No,No,No,No,Yes,No,No,No
Herman,No,Yes,No,No,Yes,No,No,No,No,No,No,Yes,No,No
Joe,No,Yes,No,Yes,No,No,No,No,No,No,No,No,Yes,No
Maria,Yes,Yes,No,No,No,No,Yes,No,No,No,Yes,No,No,Yes
Max,No,Yes,Yes,No,No,No,No,Yes,Yes,No,No,No,No,No
Paul,No,Yes,No,No,No,Yes,No,No,No,No,No,No,Yes,No
Peter,No,No,No,No,No,Yes,No,Yes,No,No,No,No,No,No
Philip,No,Yes,Yes,No,No,No,No,No,No,Yes,No,No,No,No
Richard,No,Yes,No,No,No,No,No,No,Yes,Yes,No,Yes,No,No
Robert,No,No,No,No,No,No,No,No,No,No,No,No,No,No
Sam,No,Yes,No,No,No,Yes,No,No,No,No,No,Yes,Yes,No
Susan,Yes,Yes,No,Yes,No,No,Yes,Yes,No,No,No,No,No,Yes
Tom,No,No,Yes,No,No,No,No,No,No,No,No,Yes,Yes,No
\ No newline at end of file
import csv
class Sample():
def __init__(self, sample_vector, features, decision_feature):
self._x = {}
self._y = {}
for i, feature in enumerate(features):
if feature != decision_feature:
self._x[feature] = sample_vector[i]
else:
self._y[feature] = sample_vector[i]
def get_sample_values(self):
sample_vector = []
for value in self._x.values():
sample_vector.append(value)
return sample_vector
def get_feature_value(self, feature):
assert feature in self._x.keys() or feature in self._y.keys(), "'{0}' is not a valid feature.".format(feature)
if feature in self._x.keys():
return self._x[feature]
else:
return self.get_label()
def get_label(self):
decision_feature = list(self._y.keys())[0]
return self._y[decision_feature]
def __str__(self):
return str(self._x) + ' -> ' + str(self._y)
class SamplesSet():
def __init__(self, guess_who_file='guess_who.csv', decision_feature='Name'):
self._features_values = {}
self._features = []
self._decision_feature = decision_feature
self._samples = []
with open(guess_who_file, newline='', mode='r', encoding='utf-8-sig') as f:
reader = csv.reader(f)
for i, row in enumerate(reader):
if i == 0:
# First row with the name of the features
self._features = row
for feature in self._features:
self._features_values[feature] = []
continue
# rows with names and values for the features
# the name is our label we want to predict from the features
cur_sample = row
self.add_sample(cur_sample)
for i, feature_val in enumerate(cur_sample):
cur_feature = self._features[i]
if feature_val not in self._features_values[cur_feature]:
self._features_values[cur_feature].append(feature_val)
def add_sample(self, sample_vector):
new_sample = Sample(sample_vector, self._features, self._decision_feature)
self._samples.append(new_sample)
return len(self._samples)-1
def get_classification_features(self):
classification_features = self._features.copy()
classification_features.remove(self._decision_feature)
return classification_features
def get_decision_feature(self):
return self._decision_feature
def get_feature_values(self, feature):
assert feature in self._features_values.keys(), "'{0}' is not a valid feature.".format(feature)
return self._features_values[feature]
def get_sample_at_index(self, index):
assert index >= 0 and index < len(self._samples), "The parameter index must be >= 0 and less than the number of samples"
return self._samples[index]
def get_samples(self):
return self._samples
def get_values_by_feature(samples, feature):
values = []
for sample in samples:
values.append(sample.get_feature_value(feature))
return values
def get_samples_by_feature_value(samples, feature, feature_value):
subsamples = []
for sample in samples:
if sample.get_feature_value(feature) == feature_value:
subsamples.append(sample)
return subsamples
def plurality_value(self, samples):
max_count = None
best_label = None
labels = SamplesSet.get_values_by_feature(samples, self._decision_feature)
for value in self._features_values[self._decision_feature]:
cur_count = labels.count(value)
if max_count is None or cur_count > max_count:
max_count = cur_count
best_label = value
return best_label
\ No newline at end of file
import random
import json
from tictactoe_game_environment import TicTacToeGameEnvironment
from reinforcement_learning import state_to_str
def agent_program_random(percepts, actuators):
game_board = percepts['game-board-sensor']
player_turn = percepts['turn-taking-indicator']
game_state = {
'game-board': game_board.copy(),
'player-turn': player_turn
}
legal_moves = TicTacToeGameEnvironment.get_legal_actions(game_state)
if len(legal_moves) > 0:
return [random.choice(legal_moves)]
return []
def agent_program_RL(percepts, actuators):
game_board = percepts['game-board-sensor']
player_turn = percepts['turn-taking-indicator']
game_state = {
'game-board': game_board.copy(),
'player-turn': player_turn
}
with open('vfunction.json', 'r') as f:
vfunction = json.load(f)
opponent = 'X' if player_turn == 'O' else 'O'
v_player = vfunction['player-{0}'.format(player_turn)]
v_opponent = vfunction['player-{0}'.format(opponent)]
if not TicTacToeGameEnvironment.is_terminal(game_state):
best_action = None
max_advantage = None
for action in TicTacToeGameEnvironment.get_legal_actions(game_state):
new_state = TicTacToeGameEnvironment.transition_result(game_state, action)
future_state_str = state_to_str(new_state)
if future_state_str in v_player.keys() and future_state_str in v_opponent.keys():
advantage = v_player[future_state_str] - v_opponent[future_state_str]
if best_action is None or advantage > max_advantage:
best_action = action
max_advantage = advantage
if best_action is not None:
return [best_action]
else:
print("No best action found in v function for state {0}. Selecting it randomly".format(state_to_str(game_state)))
actions = TicTacToeGameEnvironment.get_legal_actions(game_state)
selected_action = random.choice(actions)
return [selected_action]
return []
import random
import json
import sys
from tictactoe_game_environment import TicTacToeGameEnvironment
# code taken from https://gist.github.com/vladignatyev/06860ec2040cb497f0f3
# A function to print a progress bar to keep track
# of the learning process
def progress(count, total, suffix=''):
bar_len = 60
filled_len = int(round(bar_len * count / float(total)))
percents = round(100.0 * count / float(total), 1)
bar = '=' * filled_len + '-' * (bar_len - filled_len)
sys.stdout.write('[%s] %s%s ...%s\r' % (bar, percents, '%', suffix))
sys.stdout.flush()
# A function transforming a game board state into a
# string representing it
def state_to_str(state):
board_str = ''
game_board = state['game-board']
for i in range(game_board.get_width()):
for j in range(game_board.get_height()):
value = game_board.get_item_value(i, j)
if value is None:
value = '*'
board_str += value
return board_str
# A function that picks the best next action for the
# current player given the current policies.
# The best action for the player is the one maximising the
# difference between the player's v-value for the
# transitioned state and the opponent's v-value for
# that same transitioned state.
# With probability off_policy_likelihood, a random action
# is selected instead of the best one.
def pick_best_action(vfunction, state, off_policy_likelihood):
player = 'player-{0}'.format(state['player-turn'])
opponent = 'player-X' if state['player-turn'] == 'O' else 'player-O'
v_player = vfunction[player]
v_opponent = vfunction[opponent]
# getting the legal actions
legal_actions = TicTacToeGameEnvironment.get_legal_actions(state)
# setting the selected action as None
selected_action = None
# keeping track of the states not explored yet
# (for exploration mode)
unvisited_future_states = []
# selecting the action based on the current policy from the
# so far learned v function for the player
max_advantage = float('-Inf')
for action in legal_actions:
future_state = TicTacToeGameEnvironment.transition_result(state, action)
future_state_str = state_to_str(future_state)
if future_state_str in v_player.keys() and future_state_str in v_opponent.keys():
advantage = v_player[future_state_str] - v_opponent[future_state_str]
if advantage > max_advantage:
selected_action = action
max_advantage = advantage
elif future_state_str not in v_player.keys():
unvisited_future_states.append((action, future_state))
# checking if we should use exploration instead of exploitation
if random.random() < off_policy_likelihood or selected_action is None:
# exploration mode, selecting random action
if len(unvisited_future_states) > 0:
item = random.choice(unvisited_future_states)
selected_action = item[0]
else:
selected_action = random.choice(legal_actions)
return selected_action
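# Illustrative numbers (hypothetical v-values): if a transitioned state s' has
# v_player[s'] = 0.62 and v_opponent[s'] = 0.15, its advantage is 0.47, and the
# legal action with the largest such advantage is selected. With probability
# off_policy_likelihood (or when no visited successor exists) a random or
# unvisited action is chosen instead, so the agent keeps exploring new states.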
# A function to update the v function of a player
# The v function is updated according to the TD(0) equation
def update_v_function(v, state, new_state, reward, alpha, gamma):
state_str = state_to_str(state)
if state_str not in v.keys():
v[state_str] = random.random()*0.1
future_state_str = state_to_str(new_state)
if future_state_str not in v.keys():
v[future_state_str] = random.random()*0.1
v_old = v[state_str]
v_new = v_old + alpha*(reward + gamma*v[future_state_str] - v_old)
v[state_str] = v_new
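# Numeric example of the TD(0) update (hypothetical values): with alpha = 0.1,
# gamma = 0.9, v[s] = 0.50, v[s'] = 0.20 and reward = 1,
#   v[s] <- 0.50 + 0.1*(1 + 0.9*0.20 - 0.50) = 0.568
# i.e. the value of s moves a small step towards the reward plus the discounted
# value of the successor state.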
# The learning function
def learn(alpha=0.1, gamma=0.9, off_policy_likelihood=0.1, n_episodes=100000):
# setting the v functions for the players as empty dictionaries
vfunction = {'player-X': {}, 'player-O': {}}
# We start from the opening state of the game
starting_environment = TicTacToeGameEnvironment()
state = starting_environment.get_game_state()
# We need to keep track of the last state to update for the
# other player
past_states = {'player-X': None, 'player-O': None}
# Loop for n_episodes
i = 0
while i < n_episodes:
progress(i, n_episodes)
cur_player = state['player-turn']
last_player = 'X' if state['player-turn'] == 'O' else 'O'
v_cur_player = vfunction['player-{0}'.format(cur_player)]
v_last_player = vfunction['player-{0}'.format(last_player)]
# from the current state, we pick the best action
selected_action = pick_best_action(
vfunction,
state,
off_policy_likelihood
)
# Transitioning state
new_state = TicTacToeGameEnvironment.transition_result(state, selected_action)
# We store the transitioned state as the state to update for the current player
# during the next iteration
past_states['player-{0}'.format(state['player-turn'])] = new_state
# We compute the reward for the current player for performing
# the selected action and transitioning to new_state
# If new_state is not terminal, the reward will be 0
cur_player_reward = TicTacToeGameEnvironment.payoff(new_state, cur_player)
# Given the received reward, we update the v function of the current player
# Even if the reward = 0, the v function for the current state will be updated
# given the gamma*v-value of the new_state, thus propagating the rewards from
# terminal states back to early states of the game
update_v_function(v_cur_player, state, new_state, cur_player_reward, alpha, gamma)
# We also need to update the v function of the last player
# If we have a past state for the last player, we can do so
if past_states['player-{0}'.format(last_player)] is not None:
last_player_new_state = past_states['player-{0}'.format(last_player)]
# The reward is given by the payoff of the same new_state but for the last_player
# In this game it is simply -1*cur_player_reward
last_player_reward = TicTacToeGameEnvironment.payoff(new_state, last_player)
# We need to update the v-value for the past state of the last player
# when transitioning to the new_state with the achieved reward for new_state
update_v_function(v_last_player, last_player_new_state, new_state, last_player_reward, alpha, gamma)
# We check if the episode terminated
if TicTacToeGameEnvironment.is_terminal(new_state):
# Yes, we restart
state = starting_environment.get_game_state()
past_states = {'player-X': None, 'player-O': None}
i += 1
else:
# No, we continue
state = new_state
return vfunction
if __name__ == "__main__":
vfunction = learn()
with open('vfunction.json', 'w+') as f:
json.dump(vfunction, f)
\ No newline at end of file
from une_ai.tictactoe import TicTacToeGame
from une_ai.tictactoe import TicTacToePlayer
from tictactoe_game_environment import TicTacToeGameEnvironment
from agent_programs import agent_program_RL, agent_program_random
if __name__ == '__main__':
player_X = TicTacToePlayer('X', agent_program_RL)
player_O = TicTacToePlayer('O', agent_program_random)
# DO NOT EDIT THE FOLLOWING INSTRUCTIONS!
environment = TicTacToeGameEnvironment()
environment.add_player(player_X)
environment.add_player(player_O)
game = TicTacToeGame(player_X, player_O, environment)
import numpy as np
from scipy.signal import convolve2d
from une_ai.models import GameEnvironment, GridMap, Agent
class IllegalMove(Exception):
pass
class TicTacToeGameEnvironment(GameEnvironment):
def __init__(self, board_size=3):
super().__init__("Tic Tac Toe")
self._board_size = board_size
self._game_board = GridMap(board_size, board_size, None)
self._player_turn = 'X' # X always starts
# TODO
# implement the abstract method add_player
# the GameEnvironment superclass uses a dictionary self._players
# to store the players of the game.
# For this game, we must limit the number of players to 2:
# the first added player will be X and the second O
def add_player(self, player):
assert isinstance(player, Agent), "The parameter player must be an instance of a subclass of the class Agent"
assert len(self._players) < 2, "It is not possible to add more than 2 players for this game."
if len(self._players) == 0:
marker = 'X'
else:
marker = 'O'
self._players[marker] = player
return marker
# TODO
# implement the abstract method get_game_state
# the method must return the current state of the game
# as a dictionary with the following keys:
# 'game-board' -> a copy of the game board (as 3x3 GridMap)
# 'player-turn' -> 'X' or 'O' depending on the current player turn
# You may first create properties in the constructor function __init__
# to store the game board and the current turn
def get_game_state(self):
gs = {
'game-board': self._game_board.copy(),
'player-turn': self._player_turn
}
return gs
# TODO
# implement the abstract method get_percepts
# this method returns a dictionary with keys the sensors of the agent
# and values the percepts gathered for that sensor at time t
# the sensors are:
# 'game-board-sensor' -> the 'game-board' value from the current game state
# 'turn-taking-indicator' -> the 'player-turn' value from the current game state
def get_percepts(self):
gs = self.get_game_state()
return {
'game-board-sensor': gs['game-board'],
'turn-taking-indicator': gs['player-turn']
}
# TODO
# implement the abstract method get_legal_actions
# This method is a static method (i.e. we do not have access to self
# and it can only be accessed via the class TicTacToeGameEnvironment)
# It takes a game_state as input and it returns the list of
# legal actions in that game state
# An action is legal in a given game state if the game board cell
# for that action is free from marks
def get_legal_actions(game_state):
legal_actions = []
game_board = game_state['game-board']
empty_cells = game_board.find_value(None)
for empty_cell in empty_cells:
legal_actions.append('mark-{0}-{1}'.format(empty_cell[0], empty_cell[1]))
return legal_actions
# TODO
# implement the abstract method transition_result
# This method is a static method (i.e. we do not have access to self
# and it can only be accessed via the class TicTacToeGameEnvironment)
# It takes a game_state and an action to perform as input and it returns
# the new game state.
def transition_result(game_state, action):
legal_actions = TicTacToeGameEnvironment.get_legal_actions(game_state)
if action not in legal_actions:
raise(IllegalMove('The action {0} is not a legal move for the given game state {1}.'.format(action, game_state['game-board'].get_map())))
marker = TicTacToeGameEnvironment.turn(game_state)
tokens = action.split('-')
x, y = (int(tokens[1]), int(tokens[2]))
new_game_board = game_state['game-board'].copy()
new_game_board.set_item_value(x, y, marker)
new_game_state = {
'game-board': new_game_board,
'player-turn': 'O' if game_state['player-turn'] == 'X' else 'X'
}
return new_game_state
# TODO
# implement the abstract method state_transition
# this method takes as input the agent's actuators
# and it changes the game environment state based
# on the values of the agent's actuators
# This agent has only one actuator, 'marker'
# the value of this actuator is a tuple with the x and y
# coordinates where the agent will place its marker on the game board
# We can implement this method by re-using the static method
# transition_result we just implemented
def state_transition(self, agent_actuators):
assert agent_actuators['marker'] is not None, "During a turn, the player must have set the 'marker' actuator value to a coordinate (x, y) of the game board where to place the marker."
x, y = agent_actuators['marker']
gs = self.get_game_state()
action = 'mark-{0}-{1}'.format(x, y)
new_gs = TicTacToeGameEnvironment.transition_result(gs, action)
self._player_turn = new_gs['player-turn']
self._game_board = new_gs['game-board'].copy()
# This method is a static method (i.e. we do not have access to self
# and it can only be accessed via the class TicTacToeGameEnvironment)
# It returns the turn of the player given a game state.
def turn(game_state):
assert 'player-turn' in game_state.keys(), "Invalid game state. A game state must have the key 'player-turn'"
return game_state['player-turn']
# This method is a static method (i.e. we do not have access to self
# and it can only be accessed via the class TicTacToeGameEnvironment)
# It takes a game_state as input and it returns the winner ('X' or 'O') if there is any
# or None if there is no winner (a tie or a non-terminal state)
# This method is already provided to you. You should look at its implementation
# and try to understand how it is finding a winner with the convolution operation
def get_winner(game_state):
game_board = game_state['game-board']
horizontal_kernel = np.array([[ 1, 1, 1]])
vertical_kernel = np.transpose(horizontal_kernel)
diag_kernel = np.eye(3, dtype=np.uint8)
flipped_diag_kernel = np.fliplr(diag_kernel)
detection_kernels = [horizontal_kernel, vertical_kernel, diag_kernel, flipped_diag_kernel]
for marker in ['X', 'O']:
player_markers = game_board.get_map() == marker
for kernel in detection_kernels:
convolved_values = convolve2d(player_markers, kernel, mode="valid")
if (convolved_values == 3).any():
return marker
return None
# TODO
# implement the abstract method is_terminal
# This method is a static method (i.e. we do not have access to self
# and it can only be accessed via the class TicTacToeGameEnvironment)
# It takes a game_state as input and it returns True if the game state
# is terminal and False otherwise.
# In this game, a state is terminal if there are no more legal actions
# or if there is a winner.
def is_terminal(game_state):
# the game is over if the board is full or if there is a winner
remaining_actions = TicTacToeGameEnvironment.get_legal_actions(game_state)
winner = TicTacToeGameEnvironment.get_winner(game_state)
return len(remaining_actions) == 0 or winner is not None
# TODO
# implement the abstract method payoff
# This method is a static method (i.e. we do not have access to self
# and it can only be accessed via the class TicTacToeGameEnvironment)
# It takes a game_state and the player name ('X' or 'O') as input and it returns
# the payoff value for that player in the given game state
# In this scenario, we only consider terminal states with a winner;
# if there is no winner yet (or the game is a tie) we return 0
# In other games the payoff function may be more complex
def payoff(game_state, player_name):
winner = TicTacToeGameEnvironment.get_winner(game_state)
if winner is None:
return 0
elif winner == player_name:
return 1
else:
return -1
\ No newline at end of file