From 03bddcaee45cb65cf60db8b4b49c877cd9ef9159 Mon Sep 17 00:00:00 2001 From: Jameswflynn1 Date: Sat, 8 Jul 2023 21:28:02 +0100 Subject: [PATCH 01/18] Added wip efr implementation --- open_spiel/python/algorithms/efr.py | 812 ++++++++++++++++++++++++++++ 1 file changed, 812 insertions(+) create mode 100644 open_spiel/python/algorithms/efr.py diff --git a/open_spiel/python/algorithms/efr.py b/open_spiel/python/algorithms/efr.py new file mode 100644 index 0000000000..0760aaeab1 --- /dev/null +++ b/open_spiel/python/algorithms/efr.py @@ -0,0 +1,812 @@ +# Copyright 2023 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#Modified: 2023 James Flynn +#Original: https://github.com/deepmind/open_spiel/blob/master/open_spiel/python/algorithms/cfr.py + +"""Python implementation of the counterfactual regret minimization algorithm. + +One iteration of CFR consists of: +1) Compute current strategy from regrets (e.g. using Regret Matching). +2) Compute values using the current strategy +3) Compute regrets from these values + +The average policy is what converges to a Nash Equilibrium. +""" + +import collections +import attr +import copy +import numpy as np +from collections import defaultdict + +from open_spiel.python import policy +from scipy.linalg import lstsq +import pyspiel + +@attr.s +class _InfoStateNode(object): + """An object wrapping values associated to an information state.""" + # The list of the legal actions. + legal_actions = attr.ib() + index_in_tabular_policy = attr.ib() + # The newly availible deviations + the old ones + relizable_deviations = attr.ib() + #Player -> state -> action -> prob + current_history_probs = attr.ib() + + #An array representing + history = attr.ib() + updates = attr.ib() + updated = attr.ib() + + cumulative_regret = attr.ib(factory=lambda: collections.defaultdict(float)) + # Same as above for the cumulative of the policy probabilities computed + # during the policy iterations + cumulative_policy = attr.ib(factory=lambda: collections.defaultdict(float)) + y_values = attr.ib(factory=lambda: collections.defaultdict(float)) + + +class _EFRSolverBase(object): + def __init__(self, game, _deviation_gen, discounting, discounting_parameters): + assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, () + + self._game = game + self._num_players = game.num_players() + self._root_node = self._game.new_initial_state() + + # This is for returning the current policy and average policy to a caller + self._current_policy = policy.TabularPolicy(game) + self._average_policy = self._current_policy.__copy__() + self._deviation_gen = _deviation_gen + + self._info_state_nodes = {} + hist = {player : [] for player in range(self._num_players)} + self._initialize_info_state_nodes(self._root_node, hist, [[] for _ in range(self._num_players)],[[] for _ in range(self._num_players)]) + + self._iteration = 1 # For possible linear-averaging. 
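For orientation, the three steps listed in the module docstring above are driven by one call per iteration, and the returned average policy (not the current policy) is the object that converges. A minimal usage sketch, assuming only the public names added by this patch; the test added later in this series exercises the same flow:

import pyspiel
from open_spiel.python.algorithms import efr

game = pyspiel.load_game("kuhn_poker")
# "blind cf" restricts EFR to blind counterfactual deviations, which this module
# documents as being equivalent to vanilla CFR.
solver = efr.EFRSolver(game, deviations_name="blind cf")
for _ in range(300):
  solver.evaluate_and_update_policy()  # steps 1-3 of the docstring, one iteration
average_policy = solver.average_policy()

Step 1 ("compute current strategy from regrets") reduces, for external-only deviation sets, to ordinary regret matching: with accumulated positive regrets of, say, 2, 0 and 1 over three actions, the next strategy plays them with probabilities 2/3, 0 and 1/3, and falls back to uniform when no regret is positive.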
+ + self.discounting = discounting + self.alpha = discounting_parameters[0] + self.beta = discounting_parameters[1] + self.gamma = discounting_parameters[2] + + self._str_to_action = {} + def return_cumulative_regret(self): + return {list(self._info_state_nodes.keys())[i]: list(self._info_state_nodes.values())[i].cumulative_regret for i in range(len(self._info_state_nodes.keys()))} + def current_policy(self): + return self._current_policy + + def average_policy(self): + _update_average_policy(self._average_policy, self._info_state_nodes) + return self._average_policy + + def _initialize_info_state_nodes(self, state, history, uniform_probs_to_state,path_indices): + if state.is_terminal(): + return + + if state.is_chance_node(): + for action, unused_action_prob in state.chance_outcomes(): + self._initialize_info_state_nodes(state.child(action), history, uniform_probs_to_state, path_indices) + return + + current_player = state.current_player() + info_state = state.information_state_string(current_player) + info_state_node = self._info_state_nodes.get(info_state) + if info_state_node is None: + legal_actions = state.legal_actions(current_player) + info_state_node = _InfoStateNode( + legal_actions=legal_actions, + index_in_tabular_policy=self._current_policy.state_lookup[info_state], + relizable_deviations = None, + history = history[current_player].copy(), + current_history_probs = copy.deepcopy(path_indices[current_player]), + updates = 0, + updated = False + ) + prior_possible_actions = [] + for i in range(len(info_state_node.current_history_probs)): + prior_possible_actions.append(info_state_node.current_history_probs[i][0]) + prior_possible_actions.append(info_state_node.legal_actions) + + info_state_node.relizable_deviations = self._deviation_gen(len(info_state_node.legal_actions), info_state_node.history, prior_possible_actions) + self._info_state_nodes[info_state] = info_state_node + + legal_actions = state.legal_actions(current_player) + new_uniform_probs_to_state = copy.deepcopy(uniform_probs_to_state) + assert len(new_uniform_probs_to_state[current_player]) == len(history[current_player]) + + new_uniform_probs_to_state[current_player].append({legal_actions[i]: 1/len(legal_actions) for i in range(len(legal_actions))}) + for action in info_state_node.legal_actions: + #Speedup + new_path_indices = copy.deepcopy(path_indices) + new_path_indices[current_player].append([legal_actions, info_state_node.index_in_tabular_policy]) + #Speedup + new_history = copy.deepcopy(history) + new_history[current_player].append(action) + assert len(new_history[current_player]) == len(new_path_indices[current_player]) + + self._initialize_info_state_nodes(state.child(action), new_history, new_uniform_probs_to_state, new_path_indices) + + def _update_current_policy(self,state, current_policy): + """Updated in order so that memory reach probs are defined wrt to the new strategy + """ + + if state.is_terminal(): + return + elif not state.is_chance_node(): + current_player = state.current_player() + info_state = state.information_state_string(current_player) + info_state_node = self._info_state_nodes[info_state] + deviations = info_state_node.relizable_deviations + #print(info_state) + for devation in range(len(deviations)): + #change too infostate + mem_reach_probs = create_probs_from_index(info_state_node.current_history_probs, current_policy) + deviation_reach_prob = deviations[devation].player_deviation_reach_probability(mem_reach_probs) + accum_regret_discount = 1 + if self.discounting == True: + #No point 
in discounting 0 regret + if info_state_node.y_values[deviations[devation]]>0: + talpha = self._iteration**self.alpha + accum_regret_discount = talpha/(talpha+1) + info_state_node.y_values[deviations[devation]] = info_state_node.y_values[deviations[devation]]*accum_regret_discount + max(0,info_state_node.cumulative_regret[devation])*deviation_reach_prob + + #Might be incorrect + state_policy = current_policy.policy_for_key(info_state) + #print + for action, value in self._regret_matching(info_state_node.legal_actions, info_state_node).items(): + state_policy[action] = value + info_state_node.updated = True + + info_state_node.updates +=1 + + for action in info_state_node.legal_actions: + new_state = state.child(action) + self._update_current_policy(new_state, current_policy) + else: + for action, action_prob in state.chance_outcomes(): + new_state = state.child(action) + self._update_current_policy(new_state, current_policy) + #Path to state probability ignores chance probabilty as this is stored as new_reach_probabilities[-1] + def _compute_cumulative_immediate_regret_for_player(self, state, policies, + reach_probabilities, player): + if state.is_terminal(): + return np.asarray(state.returns()) + + if state.is_chance_node(): + state_value = 0.0 + for action, action_prob in state.chance_outcomes(): + assert action_prob > 0 + new_state = state.child(action) + new_reach_probabilities = reach_probabilities.copy() + new_reach_probabilities[-1] *= action_prob + + state_value += action_prob * self._compute_cumulative_immediate_regret_for_player( + new_state, policies, new_reach_probabilities, player) + return state_value + + current_player = state.current_player() + info_state = state.information_state_string(current_player) + + + + # No need to continue on this history branch as no update will be performed + # for any player. + # The value we return here is not used in practice. If the conditional + # statement is True, then the last taken action has probability 0 of + # occurring, so the returned value is not impacting the parent node value. + if all(reach_probabilities[:-1] == 0): + return np.zeros(self._num_players) + + state_value = np.zeros(self._num_players) + + # The utilities of the children states are computed recursively. As the + # regrets are added to the information state regrets for each state in that + # information state, the recursive call can only be made once per child + # state. Therefore, the utilities are cached. + children_utilities = {} + + info_state_node = self._info_state_nodes[info_state] + #Reset y values + info_state_node.y_values = collections.defaultdict(float) + if policies is None: + info_state_policy = self._get_infostate_policy(info_state) + else: + info_state_policy = policies[current_player](info_state) + + reach_prob = reach_probabilities[current_player] + for action in state.legal_actions(): + action_prob = info_state_policy.get(action, 0.) 
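As a worked example of the reach-probability bookkeeping in this function (illustrative numbers only): reach_probabilities carries one entry per player plus a trailing entry for chance, and the counterfactual reach probability of the current player is the product of every entry except that player's own.

import numpy as np
reach_probabilities = np.array([0.5, 1.0, 0.25])  # hypothetical: [player 0, player 1, chance]
current_player = 0
counterfactual_reach_prob = (np.prod(reach_probabilities[:current_player])
                             * np.prod(reach_probabilities[current_player + 1:]))
# 1.0 * 0.25 = 0.25: the probability that the opponent and chance play to this
# state, independent of player 0's own policy.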
+ current_policy_discount = 1.0 + if self.discounting == True: + current_policy_discount = (self._iteration/self._iteration)**self.gamma + info_state_node.cumulative_policy[action] = info_state_node.cumulative_policy[action]*current_policy_discount + action_prob * reach_prob + new_state = state.child(action) + new_reach_probabilities = reach_probabilities.copy() + assert action_prob <= 1 + new_reach_probabilities[current_player] *= action_prob + child_utility = self._compute_cumulative_immediate_regret_for_player(new_state,policies=policies,reach_probabilities=new_reach_probabilities,player=player) + + state_value += action_prob * child_utility + children_utilities[action] = child_utility + + counterfactual_reach_prob = (np.prod(reach_probabilities[:current_player]) * np.prod(reach_probabilities[current_player + 1:])) + + state_value_for_player = state_value[current_player] + deviations = info_state_node.relizable_deviations + for deviationIndex in range(len(deviations)): + #FIX ADD DICT TO ARRAY CONVERSION FUNCTION + deviation = deviations[deviationIndex] + deviation_strategy = deviation.deviate(strat_dict_to_array(self._get_infostate_policy(info_state))) + + player_child_utilities = np.array(list(children_utilities.values()))[:,current_player] + devation_cf_value = np.inner(np.transpose(deviation_strategy), player_child_utilities) + + memory_reach_probs = create_probs_from_index(info_state_node.current_history_probs,self.current_policy()) + player_current_memory_reach_prob = deviation.player_deviation_reach_probability(memory_reach_probs) + + deviation_regret = player_current_memory_reach_prob *((devation_cf_value*counterfactual_reach_prob) - (counterfactual_reach_prob * state_value_for_player)) + + info_state_node.cumulative_regret[deviationIndex] += deviation_regret + return state_value + + + def _get_infostate_policy(self, info_state_str): + """Returns an {action: prob} dictionary for the policy on `info_state`.""" + info_state_node = self._info_state_nodes[info_state_str] + prob_vec = self._current_policy.action_probability_array[ + info_state_node.index_in_tabular_policy] + return { + action: prob_vec[action] for action in info_state_node.legal_actions + } +def __get_infostate_policy_array(self, info_state_str): + info_state_node = self._info_state_nodes[info_state_str] + return self._current_policy.action_probability_array[ + info_state_node.index_in_tabular_policy] + +class _EFRSolver(_EFRSolverBase): + def __init__(self, game, _deviation_gen, discounting, discounting_parameters): + super().__init__(game, _deviation_gen, discounting, discounting_parameters) + + def evaluate_and_update_policy(self): + """Performs a single step of policy evaluation and policy improvement.""" + self._compute_cumulative_immediate_regret_for_player( + self._root_node, + policies=None, + reach_probabilities=np.ones(self._game.num_players() + 1), + player=None) + history = [ [] for _ in range(self._num_players)] + self._update_current_policy(self._root_node,self._current_policy) + self._iteration+= 1 + +class EFRSolver(_EFRSolver): + def __init__(self, game, deviations_name, discounting = False, discounting_parameters = [1,1,1]): + + #Takes the deviation sets used for learning from Deviation_Sets + external_only = False + deviation_sets = None + + if deviations_name == "blind action": + deviation_sets = return_blind_action + external_only = True + elif deviations_name == "informed action": + deviation_sets = return_informed_action + elif deviations_name == "blind cf" or deviations_name == "blind 
counterfactual": + deviation_sets = return_blind_CF + external_only = True + elif deviations_name == "informed cf" or deviations_name == "informed counterfactual": + deviation_sets = return_informed_CF + elif deviations_name == "swap cf" or deviations_name == "swap counterfactual": + deviation_sets = return_swap_cf + elif deviations_name == "bps" or deviations_name == "blind partial sequence": + deviation_sets = return_blind_partial_sequence + external_only = True + elif deviations_name == "cfps" or deviations_name == "cf partial sequence" or deviations_name == "counterfactual partial sequence": + deviation_sets = return_cf_partial_sequence + elif deviations_name == "csps" or deviations_name == "casual partial sequence": + deviation_sets = return_cs_partial_sequence + elif deviations_name == "tips" or deviations_name == "twice informed partial sequence": + deviation_sets = return_twice_informed_partial_sequence + elif deviations_name == "bhv" or deviations_name == "single target behavioural" or deviations_name =="behavioural": + deviation_sets = return_behavourial + else: + print("Unsupported Deviation Set") + return None + super(EFRSolver, self).__init__(game, + _deviation_gen = deviation_sets, + discounting = discounting, + discounting_parameters = discounting_parameters + ) + self._external_only = external_only + def _regret_matching(self, legal_actions, info_set_node): + """Returns an info state policy by applying regret-matching. + Args: + cumulative_regrets: A {deviation: y value} dictionary. + legal_actions: the list of legal actions at this state. + + Returns: + A dict of action -> prob for all legal actions. + """ + z = sum(info_set_node.y_values.values()) + info_state_policy = {} + + #The fixed point solution can be directly obtained through the weighted regret matrix if only external deviations are used + if self._external_only and z > 0: + weighted_deviation_matrix = np.zeros((len(legal_actions), len(legal_actions))) + for dev in list(info_set_node.y_values.keys()): + weighted_deviation_matrix += (info_set_node.y_values[dev]/z) * dev.return_transform_matrix() + new_strategy = weighted_deviation_matrix[:,0] + for index in range(len(legal_actions)): + info_state_policy[legal_actions[index]] = new_strategy[index] + + #Full regret matching by finding the least squares solution to the fixed point + #Last row of matrix and the column entry ensures the solution is a strategy (otherwise would have to normalise) + elif z > 0: + num_actions = len(info_set_node.legal_actions) + weighted_deviation_matrix = -np.eye(num_actions) + + #Calculate the + for dev in list(info_set_node.y_values.keys()): + weighted_deviation_matrix += (info_set_node.y_values[dev]/z) * dev.return_transform_matrix() + + normalisation_row = np.ones(num_actions) + weighted_deviation_matrix = np.vstack([weighted_deviation_matrix, normalisation_row]) + b = np.zeros(num_actions+1) + b[num_actions] = 1 + b = np.reshape(b, (num_actions+1, 1)) + + strategy = lstsq(weighted_deviation_matrix, b)[0] + normalised_strategy = strategy + #Adopt same cutting strategy as author's code + normalised_strategy[np.where(normalised_strategy<0)] = 0 + normalised_strategy[np.where(normalised_strategy>1)] = 1 + + #Should be irrelavant + normalised_strategy = normalised_strategy/sum(normalised_strategy) + for index in range(len(normalised_strategy)): + info_state_policy[info_set_node.legal_actions[index]] = normalised_strategy[index] + #Use a uniform strategy as sum of all regrets is negative + else: + for index in range(len(legal_actions)): + 
info_state_policy[legal_actions[index]] = 1.0 / len(legal_actions) + + return info_state_policy + +def _update_average_policy(average_policy, info_state_nodes): + """Updates in place `average_policy` to the average of all policies iterated. + + This function is a module level function to be reused by both CFRSolver and + CFRBRSolver. + + Args: + average_policy: A `policy.TabularPolicy` to be updated in-place. + info_state_nodes: A dictionary {`info_state_str` -> `_InfoStateNode`}. + """ + for info_state, info_state_node in info_state_nodes.items(): + info_state_policies_sum = info_state_node.cumulative_policy + state_policy = average_policy.policy_for_key(info_state) + probabilities_sum = sum(info_state_policies_sum.values()) + if probabilities_sum == 0: + num_actions = len(info_state_node.legal_actions) + for action in info_state_node.legal_actions: + state_policy[action] = 1 / num_actions + else: + for action, action_prob_sum in info_state_policies_sum.items(): + state_policy[action] = action_prob_sum / probabilities_sum + +def strat_dict_to_array(sd): + actions = list(sd.keys()) + strategy = np.zeros((len(actions),1)) + for action in range(len(actions)): + strategy[action][0] = sd[actions[action]] + return strategy + +def array_to_strat_dict(sa, legal_actions): + sd = {} + for action in legal_actions: + sd[action] = sa[action] + return sd + +def create_probs_from_index(indices, current_policy): + path_to_state = [] + if indices == None or len(indices) == 0: + return [] + for index in indices: + strat_dict = array_to_strat_dict(current_policy.action_probability_array[index[1]], index[0]) + path_to_state.append(strat_dict) + return path_to_state + + +#Deviation set definitions +def return_blind_action(num_actions, history, _): + """ + Returns an array of all Blind Action deviations with respect to an information set. + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all Blind Action deviations that are realizable at the + information set. + """ + memory_weights = [np.full(len(history), 1)] + prior_actions_in_memory = history + return return_all_external_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + +def return_informed_action(num_actions, history, _): + """ + Returns an array of all Informed Action deviations with respect to an information set. + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all Informed Action deviations that are realizable at the + information set. + """ + memory_weights = [np.full(len(history), 1)] + prior_actions_in_memory = history + return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + +def return_blind_CF(num_actions, history, _): + """ + Returns an array of all Blind Counterfactual deviations with respect to an information set. + Note: EFR using only Blind Counterfactual deviations is equivalent to vanilla Counterfactual + Regret Minimisation (CFR). + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all Blind CF deviations that are realizable at the + information set. 
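Concretely (an illustrative aside), a Blind CF deviation is an external deviation that redirects all probability to a single target action. The transform that LocalSwapTransform, defined later in this file, builds when is_external=True behaves as follows for three legal actions and target action 1:

import numpy as np
transform = np.zeros((3, 3))
transform[1] = np.ones(3)            # mirrors matrix_transform[target] = np.ones(actionsNum)
strategy = np.array([[0.2], [0.5], [0.3]])
transform @ strategy                 # -> [[0.], [1.], [0.]]: all mass moved to action 1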
+ """ + memory_weights = [None] + prior_actions_in_memory = np.zeros(len(history)) + return return_all_external_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + +def return_informed_CF(num_actions, history, _): + memory_weights = [None] + prior_actions_in_memory = history + return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + +def return_blind_partial_sequence(num_actions, history, _): + """ + Returns an array of all Blind Partial Sequence deviations (BPS) with respect to an information set + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all BPS deviations that are realizable at the + information set. + """ + prior_actions_in_memory = history + memory_weights = [None] + if len(history)>0: + memory_weights.append(np.ones(len(history))) + for i in range(len(history)): + possible_memory_weight = np.zeros(len(history)) + possible_memory_weight[0:i] = np.full(i, 1.0) + memory_weights.append(possible_memory_weight) + return return_all_external_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + +def return_cf_partial_sequence(num_actions, history, _): + """ + Returns an array of all Counterfactual Partial Sequence deviations (CFPS) with respect to an information set + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all CFPS deviations that are realizable at the + information set. + """ + prior_actions_in_memory = history + memory_weights = [None] + if len(history)>0: + memory_weights.append(np.ones(len(history))) + for i in range(len(history)): + possible_memory_weight = np.zeros(len(history)) + possible_memory_weight[0:i] = np.full(i, 1.0) + memory_weights.append(possible_memory_weight) + return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + +def return_cs_partial_sequence(num_actions, history, prior_legal_actions): + """ + Returns an array of all Casual Partial Sequence deviations with respect to an information set. + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior + prior_legal_actions: an array containing the index in .... that + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all Casual Partial Sequence deviations that are realizable at the + information set. + """ + prior_actions_in_memory = history + external_memory_weights = [None] + + for i in range(len(history)): + possible_memory_weight = np.zeros(len(history)) + possible_memory_weight[0:i] = np.full(i, 1.0) + external_memory_weights.append(possible_memory_weight) + + external = return_all_external_modified_deviations(num_actions, external_memory_weights, prior_legal_actions,prior_actions_in_memory, history) + internal = return_blind_action(num_actions, history, None) + + cf_ext = return_informed_CF(num_actions, history, None) + cf_int = return_blind_CF(num_actions, history, None) + + return np.concatenate((external, internal, cf_ext, cf_int)) + +def return_cs_partial_sequence_orginal(num_actions, history, prior_legal_actions): + """ + Returns an array of all Casual Partial Sequence deviations with respect to an information set. 
+ Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior + prior_legal_actions: an array containing the index in .... that + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all Casual Partial Sequence deviations that are realizable at the + information set. + """ + prior_actions_in_memory = history + external_memory_weights = [None] + + for i in range(len(history)): + possible_memory_weight = np.zeros(len(history)) + possible_memory_weight[0:i] = np.full(i, 1.0) + external_memory_weights.append(possible_memory_weight) + + external = return_all_external_modified_deviations(num_actions, external_memory_weights, prior_legal_actions,prior_actions_in_memory, history) + internal = return_informed_action(num_actions, history, None) + + cf_ext = return_informed_CF(num_actions, history, None) + return np.concatenate((external, internal, cf_ext)) + +def return_twice_informed_partial_sequence(num_actions, history, prior_legal_actions): + """ + Returns an array of all Twice Informed Partial Sequence (TIPS) deviations with respect to an information set. + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior + prior_legal_actions: an array containing the index in .... that + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all TIPS deviations that are realizable at the + information set. + """ + prior_actions_in_memory = history + memory_weights = [None] + + for i in range(len(history)): + possible_memory_weight = np.zeros(len(history)) + possible_memory_weight[0:i] = np.full(i, 1.0) + memory_weights.append(possible_memory_weight) + + internal = return_all_internal_modified_deviations(num_actions, memory_weights, prior_legal_actions, prior_actions_in_memory, history) + + cf_int = return_informed_CF(num_actions, history, None) + return np.concatenate((internal, cf_int)) + +def generate_all_action_permutations(current_stem, remaining_actions): + if len(remaining_actions) == 0: + return [np.array(current_stem)] + else: + next_actions = remaining_actions[0] + permutations = [] + for action in next_actions: + next_stem = current_stem.copy() + next_stem.append(action) + next_remaining_actions = remaining_actions[1:] + prev_permutations = generate_all_action_permutations(next_stem ,next_remaining_actions) + for i in prev_permutations: + permutations.append(i) + return permutations +#Includes identity +def return_behavourial(num_actions, history, prior_legal_actions): + deviations = [] + if len(history) == 0: + internal = return_all_non_identity_internal_deviations(num_actions,[None], [None], history) + for i in internal: + deviations.append(i) + else: + for deviation_info in range(len(history)): + prior_possible_memory_actions = generate_all_action_permutations([],prior_legal_actions[:deviation_info+1]) + memory_weights = np.concatenate((np.ones(deviation_info), np.zeros(len(history) - deviation_info))) + for prior_memory_actions in prior_possible_memory_actions: + prior_memory_actions = np.concatenate((prior_memory_actions, np.zeros(len(history) - len(prior_memory_actions)))) + for i in range (len(history) - len(prior_memory_actions)): + prior_memory_actions.append(0) + prior_memory_actions_cp = prior_memory_actions.copy() + internal = return_all_non_identity_internal_deviations(num_actions,[memory_weights], prior_memory_actions_cp, prior_memory_actions_cp) + for i in internal: + deviations.append(i) 
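An illustrative call to the helper defined above, with hypothetical inputs: generate_all_action_permutations enumerates every joint assignment of the prior legal actions, i.e. their Cartesian product, and return_behavourial then pairs each assignment with a memory weighting and adds one internal deviation per non-identity (source, target) pair.

generate_all_action_permutations([], [[0, 1], [0, 1]])
# -> [array([0, 0]), array([0, 1]), array([1, 0]), array([1, 1])]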
+ + return deviations + + +class LocalDeviationWithTimeSelection(object): + localSwapTransform = attr.ib() + + #Which actions have been forgotten (0) or remembered (1) according to the memory state + prior_actions_weight = attr.ib() + + #Which actions have been take according to the memory state + prior_memory_actions = attr.ib() + + use_unmodified_history = attr.ib() + + def __init__(self, target, source, num_actions, prior_actions_weight, prior_memory_actions, is_external, use_unmodified_history = True): + """" + Args: + target: the action that will be played when the deviation is triggered + source: the action that will trigger the target action if (used only by internal deviations, i.e is_external = False) + num_actions: the integer of actions + prior_actions_weight: + is_external: a boolean use to determine whether to create an internal or external type deviation + use_unmodified_history: + """ + self.localSwapTransform = LocalSwapTransform(target, source, num_actions, is_external = is_external) + self.prior_actions_weight = prior_actions_weight + self.prior_memory_actions = prior_memory_actions + self.use_unmodified_history = use_unmodified_history + + #If a pure strategy, a pure strategy will be returned (aka function works for both actions and strategies as input) + def deviate(self,strategy): + return self.localSwapTransform.deviate(strategy) + def return_transform_matrix(self): + return self.localSwapTransform.matrix_transform + def player_deviation_reach_probability(self, prior_possible_action_probabilities): + try: + if self.prior_actions_weight == None: + return 1.0 + elif self.prior_memory_actions == None: + return 1.0 + except: + try: + if prior_possible_action_probabilities == None: + return 1.0 + except: + try: + if self.prior_memory_actions == None: + return 1.0 + except: + pass + + memory_action_probabilities = np.ones(len(self.prior_actions_weight)) + #Reconstruct memory probabilities from history provided to the deviation to reach info set and the current memory probs + memory_weightings = self.prior_actions_weight.copy() + if self.use_unmodified_history: + for state in range(len(self.prior_memory_actions)): + if not self.prior_actions_weight[state] == 0: + #Append this, create an array of these and multiply (migt need to cast to an np array) + #print(prior_possible_action_probabilities) + #print(self.prior_memory_actions) + memory_action_probabilities[state] = (prior_possible_action_probabilities[state][self.prior_memory_actions[state]]) + else: + memory_action_probabilities[state] = 1 + memory_weightings[state] = 1 + path_probability = np.multiply(memory_weightings, memory_action_probabilities) + memory_reach_probability = np.prod(path_probability) + return memory_reach_probability + def __eq__(self,other): + if self.localSwapTransform == other.localSwapTransform: + return True + else: + return False + def __hash__(self): + return hash(self.localSwapTransform) + +#Methods to return all +def return_all_non_identity_internal_deviations(num_actions, possible_prior_weights, prior_memory_actions, history): + deviations = [] + for prior_actions_weight in possible_prior_weights: + for target in range(num_actions): + for source in range(num_actions): + if not source == target: + deviations.append(LocalDeviationWithTimeSelection(target, source, num_actions, prior_actions_weight, prior_memory_actions, False)) + return deviations + +#EXCLUDES IDENTITY +def return_all_internal_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, 
prior_memory_actions, history): + deviations = [] + for prior_actions_weight in possible_prior_weights: + try: + modificationIndex = np.where(prior_actions_weight == 0)[0][0] + except: + modificationIndex = 0 + if modificationIndex == len(prior_memory_actions): + for target in range(num_actions): + for source in range(num_actions): + if not source == target: + deviations.append(LocalDeviationWithTimeSelection(target, source, num_actions, prior_actions_weight, prior_memory_actions, False)) + else: + previous_action = prior_memory_actions[modificationIndex] + for alt_action in possible_prior_memory_actions[modificationIndex]: + prior_memory_actions[modificationIndex] = alt_action + for target in range(num_actions): + for source in range(num_actions): + if not source == target: + deviations.append(LocalDeviationWithTimeSelection(target, source, num_actions, prior_actions_weight, prior_memory_actions.copy(), False)) + prior_memory_actions[modificationIndex] = previous_action + return deviations + +def return_all_external_deviations(num_actions, possible_prior_weights, prior_memory_actions, history): + deviations = [] + for prior_actions_weight in possible_prior_weights: + for target in range(num_actions): + deviations.append(LocalDeviationWithTimeSelection(target, target, num_actions, prior_actions_weight, prior_memory_actions, True)) + return deviations + +#Modify last action as required +def return_all_external_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions, history): + deviations = [] + for prior_actions_weight in possible_prior_weights: + try: + modificationIndex = np.where(prior_actions_weight == 0)[0][0] + except: + modificationIndex = 0 + if modificationIndex == len(prior_memory_actions): + for target in range(num_actions): + deviations.append(LocalDeviationWithTimeSelection(target, target, num_actions, prior_actions_weight, prior_memory_actions, True)) + else: + previous_action = prior_memory_actions[modificationIndex] + for alt_action in possible_prior_memory_actions[modificationIndex]: + prior_memory_actions[modificationIndex] = alt_action + for target in range(num_actions): + deviations.append(LocalDeviationWithTimeSelection(target, target, num_actions, prior_actions_weight, prior_memory_actions.copy(), True)) + prior_memory_actions[modificationIndex] = previous_action + return deviations + +def return_identity_deviation(num_actions, possible_prior_weights, prior_memory_actions, history): + deviations = [] + for prior_actions_weight in possible_prior_weights: + deviations.append(LocalDeviationWithTimeSelection(0, 0, num_actions, prior_actions_weight, prior_memory_actions, False)) + return deviations + + +#A swap transformation given by the matrix_transform for an information state of +class LocalSwapTransform(object): + sourceAction = attr.ib() + targetAction = attr.ib() + matrix_transform = attr.ib() + actionsNum = attr.ib() + is_external = attr.ib() + + def __init__(self, target,source,actionsNum, is_external = True): + self.sourceAction = source + self.targetAction = target + self.actionsNum = actionsNum + #A + if is_external: + self.sourceAction = None + self.matrix_transform = np.zeros((actionsNum,actionsNum)) + self.matrix_transform[target] = np.ones(actionsNum) + else: + self.matrix_transform = np.eye(actionsNum) + self.matrix_transform[target][source] = 1 + self.matrix_transform[source][source] = 0 + def __repr__(self) -> str: + return "Shifting probabilty from Action: "+str(self.sourceAction) +" to Action: 
"+str(self.targetAction) + def __eq__(self, __o: object) -> bool: + if self.sourceAction == __o.sourceAction and self.targetAction == __o.targetAction and self.actionsNum == __o.actionsNum: + return True + else: + return False + def __hash__(self): + separator = "£$" + return hash(str(self.sourceAction)+separator+str(self.targetAction)+separator+str(self.actionsNum)+ separator +str(self.is_external)) + #If a pure strategy, a pure strategy will be returned (aka function works for both actions and strategies as input) + def deviate(self,strategy): + """ + Returns the + + """ + return np.matmul(self.matrix_transform, strategy) \ No newline at end of file From c21c7e2fd9611ec2068f1bd183dce154128f25d8 Mon Sep 17 00:00:00 2001 From: James Flynn Date: Mon, 24 Jul 2023 21:52:21 +0100 Subject: [PATCH 02/18] Removed discounting --- open_spiel/python/algorithms/efr.py | 165 +++++++++++----------------- 1 file changed, 62 insertions(+), 103 deletions(-) diff --git a/open_spiel/python/algorithms/efr.py b/open_spiel/python/algorithms/efr.py index 0760aaeab1..da4ec0a8bd 100644 --- a/open_spiel/python/algorithms/efr.py +++ b/open_spiel/python/algorithms/efr.py @@ -1,4 +1,4 @@ -# Copyright 2023 DeepMind Technologies Limited +# Copyright 2019 DeepMind Technologies Limited # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,10 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. - #Modified: 2023 James Flynn #Original: https://github.com/deepmind/open_spiel/blob/master/open_spiel/python/algorithms/cfr.py - """Python implementation of the counterfactual regret minimization algorithm. One iteration of CFR consists of: @@ -25,7 +23,6 @@ The average policy is what converges to a Nash Equilibrium. 
""" -import collections import attr import copy import numpy as np @@ -48,18 +45,16 @@ class _InfoStateNode(object): #An array representing history = attr.ib() - updates = attr.ib() - updated = attr.ib() - cumulative_regret = attr.ib(factory=lambda: collections.defaultdict(float)) + cumulative_regret = attr.ib(factory=lambda: defaultdict(float)) # Same as above for the cumulative of the policy probabilities computed # during the policy iterations - cumulative_policy = attr.ib(factory=lambda: collections.defaultdict(float)) - y_values = attr.ib(factory=lambda: collections.defaultdict(float)) + cumulative_policy = attr.ib(factory=lambda: defaultdict(float)) + y_values = attr.ib(factory=lambda: defaultdict(float)) class _EFRSolverBase(object): - def __init__(self, game, _deviation_gen, discounting, discounting_parameters): + def __init__(self, game, _deviation_gen): assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, () self._game = game @@ -72,19 +67,16 @@ def __init__(self, game, _deviation_gen, discounting, discounting_parameters): self._deviation_gen = _deviation_gen self._info_state_nodes = {} - hist = {player : [] for player in range(self._num_players)} - self._initialize_info_state_nodes(self._root_node, hist, [[] for _ in range(self._num_players)],[[] for _ in range(self._num_players)]) + hist = {player: [] for player in range(self._num_players)} + unif_probs = [[] for _ in range(self._num_players)], + empty_path_indices = [[] for _ in range(self._num_players)] + self._initialize_info_state_nodes(self._root_node, hist, unif_probs, empty_path_indices) self._iteration = 1 # For possible linear-averaging. - self.discounting = discounting - self.alpha = discounting_parameters[0] - self.beta = discounting_parameters[1] - self.gamma = discounting_parameters[2] - - self._str_to_action = {} def return_cumulative_regret(self): return {list(self._info_state_nodes.keys())[i]: list(self._info_state_nodes.values())[i].cumulative_regret for i in range(len(self._info_state_nodes.keys()))} + def current_policy(self): return self._current_policy @@ -111,9 +103,7 @@ def _initialize_info_state_nodes(self, state, history, uniform_probs_to_state,pa index_in_tabular_policy=self._current_policy.state_lookup[info_state], relizable_deviations = None, history = history[current_player].copy(), - current_history_probs = copy.deepcopy(path_indices[current_player]), - updates = 0, - updated = False + current_history_probs = copy.deepcopy(path_indices[current_player]) ) prior_possible_actions = [] for i in range(len(info_state_node.current_history_probs)): @@ -155,22 +145,13 @@ def _update_current_policy(self,state, current_policy): #change too infostate mem_reach_probs = create_probs_from_index(info_state_node.current_history_probs, current_policy) deviation_reach_prob = deviations[devation].player_deviation_reach_probability(mem_reach_probs) - accum_regret_discount = 1 - if self.discounting == True: - #No point in discounting 0 regret - if info_state_node.y_values[deviations[devation]]>0: - talpha = self._iteration**self.alpha - accum_regret_discount = talpha/(talpha+1) - info_state_node.y_values[deviations[devation]] = info_state_node.y_values[deviations[devation]]*accum_regret_discount + max(0,info_state_node.cumulative_regret[devation])*deviation_reach_prob + info_state_node.y_values[deviations[devation]] = info_state_node.y_values[deviations[devation]] + max(0,info_state_node.cumulative_regret[devation])*deviation_reach_prob #Might be incorrect state_policy = 
current_policy.policy_for_key(info_state) #print for action, value in self._regret_matching(info_state_node.legal_actions, info_state_node).items(): state_policy[action] = value - info_state_node.updated = True - - info_state_node.updates +=1 for action in info_state_node.legal_actions: new_state = state.child(action) @@ -220,7 +201,7 @@ def _compute_cumulative_immediate_regret_for_player(self, state, policies, info_state_node = self._info_state_nodes[info_state] #Reset y values - info_state_node.y_values = collections.defaultdict(float) + info_state_node.y_values = defaultdict(float) if policies is None: info_state_policy = self._get_infostate_policy(info_state) else: @@ -229,10 +210,7 @@ def _compute_cumulative_immediate_regret_for_player(self, state, policies, reach_prob = reach_probabilities[current_player] for action in state.legal_actions(): action_prob = info_state_policy.get(action, 0.) - current_policy_discount = 1.0 - if self.discounting == True: - current_policy_discount = (self._iteration/self._iteration)**self.gamma - info_state_node.cumulative_policy[action] = info_state_node.cumulative_policy[action]*current_policy_discount + action_prob * reach_prob + info_state_node.cumulative_policy[action] = info_state_node.cumulative_policy[action] + action_prob * reach_prob new_state = state.child(action) new_reach_probabilities = reach_probabilities.copy() assert action_prob <= 1 @@ -257,7 +235,7 @@ def _compute_cumulative_immediate_regret_for_player(self, state, policies, memory_reach_probs = create_probs_from_index(info_state_node.current_history_probs,self.current_policy()) player_current_memory_reach_prob = deviation.player_deviation_reach_probability(memory_reach_probs) - deviation_regret = player_current_memory_reach_prob *((devation_cf_value*counterfactual_reach_prob) - (counterfactual_reach_prob * state_value_for_player)) + deviation_regret = player_current_memory_reach_prob * ((devation_cf_value*counterfactual_reach_prob) - (counterfactual_reach_prob * state_value_for_player)) info_state_node.cumulative_regret[deviationIndex] += deviation_regret return state_value @@ -277,8 +255,8 @@ def __get_infostate_policy_array(self, info_state_str): info_state_node.index_in_tabular_policy] class _EFRSolver(_EFRSolverBase): - def __init__(self, game, _deviation_gen, discounting, discounting_parameters): - super().__init__(game, _deviation_gen, discounting, discounting_parameters) + def __init__(self, game, _deviation_gen): + super().__init__(game, _deviation_gen) def evaluate_and_update_policy(self): """Performs a single step of policy evaluation and policy improvement.""" @@ -287,12 +265,11 @@ def evaluate_and_update_policy(self): policies=None, reach_probabilities=np.ones(self._game.num_players() + 1), player=None) - history = [ [] for _ in range(self._num_players)] - self._update_current_policy(self._root_node,self._current_policy) - self._iteration+= 1 + self._update_current_policy(self._root_node, self._current_policy) + self._iteration += 1 class EFRSolver(_EFRSolver): - def __init__(self, game, deviations_name, discounting = False, discounting_parameters = [1,1,1]): + def __init__(self, game, deviations_name): #Takes the deviation sets used for learning from Deviation_Sets external_only = False @@ -308,8 +285,6 @@ def __init__(self, game, deviations_name, discounting = False, discounting_param external_only = True elif deviations_name == "informed cf" or deviations_name == "informed counterfactual": deviation_sets = return_informed_CF - elif deviations_name == "swap cf" or 
deviations_name == "swap counterfactual": - deviation_sets = return_swap_cf elif deviations_name == "bps" or deviations_name == "blind partial sequence": deviation_sets = return_blind_partial_sequence external_only = True @@ -324,11 +299,7 @@ def __init__(self, game, deviations_name, discounting = False, discounting_param else: print("Unsupported Deviation Set") return None - super(EFRSolver, self).__init__(game, - _deviation_gen = deviation_sets, - discounting = discounting, - discounting_parameters = discounting_parameters - ) + super(EFRSolver, self).__init__(game, _deviation_gen=deviation_sets) self._external_only = external_only def _regret_matching(self, legal_actions, info_set_node): """Returns an info state policy by applying regret-matching. @@ -357,7 +328,6 @@ def _regret_matching(self, legal_actions, info_set_node): num_actions = len(info_set_node.legal_actions) weighted_deviation_matrix = -np.eye(num_actions) - #Calculate the for dev in list(info_set_node.y_values.keys()): weighted_deviation_matrix += (info_set_node.y_values[dev]/z) * dev.return_transform_matrix() @@ -368,20 +338,18 @@ def _regret_matching(self, legal_actions, info_set_node): b = np.reshape(b, (num_actions+1, 1)) strategy = lstsq(weighted_deviation_matrix, b)[0] - normalised_strategy = strategy - #Adopt same cutting strategy as author's code - normalised_strategy[np.where(normalised_strategy<0)] = 0 - normalised_strategy[np.where(normalised_strategy>1)] = 1 - - #Should be irrelavant - normalised_strategy = normalised_strategy/sum(normalised_strategy) - for index in range(len(normalised_strategy)): - info_state_policy[info_set_node.legal_actions[index]] = normalised_strategy[index] + + #Adopt same cutting strategy as paper author's code + strategy[np.where(strategy<0)] = 0 + strategy[np.where(strategy>1)] = 1 + + strategy = strategy/sum(strategy) + for index in range(len(strategy)): + info_state_policy[info_set_node.legal_actions[index]] = strategy[index] #Use a uniform strategy as sum of all regrets is negative else: for index in range(len(legal_actions)): info_state_policy[legal_actions[index]] = 1.0 / len(legal_actions) - return info_state_policy def _update_average_policy(average_policy, info_state_nodes): @@ -405,7 +373,8 @@ def _update_average_policy(average_policy, info_state_nodes): else: for action, action_prob_sum in info_state_policies_sum.items(): state_policy[action] = action_prob_sum / probabilities_sum - + + def strat_dict_to_array(sd): actions = list(sd.keys()) strategy = np.zeros((len(actions),1)) @@ -413,12 +382,14 @@ def strat_dict_to_array(sd): strategy[action][0] = sd[actions[action]] return strategy + def array_to_strat_dict(sa, legal_actions): sd = {} for action in legal_actions: sd[action] = sa[action] return sd + def create_probs_from_index(indices, current_policy): path_to_state = [] if indices == None or len(indices) == 0: @@ -622,10 +593,10 @@ def return_behavourial(num_actions, history, prior_legal_actions): memory_weights = np.concatenate((np.ones(deviation_info), np.zeros(len(history) - deviation_info))) for prior_memory_actions in prior_possible_memory_actions: prior_memory_actions = np.concatenate((prior_memory_actions, np.zeros(len(history) - len(prior_memory_actions)))) - for i in range (len(history) - len(prior_memory_actions)): + for i in range(len(history) - len(prior_memory_actions)): prior_memory_actions.append(0) prior_memory_actions_cp = prior_memory_actions.copy() - internal = return_all_non_identity_internal_deviations(num_actions,[memory_weights], 
prior_memory_actions_cp, prior_memory_actions_cp) + internal = return_all_non_identity_internal_deviations(num_actions, [memory_weights], prior_memory_actions_cp, prior_memory_actions_cp) for i in internal: deviations.append(i) @@ -644,19 +615,19 @@ class LocalDeviationWithTimeSelection(object): use_unmodified_history = attr.ib() def __init__(self, target, source, num_actions, prior_actions_weight, prior_memory_actions, is_external, use_unmodified_history = True): - """" - Args: - target: the action that will be played when the deviation is triggered - source: the action that will trigger the target action if (used only by internal deviations, i.e is_external = False) - num_actions: the integer of actions - prior_actions_weight: - is_external: a boolean use to determine whether to create an internal or external type deviation - use_unmodified_history: - """ - self.localSwapTransform = LocalSwapTransform(target, source, num_actions, is_external = is_external) - self.prior_actions_weight = prior_actions_weight - self.prior_memory_actions = prior_memory_actions - self.use_unmodified_history = use_unmodified_history + """" + Args: + target: the action that will be played when the deviation is triggered + source: the action that will trigger the target action if (used only by internal deviations, i.e is_external = False) + num_actions: the integer of actions + prior_actions_weight: + is_external: a boolean use to determine whether to create an internal or external type deviation + use_unmodified_history: + """ + self.localSwapTransform = LocalSwapTransform(target, source, num_actions, is_external = is_external) + self.prior_actions_weight = prior_actions_weight + self.prior_memory_actions = prior_memory_actions + self.use_unmodified_history = use_unmodified_history #If a pure strategy, a pure strategy will be returned (aka function works for both actions and strategies as input) def deviate(self,strategy): @@ -665,30 +636,17 @@ def return_transform_matrix(self): return self.localSwapTransform.matrix_transform def player_deviation_reach_probability(self, prior_possible_action_probabilities): try: - if self.prior_actions_weight == None: - return 1.0 - elif self.prior_memory_actions == None: - return 1.0 + if self.prior_actions_weight == None or self.prior_memory_actions == None or prior_possible_action_probabilities: + return 1.0 except: - try: - if prior_possible_action_probabilities == None: - return 1.0 - except: - try: - if self.prior_memory_actions == None: - return 1.0 - except: - pass - + return 1.0 + memory_action_probabilities = np.ones(len(self.prior_actions_weight)) #Reconstruct memory probabilities from history provided to the deviation to reach info set and the current memory probs memory_weightings = self.prior_actions_weight.copy() if self.use_unmodified_history: for state in range(len(self.prior_memory_actions)): if not self.prior_actions_weight[state] == 0: - #Append this, create an array of these and multiply (migt need to cast to an np array) - #print(prior_possible_action_probabilities) - #print(self.prior_memory_actions) memory_action_probabilities[state] = (prior_possible_action_probabilities[state][self.prior_memory_actions[state]]) else: memory_action_probabilities[state] = 1 @@ -696,7 +654,8 @@ def player_deviation_reach_probability(self, prior_possible_action_probabilities path_probability = np.multiply(memory_weightings, memory_action_probabilities) memory_reach_probability = np.prod(path_probability) return memory_reach_probability - def __eq__(self,other): + + def 
__eq__(self, other): if self.localSwapTransform == other.localSwapTransform: return True else: @@ -705,7 +664,7 @@ def __hash__(self): return hash(self.localSwapTransform) #Methods to return all -def return_all_non_identity_internal_deviations(num_actions, possible_prior_weights, prior_memory_actions, history): +def return_all_non_identity_internal_deviations(num_actions, possible_prior_weights, prior_memory_actions, _): deviations = [] for prior_actions_weight in possible_prior_weights: for target in range(num_actions): @@ -715,7 +674,7 @@ def return_all_non_identity_internal_deviations(num_actions, possible_prior_weig return deviations #EXCLUDES IDENTITY -def return_all_internal_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions, history): +def return_all_internal_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions, _): deviations = [] for prior_actions_weight in possible_prior_weights: try: @@ -738,7 +697,7 @@ def return_all_internal_modified_deviations(num_actions, possible_prior_weights prior_memory_actions[modificationIndex] = previous_action return deviations -def return_all_external_deviations(num_actions, possible_prior_weights, prior_memory_actions, history): +def return_all_external_deviations(num_actions, possible_prior_weights, prior_memory_actions, _): deviations = [] for prior_actions_weight in possible_prior_weights: for target in range(num_actions): @@ -746,7 +705,7 @@ def return_all_external_deviations(num_actions, possible_prior_weights, prior_m return deviations #Modify last action as required -def return_all_external_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions, history): +def return_all_external_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions, _): deviations = [] for prior_actions_weight in possible_prior_weights: try: @@ -765,7 +724,7 @@ def return_all_external_modified_deviations(num_actions, possible_prior_weights prior_memory_actions[modificationIndex] = previous_action return deviations -def return_identity_deviation(num_actions, possible_prior_weights, prior_memory_actions, history): +def return_identity_deviation(num_actions, possible_prior_weights, prior_memory_actions, _): deviations = [] for prior_actions_weight in possible_prior_weights: deviations.append(LocalDeviationWithTimeSelection(0, 0, num_actions, prior_actions_weight, prior_memory_actions, False)) @@ -801,12 +760,12 @@ def __eq__(self, __o: object) -> bool: else: return False def __hash__(self): - separator = "£$" + separator = " " return hash(str(self.sourceAction)+separator+str(self.targetAction)+separator+str(self.actionsNum)+ separator +str(self.is_external)) #If a pure strategy, a pure strategy will be returned (aka function works for both actions and strategies as input) def deviate(self,strategy): """ - Returns the + Returns the deviation strategy """ - return np.matmul(self.matrix_transform, strategy) \ No newline at end of file + return np.matmul(self.matrix_transform, strategy) From a79389279be4ccc716e0744437108c8bde1b9c6e Mon Sep 17 00:00:00 2001 From: James Flynn Date: Mon, 24 Jul 2023 22:06:19 +0100 Subject: [PATCH 03/18] Updated algorithm doc --- docs/algorithms.md | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/algorithms.md b/docs/algorithms.md index 0bc7d66b1e..4714d9d0ff 100644 --- a/docs/algorithms.md +++ b/docs/algorithms.md @@ -23,6 
+23,7 @@ CFR against a best responder (CFR-BR) | Tabular Exploitability / Best response | Tabular | [Shoham & Leyton-Brown '09](http://masfoundations.org/) | ![](_static/green_circ10.png "green circle") External sampling Monte Carlo CFR | Tabular | [Lanctot et al. '09](http://mlanctot.info/files/papers/nips09mccfr.pdf), [Lanctot '13](http://mlanctot.info/files/papers/PhD_Thesis_MarcLanctot.pdf) | ![](_static/green_circ10.png "green circle") Fixed Strategy Iteration CFR (FSICFR) | Tabular | [Neller & Hnath '11](https://cupola.gettysburg.edu/csfac/2/) | ~ +Extensive-form Regret Minimization | Tabular | [Morrill et. al. '22](https://arxiv.org/abs/2102.06973) | ~ Mean-field Ficticious Play for MFG | Tabular | [Perrin et. al. '20](https://arxiv.org/abs/2007.03458) | ~ Online Mirror Descent for MFG | Tabular | [Perolat et. al. '21](https://arxiv.org/abs/2103.00623) | ~ Munchausen Online Mirror Descent for MFG | Tabular | [Lauriere et. al. '22](https://arxiv.org/pdf/2203.11973) | ~ From 4bd81a8a9077697e62df6062c19b08545bc9fcaf Mon Sep 17 00:00:00 2001 From: Jameswflynn1 Date: Mon, 24 Jul 2023 22:41:18 +0100 Subject: [PATCH 04/18] Added initial test --- open_spiel/python/algorithms/efr_test.py | 112 +++++++++++++++++++++++ 1 file changed, 112 insertions(+) create mode 100644 open_spiel/python/algorithms/efr_test.py diff --git a/open_spiel/python/algorithms/efr_test.py b/open_spiel/python/algorithms/efr_test.py new file mode 100644 index 0000000000..8cfa3a7628 --- /dev/null +++ b/open_spiel/python/algorithms/efr_test.py @@ -0,0 +1,112 @@ +# Copyright 2023 DeepMind Technologies Limited +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
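The tests below import exploitability alongside expected_game_score; although the shown checks only compare against the Kuhn poker Nash value, a typical convergence check for this kind of solver (a sketch, assuming a game and an efr_solver have been constructed and iterated as in the tests) is:

from open_spiel.python.algorithms import exploitability
conv = exploitability.exploitability(game, efr_solver.average_policy())
# conv approaches 0 as the average policy approaches a Nash equilibrium
# (for two-player zero-sum games such as Kuhn and Leduc poker).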
+ +"""Tests for open_spiel.python.algorithms.efr.""" + +import itertools + +from absl.testing import absltest +from absl.testing import parameterized +import numpy as np + +from open_spiel.python import policy +from open_spiel.python.algorithms import efr +from open_spiel.python.algorithms import expected_game_score +from open_spiel.python.algorithms import exploitability +import pyspiel + +_KUHN_GAME = pyspiel.load_game("kuhn_poker") +_LEDUC_GAME = pyspiel.load_game("leduc_poker") + +_KUHN_UNIFORM_POLICY = policy.TabularPolicy(_KUHN_GAME) +_LEDUC_UNIFORM_POLICY = policy.TabularPolicy(_LEDUC_GAME) + + +class ModuleLevelFunctionTest(absltest.TestCase): + + def test__update_current_policy(self): + game = pyspiel.load_game("kuhn_poker") + tabular_policy = policy.TabularPolicy(game) + + cumulative_regrets = np.arange(0, 12 * 2).reshape((12, 2)) + expected_policy = cumulative_regrets / np.sum( + cumulative_regrets, axis=-1, keepdims=True) + nodes_indices = { + u"0": 0, + u"0pb": 1, + u"1": 2, + u"1pb": 3, + u"2": 4, + u"2pb": 5, + u"1p": 6, + u"1b": 7, + u"2p": 8, + u"2b": 9, + u"0p": 10, + u"0b": 11, + } + # pylint: disable=g-complex-comprehension + info_state_nodes = { + key: efr._InfoStateNode( + legal_actions=[0, 1], + index_in_tabular_policy=None, + cumulative_regret=dict(enumerate(cumulative_regrets[index])), + cumulative_policy=None) for key, index in nodes_indices.items() + } + available_deviations = ["blind action", "informed action", "blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"] + + # pylint: enable=g-complex-comprehension + + efr._update_current_policy(tabular_policy, info_state_nodes) + + np.testing.assert_array_equal(expected_policy, + tabular_policy.action_probability_array) + + +class EFRTest(parameterized.TestCase, absltest.TestCase): + + @parameterized.parameters( + ["blind action", "informed action", "blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"]) + def test_policy_zero_is_uniform(self): + # We use Leduc and not Kuhn, because Leduc has illegal actions and Kuhn does + # not. + game = pyspiel.load_game("leduc_poker") + cfr_solver = efr._EFRSolver( + game, + deviations_name=deviations_name + ) + + np.testing.assert_array_equal( + _LEDUC_UNIFORM_POLICY.action_probability_array, + cfr_solver.current_policy().action_probability_array) + np.testing.assert_array_equal( + _LEDUC_UNIFORM_POLICY.action_probability_array, + cfr_solver.average_policy().action_probability_array) + + @parameterized.parameters( + ["blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"]) + def test_cfr_kuhn_poker(self): + game = pyspiel.load_game("kuhn_poker") + efr_solver = efr.EFRSolver(game) + for _ in range(300): + efr_solver.evaluate_and_update_policy() + average_policy = efr_solver.average_policy() + average_policy_values = expected_game_score.policy_value( + game.new_initial_state(), [average_policy] * 2) + # 1/18 is the Nash value. 
See https://en.wikipedia.org/wiki/Kuhn_poker + np.testing.assert_allclose( + average_policy_values, [-1 / 18, 1 / 18], atol=1e-3) + +if __name__ == "__main__": + absltest.main() From ed1267e049c441bd7d88c8eac21f1ce79412e8e6 Mon Sep 17 00:00:00 2001 From: Jameswflynn1 Date: Mon, 21 Aug 2023 00:59:07 +0100 Subject: [PATCH 05/18] Linted and more comments --- open_spiel/python/algorithms/efr.py | 990 +++++++++++++++------------- 1 file changed, 549 insertions(+), 441 deletions(-) diff --git a/open_spiel/python/algorithms/efr.py b/open_spiel/python/algorithms/efr.py index da4ec0a8bd..59a325d069 100644 --- a/open_spiel/python/algorithms/efr.py +++ b/open_spiel/python/algorithms/efr.py @@ -11,396 +11,450 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. -#Modified: 2023 James Flynn -#Original: https://github.com/deepmind/open_spiel/blob/master/open_spiel/python/algorithms/cfr.py -"""Python implementation of the counterfactual regret minimization algorithm. +# Modified: 2023 James Flynn +# Original: https://github.com/deepmind/open_spiel/blob/master/open_spiel/python/algorithms/cfr.py +"""Python implementation of the extensive-form regret minimization algorithm. -One iteration of CFR consists of: +One iteration of EFR consists of: 1) Compute current strategy from regrets (e.g. using Regret Matching). 2) Compute values using the current strategy 3) Compute regrets from these values -The average policy is what converges to a Nash Equilibrium. +The average policy converges to a Nash Equilibrium rather than the current policy as in CFR. """ - -import attr import copy -import numpy as np from collections import defaultdict +import attr -from open_spiel.python import policy +import numpy as np from scipy.linalg import lstsq + import pyspiel +from open_spiel.python import policy + @attr.s class _InfoStateNode(object): - """An object wrapping values associated to an information state.""" - # The list of the legal actions. - legal_actions = attr.ib() - index_in_tabular_policy = attr.ib() - # The newly availible deviations + the old ones - relizable_deviations = attr.ib() - #Player -> state -> action -> prob - current_history_probs = attr.ib() + """An object wrapping values associated to an information state.""" + # The list of the legal actions. 
+ legal_actions = attr.ib() + index_in_tabular_policy = attr.ib() + # The newly availible deviations + the old ones + relizable_deviations = attr.ib() + # Player -> state -> action -> prob + current_history_probs = attr.ib() - #An array representing - history = attr.ib() + # An array representing + history = attr.ib() - cumulative_regret = attr.ib(factory=lambda: defaultdict(float)) - # Same as above for the cumulative of the policy probabilities computed - # during the policy iterations - cumulative_policy = attr.ib(factory=lambda: defaultdict(float)) - y_values = attr.ib(factory=lambda: defaultdict(float)) + cumulative_regret = attr.ib(factory=lambda: defaultdict(float)) + # Same as above for the cumulative of the policy probabilities computed + # during the policy iterations + cumulative_policy = attr.ib(factory=lambda: defaultdict(float)) + y_values = attr.ib(factory=lambda: defaultdict(float)) class _EFRSolverBase(object): - def __init__(self, game, _deviation_gen): - assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, () - - self._game = game - self._num_players = game.num_players() - self._root_node = self._game.new_initial_state() - - # This is for returning the current policy and average policy to a caller - self._current_policy = policy.TabularPolicy(game) - self._average_policy = self._current_policy.__copy__() - self._deviation_gen = _deviation_gen - - self._info_state_nodes = {} - hist = {player: [] for player in range(self._num_players)} - unif_probs = [[] for _ in range(self._num_players)], - empty_path_indices = [[] for _ in range(self._num_players)] - self._initialize_info_state_nodes(self._root_node, hist, unif_probs, empty_path_indices) - - self._iteration = 1 # For possible linear-averaging. - - def return_cumulative_regret(self): - return {list(self._info_state_nodes.keys())[i]: list(self._info_state_nodes.values())[i].cumulative_regret for i in range(len(self._info_state_nodes.keys()))} - - def current_policy(self): - return self._current_policy - - def average_policy(self): - _update_average_policy(self._average_policy, self._info_state_nodes) - return self._average_policy - - def _initialize_info_state_nodes(self, state, history, uniform_probs_to_state,path_indices): - if state.is_terminal(): - return - - if state.is_chance_node(): - for action, unused_action_prob in state.chance_outcomes(): - self._initialize_info_state_nodes(state.child(action), history, uniform_probs_to_state, path_indices) - return - - current_player = state.current_player() - info_state = state.information_state_string(current_player) - info_state_node = self._info_state_nodes.get(info_state) - if info_state_node is None: - legal_actions = state.legal_actions(current_player) - info_state_node = _InfoStateNode( - legal_actions=legal_actions, - index_in_tabular_policy=self._current_policy.state_lookup[info_state], - relizable_deviations = None, - history = history[current_player].copy(), - current_history_probs = copy.deepcopy(path_indices[current_player]) - ) - prior_possible_actions = [] - for i in range(len(info_state_node.current_history_probs)): - prior_possible_actions.append(info_state_node.current_history_probs[i][0]) - prior_possible_actions.append(info_state_node.legal_actions) - - info_state_node.relizable_deviations = self._deviation_gen(len(info_state_node.legal_actions), info_state_node.history, prior_possible_actions) - self._info_state_nodes[info_state] = info_state_node - - legal_actions = state.legal_actions(current_player) - new_uniform_probs_to_state = 
copy.deepcopy(uniform_probs_to_state) - assert len(new_uniform_probs_to_state[current_player]) == len(history[current_player]) - - new_uniform_probs_to_state[current_player].append({legal_actions[i]: 1/len(legal_actions) for i in range(len(legal_actions))}) - for action in info_state_node.legal_actions: - #Speedup - new_path_indices = copy.deepcopy(path_indices) - new_path_indices[current_player].append([legal_actions, info_state_node.index_in_tabular_policy]) - #Speedup - new_history = copy.deepcopy(history) - new_history[current_player].append(action) - assert len(new_history[current_player]) == len(new_path_indices[current_player]) - - self._initialize_info_state_nodes(state.child(action), new_history, new_uniform_probs_to_state, new_path_indices) - - def _update_current_policy(self,state, current_policy): - """Updated in order so that memory reach probs are defined wrt to the new strategy - """ + def __init__(self, game, _deviation_gen): + assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, () + + self._game = game + self._num_players = game.num_players() + self._root_node = self._game.new_initial_state() + + # This is for returning the current policy and average policy to a caller + self._current_policy = policy.TabularPolicy(game) + self._average_policy = self._current_policy.__copy__() + self._deviation_gen = _deviation_gen + + self._info_state_nodes = {} + hist = {player: [] for player in range(self._num_players)} + unif_probs = [[] for _ in range(self._num_players)], + empty_path_indices = [[] for _ in range(self._num_players)] + self._initialize_info_state_nodes( + self._root_node, hist, unif_probs, empty_path_indices) + + self._iteration = 1 # For possible linear-averaging. + + def return_cumulative_regret(self): + return {list(self._info_state_nodes.keys())[i]: list(self._info_state_nodes.values())[i].cumulative_regret for i in range(len(self._info_state_nodes.keys()))} + + def current_policy(self): + return self._current_policy + + def average_policy(self): + _update_average_policy(self._average_policy, self._info_state_nodes) + return self._average_policy + + def _initialize_info_state_nodes(self, state, history, uniform_probs_to_state, path_indices): + if state.is_terminal(): + return + + if state.is_chance_node(): + for action, unused_action_prob in state.chance_outcomes(): + self._initialize_info_state_nodes(state.child( + action), history, uniform_probs_to_state, path_indices) + return + + current_player = state.current_player() + info_state = state.information_state_string(current_player) + info_state_node = self._info_state_nodes.get(info_state) + if info_state_node is None: + legal_actions = state.legal_actions(current_player) + info_state_node = _InfoStateNode( + legal_actions=legal_actions, + index_in_tabular_policy=self._current_policy.state_lookup[info_state], + relizable_deviations=None, + history=history[current_player].copy(), + current_history_probs=copy.deepcopy( + path_indices[current_player]) + ) + prior_possible_actions = [] + for i in range(len(info_state_node.current_history_probs)): + prior_possible_actions.append( + info_state_node.current_history_probs[i][0]) + prior_possible_actions.append(info_state_node.legal_actions) + + info_state_node.relizable_deviations = self._deviation_gen(len( + info_state_node.legal_actions), info_state_node.history, prior_possible_actions) + self._info_state_nodes[info_state] = info_state_node + + legal_actions = state.legal_actions(current_player) + new_uniform_probs_to_state = 
copy.deepcopy(uniform_probs_to_state) + assert len(new_uniform_probs_to_state[current_player]) == len( + history[current_player]) + + new_uniform_probs_to_state[current_player].append( + {legal_actions[i]: 1/len(legal_actions) for i in range(len(legal_actions))}) + for action in info_state_node.legal_actions: + # Speedup + new_path_indices = copy.deepcopy(path_indices) + new_path_indices[current_player].append( + [legal_actions, info_state_node.index_in_tabular_policy]) + # Speedup + new_history = copy.deepcopy(history) + new_history[current_player].append(action) + assert len(new_history[current_player]) == len( + new_path_indices[current_player]) + + self._initialize_info_state_nodes(state.child( + action), new_history, new_uniform_probs_to_state, new_path_indices) + + def _update_current_policy(self, state, current_policy): + """Updated in order so that memory reach probs are defined wrt to the new strategy + """ - if state.is_terminal(): - return - elif not state.is_chance_node(): - current_player = state.current_player() - info_state = state.information_state_string(current_player) - info_state_node = self._info_state_nodes[info_state] - deviations = info_state_node.relizable_deviations - #print(info_state) - for devation in range(len(deviations)): - #change too infostate - mem_reach_probs = create_probs_from_index(info_state_node.current_history_probs, current_policy) - deviation_reach_prob = deviations[devation].player_deviation_reach_probability(mem_reach_probs) - info_state_node.y_values[deviations[devation]] = info_state_node.y_values[deviations[devation]] + max(0,info_state_node.cumulative_regret[devation])*deviation_reach_prob - - #Might be incorrect - state_policy = current_policy.policy_for_key(info_state) - #print - for action, value in self._regret_matching(info_state_node.legal_actions, info_state_node).items(): - state_policy[action] = value - - for action in info_state_node.legal_actions: - new_state = state.child(action) - self._update_current_policy(new_state, current_policy) - else: - for action, action_prob in state.chance_outcomes(): - new_state = state.child(action) - self._update_current_policy(new_state, current_policy) - #Path to state probability ignores chance probabilty as this is stored as new_reach_probabilities[-1] - def _compute_cumulative_immediate_regret_for_player(self, state, policies, - reach_probabilities, player): - if state.is_terminal(): - return np.asarray(state.returns()) - - if state.is_chance_node(): - state_value = 0.0 - for action, action_prob in state.chance_outcomes(): - assert action_prob > 0 - new_state = state.child(action) - new_reach_probabilities = reach_probabilities.copy() - new_reach_probabilities[-1] *= action_prob - - state_value += action_prob * self._compute_cumulative_immediate_regret_for_player( - new_state, policies, new_reach_probabilities, player) - return state_value - - current_player = state.current_player() - info_state = state.information_state_string(current_player) - - - - # No need to continue on this history branch as no update will be performed - # for any player. - # The value we return here is not used in practice. If the conditional - # statement is True, then the last taken action has probability 0 of - # occurring, so the returned value is not impacting the parent node value. - if all(reach_probabilities[:-1] == 0): - return np.zeros(self._num_players) - - state_value = np.zeros(self._num_players) - - # The utilities of the children states are computed recursively. 
As the - # regrets are added to the information state regrets for each state in that - # information state, the recursive call can only be made once per child - # state. Therefore, the utilities are cached. - children_utilities = {} - - info_state_node = self._info_state_nodes[info_state] - #Reset y values - info_state_node.y_values = defaultdict(float) - if policies is None: - info_state_policy = self._get_infostate_policy(info_state) - else: - info_state_policy = policies[current_player](info_state) + if state.is_terminal(): + return + elif not state.is_chance_node(): + current_player = state.current_player() + info_state = state.information_state_string(current_player) + info_state_node = self._info_state_nodes[info_state] + deviations = info_state_node.relizable_deviations + # print(info_state) + for devation in range(len(deviations)): + # change too infostate + mem_reach_probs = create_probs_from_index( + info_state_node.current_history_probs, current_policy) + deviation_reach_prob = deviations[devation].player_deviation_reach_probability( + mem_reach_probs) + info_state_node.y_values[deviations[devation]] = info_state_node.y_values[deviations[devation]] + max( + 0, info_state_node.cumulative_regret[devation])*deviation_reach_prob + + # Might be incorrect + state_policy = current_policy.policy_for_key(info_state) + # print + for action, value in self._regret_matching(info_state_node.legal_actions, info_state_node).items(): + state_policy[action] = value + + for action in info_state_node.legal_actions: + new_state = state.child(action) + self._update_current_policy(new_state, current_policy) + else: + for action, _ in state.chance_outcomes(): + new_state = state.child(action) + self._update_current_policy(new_state, current_policy) + # Path to state probability ignores chance probabilty as this is stored as new_reach_probabilities[-1] + + def _compute_cumulative_immediate_regret_for_player(self, state, policies, + reach_probabilities, player): + if state.is_terminal(): + return np.asarray(state.returns()) + + if state.is_chance_node(): + state_value = 0.0 + for action, action_prob in state.chance_outcomes(): + assert action_prob > 0 + new_state = state.child(action) + new_reach_probabilities = reach_probabilities.copy() + new_reach_probabilities[-1] *= action_prob + + state_value += action_prob * self._compute_cumulative_immediate_regret_for_player( + new_state, policies, new_reach_probabilities, player) + return state_value + + current_player = state.current_player() + info_state = state.information_state_string(current_player) + + # No need to continue on this history branch as no update will be performed + # for any player. + # The value we return here is not used in practice. If the conditional + # statement is True, then the last taken action has probability 0 of + # occurring, so the returned value is not impacting the parent node value. + if all(reach_probabilities[:-1] == 0): + return np.zeros(self._num_players) + + state_value = np.zeros(self._num_players) + + # The utilities of the children states are computed recursively. As the + # regrets are added to the information state regrets for each state in that + # information state, the recursive call can only be made once per child + # state. Therefore, the utilities are cached. 
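+    # The cached child utilities are then reused further down to evaluate every
+    # realisable deviation at this information state: each deviation's
+    # counterfactual action value is compared against the on-policy state value,
+    # and the difference is weighted by the opponents' counterfactual reach
+    # probability and by the player's memory reach probability for that deviation.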
+ children_utilities = {} + + info_state_node = self._info_state_nodes[info_state] + # Reset y values + info_state_node.y_values = defaultdict(float) + if policies is None: + info_state_policy = self._get_infostate_policy(info_state) + else: + info_state_policy = policies[current_player](info_state) + + reach_prob = reach_probabilities[current_player] + for action in state.legal_actions(): + action_prob = info_state_policy.get(action, 0.) + info_state_node.cumulative_policy[action] = info_state_node.cumulative_policy[action] + \ + action_prob * reach_prob + new_state = state.child(action) + new_reach_probabilities = reach_probabilities.copy() + assert action_prob <= 1 + new_reach_probabilities[current_player] *= action_prob + child_utility = self._compute_cumulative_immediate_regret_for_player( + new_state, policies=policies, reach_probabilities=new_reach_probabilities, player=player) + + state_value += action_prob * child_utility + children_utilities[action] = child_utility + + counterfactual_reach_prob = (np.prod( + reach_probabilities[:current_player]) * np.prod(reach_probabilities[current_player + 1:])) + + state_value_for_player = state_value[current_player] + deviations = info_state_node.relizable_deviations + for deviation_index in range(len(deviations)): + # FIX ADD DICT TO ARRAY CONVERSION FUNCTION + deviation = deviations[deviation_index] + deviation_strategy = deviation.deviate( + strat_dict_to_array(self._get_infostate_policy(info_state))) + + player_child_utilities = np.array(list(children_utilities.values()))[ + :, current_player] + devation_cf_value = np.inner(np.transpose( + deviation_strategy), player_child_utilities) + + memory_reach_probs = create_probs_from_index( + info_state_node.current_history_probs, self.current_policy()) + player_current_memory_reach_prob = deviation.player_deviation_reach_probability( + memory_reach_probs) + + deviation_regret = player_current_memory_reach_prob * \ + ((devation_cf_value*counterfactual_reach_prob) - + (counterfactual_reach_prob * state_value_for_player)) + + info_state_node.cumulative_regret[deviation_index] += deviation_regret + return state_value + + def _get_infostate_policy(self, info_state_str): + """Returns an {action: prob} dictionary for the policy on `info_state`.""" + info_state_node = self._info_state_nodes[info_state_str] + prob_vec = self._current_policy.action_probability_array[ + info_state_node.index_in_tabular_policy] + return { + action: prob_vec[action] for action in info_state_node.legal_actions + } - reach_prob = reach_probabilities[current_player] - for action in state.legal_actions(): - action_prob = info_state_policy.get(action, 0.) 
- info_state_node.cumulative_policy[action] = info_state_node.cumulative_policy[action] + action_prob * reach_prob - new_state = state.child(action) - new_reach_probabilities = reach_probabilities.copy() - assert action_prob <= 1 - new_reach_probabilities[current_player] *= action_prob - child_utility = self._compute_cumulative_immediate_regret_for_player(new_state,policies=policies,reach_probabilities=new_reach_probabilities,player=player) - state_value += action_prob * child_utility - children_utilities[action] = child_utility +def __get_infostate_policy_array(self, info_state_str): + info_state_node = self._info_state_nodes[info_state_str] + return self._current_policy.action_probability_array[ + info_state_node.index_in_tabular_policy] - counterfactual_reach_prob = (np.prod(reach_probabilities[:current_player]) * np.prod(reach_probabilities[current_player + 1:])) - state_value_for_player = state_value[current_player] - deviations = info_state_node.relizable_deviations - for deviationIndex in range(len(deviations)): - #FIX ADD DICT TO ARRAY CONVERSION FUNCTION - deviation = deviations[deviationIndex] - deviation_strategy = deviation.deviate(strat_dict_to_array(self._get_infostate_policy(info_state))) +class _EFRSolver(_EFRSolverBase): + def __init__(self, game, _deviation_gen): + super().__init__(game, _deviation_gen) - player_child_utilities = np.array(list(children_utilities.values()))[:,current_player] - devation_cf_value = np.inner(np.transpose(deviation_strategy), player_child_utilities) + def evaluate_and_update_policy(self): + """Performs a single step of policy evaluation and policy improvement.""" + self._compute_cumulative_immediate_regret_for_player( + self._root_node, + policies=None, + reach_probabilities=np.ones(self._game.num_players() + 1), + player=None) + self._update_current_policy(self._root_node, self._current_policy) + self._iteration += 1 - memory_reach_probs = create_probs_from_index(info_state_node.current_history_probs,self.current_policy()) - player_current_memory_reach_prob = deviation.player_deviation_reach_probability(memory_reach_probs) - - deviation_regret = player_current_memory_reach_prob * ((devation_cf_value*counterfactual_reach_prob) - (counterfactual_reach_prob * state_value_for_player)) - info_state_node.cumulative_regret[deviationIndex] += deviation_regret - return state_value +class EFRSolver(_EFRSolver): + def __init__(self, game, deviations_name): + + # Takes the deviation sets used for learning from Deviation_Sets + external_only = False + deviation_sets = None + + if deviations_name == "blind action": + deviation_sets = return_blind_action + external_only = True + elif deviations_name == "informed action": + deviation_sets = return_informed_action + elif deviations_name == "blind cf" or deviations_name == "blind counterfactual": + deviation_sets = return_blind_CF + external_only = True + elif deviations_name == "informed cf" or deviations_name == "informed counterfactual": + deviation_sets = return_informed_CF + elif deviations_name == "bps" or deviations_name == "blind partial sequence": + deviation_sets = return_blind_partial_sequence + external_only = True + elif deviations_name == "cfps" or deviations_name == "cf partial sequence"\ + or deviations_name == "counterfactual partial sequence": + deviation_sets = return_cf_partial_sequence + elif deviations_name == "csps" or deviations_name == "casual partial sequence": + deviation_sets = return_cs_partial_sequence + elif deviations_name == "tips" or deviations_name == "twice informed partial 
sequence": + deviation_sets = return_twice_informed_partial_sequence + elif deviations_name == "bhv" or deviations_name == "single target behavioural"\ + or deviations_name == "behavioural": + deviation_sets = return_behavourial + else: + print("Unsupported Deviation Set") + return None + super(EFRSolver, self).__init__(game, _deviation_gen=deviation_sets) + self._external_only = external_only + + def _regret_matching(self, legal_actions, info_set_node): + """Returns an info state policy by applying regret-matching function + over all deviations and time selection functions. + Args: + cumulative_regrets: A {deviation: y value} dictionary. + legal_actions: the list of legal actions at this state. + + Returns: + A dict of action -> prob for all legal actions. + """ + z = sum(info_set_node.y_values.values()) + info_state_policy = {} + + # The fixed point solution can be directly obtained through the weighted regret matrix + # if only external deviations are used + if self._external_only and z > 0: + weighted_deviation_matrix = np.zeros( + (len(legal_actions), len(legal_actions))) + for dev in list(info_set_node.y_values.keys()): + weighted_deviation_matrix += ( + info_set_node.y_values[dev]/z) * dev.return_transform_matrix() + new_strategy = weighted_deviation_matrix[:, 0] + for index in range(len(legal_actions)): + info_state_policy[legal_actions[index]] = new_strategy[index] + + # Full regret matching by finding the least squares solution to the fixed point + # Last row of matrix and the column entry ensures the solution is a strategy (otherwise would have to normalise) + elif z > 0: + num_actions = len(info_set_node.legal_actions) + weighted_deviation_matrix = -np.eye(num_actions) + + for dev in list(info_set_node.y_values.keys()): + weighted_deviation_matrix += ( + info_set_node.y_values[dev]/z) * dev.return_transform_matrix() + + normalisation_row = np.ones(num_actions) + weighted_deviation_matrix = np.vstack( + [weighted_deviation_matrix, normalisation_row]) + b = np.zeros(num_actions+1) + b[num_actions] = 1 + b = np.reshape(b, (num_actions+1, 1)) + + strategy = lstsq(weighted_deviation_matrix, b)[0] + + # Adopt same clipping strategy as paper author's code + strategy[np.where(strategy < 0)] = 0 + strategy[np.where(strategy > 1)] = 1 + + strategy = strategy/sum(strategy) + for index in range(len(strategy)): + info_state_policy[info_set_node.legal_actions[index] + ] = strategy[index] + # Use a uniform strategy as sum of all regrets is negative + else: + for index in range(len(legal_actions)): + info_state_policy[legal_actions[index]]\ + = 1.0 / len(legal_actions) + return info_state_policy - def _get_infostate_policy(self, info_state_str): - """Returns an {action: prob} dictionary for the policy on `info_state`.""" - info_state_node = self._info_state_nodes[info_state_str] - prob_vec = self._current_policy.action_probability_array[ - info_state_node.index_in_tabular_policy] - return { - action: prob_vec[action] for action in info_state_node.legal_actions - } -def __get_infostate_policy_array(self, info_state_str): - info_state_node = self._info_state_nodes[info_state_str] - return self._current_policy.action_probability_array[ - info_state_node.index_in_tabular_policy] +def _update_average_policy(average_policy, info_state_nodes): + """Updates in place `average_policy` to the average of all policies iterated. 
-class _EFRSolver(_EFRSolverBase): - def __init__(self, game, _deviation_gen): - super().__init__(game, _deviation_gen) - - def evaluate_and_update_policy(self): - """Performs a single step of policy evaluation and policy improvement.""" - self._compute_cumulative_immediate_regret_for_player( - self._root_node, - policies=None, - reach_probabilities=np.ones(self._game.num_players() + 1), - player=None) - self._update_current_policy(self._root_node, self._current_policy) - self._iteration += 1 + This function is a module level function to be reused by both CFRSolver and + CFRBRSolver. -class EFRSolver(_EFRSolver): - def __init__(self, game, deviations_name): - - #Takes the deviation sets used for learning from Deviation_Sets - external_only = False - deviation_sets = None - - if deviations_name == "blind action": - deviation_sets = return_blind_action - external_only = True - elif deviations_name == "informed action": - deviation_sets = return_informed_action - elif deviations_name == "blind cf" or deviations_name == "blind counterfactual": - deviation_sets = return_blind_CF - external_only = True - elif deviations_name == "informed cf" or deviations_name == "informed counterfactual": - deviation_sets = return_informed_CF - elif deviations_name == "bps" or deviations_name == "blind partial sequence": - deviation_sets = return_blind_partial_sequence - external_only = True - elif deviations_name == "cfps" or deviations_name == "cf partial sequence" or deviations_name == "counterfactual partial sequence": - deviation_sets = return_cf_partial_sequence - elif deviations_name == "csps" or deviations_name == "casual partial sequence": - deviation_sets = return_cs_partial_sequence - elif deviations_name == "tips" or deviations_name == "twice informed partial sequence": - deviation_sets = return_twice_informed_partial_sequence - elif deviations_name == "bhv" or deviations_name == "single target behavioural" or deviations_name =="behavioural": - deviation_sets = return_behavourial - else: - print("Unsupported Deviation Set") - return None - super(EFRSolver, self).__init__(game, _deviation_gen=deviation_sets) - self._external_only = external_only - def _regret_matching(self, legal_actions, info_set_node): - """Returns an info state policy by applying regret-matching. Args: - cumulative_regrets: A {deviation: y value} dictionary. - legal_actions: the list of legal actions at this state. - - Returns: - A dict of action -> prob for all legal actions. + average_policy: A `policy.TabularPolicy` to be updated in-place. + info_state_nodes: A dictionary {`info_state_str` -> `_InfoStateNode`}. 
""" - z = sum(info_set_node.y_values.values()) - info_state_policy = {} - - #The fixed point solution can be directly obtained through the weighted regret matrix if only external deviations are used - if self._external_only and z > 0: - weighted_deviation_matrix = np.zeros((len(legal_actions), len(legal_actions))) - for dev in list(info_set_node.y_values.keys()): - weighted_deviation_matrix += (info_set_node.y_values[dev]/z) * dev.return_transform_matrix() - new_strategy = weighted_deviation_matrix[:,0] - for index in range(len(legal_actions)): - info_state_policy[legal_actions[index]] = new_strategy[index] - - #Full regret matching by finding the least squares solution to the fixed point - #Last row of matrix and the column entry ensures the solution is a strategy (otherwise would have to normalise) - elif z > 0: - num_actions = len(info_set_node.legal_actions) - weighted_deviation_matrix = -np.eye(num_actions) - - for dev in list(info_set_node.y_values.keys()): - weighted_deviation_matrix += (info_set_node.y_values[dev]/z) * dev.return_transform_matrix() - - normalisation_row = np.ones(num_actions) - weighted_deviation_matrix = np.vstack([weighted_deviation_matrix, normalisation_row]) - b = np.zeros(num_actions+1) - b[num_actions] = 1 - b = np.reshape(b, (num_actions+1, 1)) - - strategy = lstsq(weighted_deviation_matrix, b)[0] - - #Adopt same cutting strategy as paper author's code - strategy[np.where(strategy<0)] = 0 - strategy[np.where(strategy>1)] = 1 - - strategy = strategy/sum(strategy) - for index in range(len(strategy)): - info_state_policy[info_set_node.legal_actions[index]] = strategy[index] - #Use a uniform strategy as sum of all regrets is negative - else: - for index in range(len(legal_actions)): - info_state_policy[legal_actions[index]] = 1.0 / len(legal_actions) - return info_state_policy - -def _update_average_policy(average_policy, info_state_nodes): - """Updates in place `average_policy` to the average of all policies iterated. - - This function is a module level function to be reused by both CFRSolver and - CFRBRSolver. - - Args: - average_policy: A `policy.TabularPolicy` to be updated in-place. - info_state_nodes: A dictionary {`info_state_str` -> `_InfoStateNode`}. 
- """ - for info_state, info_state_node in info_state_nodes.items(): - info_state_policies_sum = info_state_node.cumulative_policy - state_policy = average_policy.policy_for_key(info_state) - probabilities_sum = sum(info_state_policies_sum.values()) - if probabilities_sum == 0: - num_actions = len(info_state_node.legal_actions) - for action in info_state_node.legal_actions: - state_policy[action] = 1 / num_actions - else: - for action, action_prob_sum in info_state_policies_sum.items(): - state_policy[action] = action_prob_sum / probabilities_sum + for info_state, info_state_node in info_state_nodes.items(): + info_state_policies_sum = info_state_node.cumulative_policy + state_policy = average_policy.policy_for_key(info_state) + probabilities_sum = sum(info_state_policies_sum.values()) + if probabilities_sum == 0: + num_actions = len(info_state_node.legal_actions) + for action in info_state_node.legal_actions: + state_policy[action] = 1 / num_actions + else: + for action, action_prob_sum in info_state_policies_sum.items(): + state_policy[action] = action_prob_sum / probabilities_sum -def strat_dict_to_array(sd): - actions = list(sd.keys()) - strategy = np.zeros((len(actions),1)) - for action in range(len(actions)): - strategy[action][0] = sd[actions[action]] - return strategy +def strat_dict_to_array(strategy_dictionary): + """ + A helper function to convert the strategy dictionary action -> prob value to an array. + Args: + strategy_dictionary: a dictionary action -> prob value. + Returns: + strategy_array: an array with the ith action's value at the i-1th index. + """ + actions = list(strategy_dictionary.keys()) + strategy_array = np.zeros((len(actions), 1)) + for action in range(len(actions)): + strategy_array[action][0] = strategy_dictionary[actions[action]] + return strategy_array -def array_to_strat_dict(sa, legal_actions): - sd = {} - for action in legal_actions: - sd[action] = sa[action] - return sd +def array_to_strat_dict(strategy_array, legal_actions): + """ + A helper function to convert a strategy array to an action -> prob value dictionary. + Args: + strategy_array: an array with the ith action's value at the i-1th index. + legal_actions: the list of all legal actions at the current state. + Returns: + strategy_dictionary: a dictionary action -> prob value. + """ + strategy_dictionary = {} + for action in legal_actions: + strategy_dictionary[action] = strategy_array[action] + return strategy_dictionary def create_probs_from_index(indices, current_policy): - path_to_state = [] - if indices == None or len(indices) == 0: - return [] - for index in indices: - strat_dict = array_to_strat_dict(current_policy.action_probability_array[index[1]], index[0]) - path_to_state.append(strat_dict) - return path_to_state + path_to_state = [] + if indices is None or len(indices) == 0: + return [] + for index in indices: + strat_dict = array_to_strat_dict( + current_policy.action_probability_array[index[1]], index[0]) + path_to_state.append(strat_dict) + return path_to_state -#Deviation set definitions +# Deviation set definitions def return_blind_action(num_actions, history, _): """ Returns an array of all Blind Action deviations with respect to an information set. 
@@ -408,12 +462,15 @@ def return_blind_action(num_actions, history, _): num_actions: the integer of all actions that can be taken at that information set history: an array containing the prior Returns: - an array of LocalDeviationWithTimeSelection objects that represent all Blind Action deviations that are realizable at the + an array of LocalDeviationWithTimeSelection objects that represent all Blind Action deviations + that are realizable at the information set. """ memory_weights = [np.full(len(history), 1)] prior_actions_in_memory = history - return return_all_external_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + return return_all_external_deviations(num_actions, memory_weights, + prior_actions_in_memory, history) + def return_informed_action(num_actions, history, _): """ @@ -429,6 +486,7 @@ def return_informed_action(num_actions, history, _): prior_actions_in_memory = history return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + def return_blind_CF(num_actions, history, _): """ Returns an array of all Blind Counterfactual deviations with respect to an information set. @@ -438,31 +496,34 @@ def return_blind_CF(num_actions, history, _): num_actions: the integer of all actions that can be taken at that information set history: an array containing the prior Returns: - an array of LocalDeviationWithTimeSelection objects that represent all Blind CF deviations that are realizable at the - information set. + an array of LocalDeviationWithTimeSelection objects that represent all Blind CF deviations + that are realizable at the information set. """ memory_weights = [None] prior_actions_in_memory = np.zeros(len(history)) return return_all_external_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + def return_informed_CF(num_actions, history, _): memory_weights = [None] prior_actions_in_memory = history return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + def return_blind_partial_sequence(num_actions, history, _): """ - Returns an array of all Blind Partial Sequence deviations (BPS) with respect to an information set + Returns an array of all Blind Partial Sequence deviations (BPS) + with respect to an information set Args: num_actions: the integer of all actions that can be taken at that information set history: an array containing the prior Returns: - an array of LocalDeviationWithTimeSelection objects that represent all BPS deviations that are realizable at the - information set. + an array of LocalDeviationWithTimeSelection objects that represent all BPS deviations + that are realizable at the information set. 
""" prior_actions_in_memory = history memory_weights = [None] - if len(history)>0: + if len(history) > 0: memory_weights.append(np.ones(len(history))) for i in range(len(history)): possible_memory_weight = np.zeros(len(history)) @@ -470,19 +531,21 @@ def return_blind_partial_sequence(num_actions, history, _): memory_weights.append(possible_memory_weight) return return_all_external_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + def return_cf_partial_sequence(num_actions, history, _): """ - Returns an array of all Counterfactual Partial Sequence deviations (CFPS) with respect to an information set + Returns an array of all Counterfactual Partial Sequence deviations (CFPS) + with respect to an information set Args: num_actions: the integer of all actions that can be taken at that information set history: an array containing the prior Returns: - an array of LocalDeviationWithTimeSelection objects that represent all CFPS deviations that are realizable at the - information set. + an array of LocalDeviationWithTimeSelection objects that represent all CFPS deviations + that are realizable at the information set. """ prior_actions_in_memory = history memory_weights = [None] - if len(history)>0: + if len(history) > 0: memory_weights.append(np.ones(len(history))) for i in range(len(history)): possible_memory_weight = np.zeros(len(history)) @@ -490,6 +553,7 @@ def return_cf_partial_sequence(num_actions, history, _): memory_weights.append(possible_memory_weight) return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + def return_cs_partial_sequence(num_actions, history, prior_legal_actions): """ Returns an array of all Casual Partial Sequence deviations with respect to an information set. @@ -498,7 +562,8 @@ def return_cs_partial_sequence(num_actions, history, prior_legal_actions): history: an array containing the prior prior_legal_actions: an array containing the index in .... that Returns: - an array of LocalDeviationWithTimeSelection objects that represent all Casual Partial Sequence deviations that are realizable at the + an array of LocalDeviationWithTimeSelection objects that represent all + Casual Partial Sequence deviations that are realizable at the information set. """ prior_actions_in_memory = history @@ -509,7 +574,8 @@ def return_cs_partial_sequence(num_actions, history, prior_legal_actions): possible_memory_weight[0:i] = np.full(i, 1.0) external_memory_weights.append(possible_memory_weight) - external = return_all_external_modified_deviations(num_actions, external_memory_weights, prior_legal_actions,prior_actions_in_memory, history) + external = return_all_external_modified_deviations( + num_actions, external_memory_weights, prior_legal_actions, prior_actions_in_memory, history) internal = return_blind_action(num_actions, history, None) cf_ext = return_informed_CF(num_actions, history, None) @@ -517,6 +583,7 @@ def return_cs_partial_sequence(num_actions, history, prior_legal_actions): return np.concatenate((external, internal, cf_ext, cf_int)) + def return_cs_partial_sequence_orginal(num_actions, history, prior_legal_actions): """ Returns an array of all Casual Partial Sequence deviations with respect to an information set. @@ -525,8 +592,8 @@ def return_cs_partial_sequence_orginal(num_actions, history, prior_legal_actions history: an array containing the prior prior_legal_actions: an array containing the index in .... 
that Returns: - an array of LocalDeviationWithTimeSelection objects that represent all Casual Partial Sequence deviations that are realizable at the - information set. + an array of LocalDeviationWithTimeSelection objects that represent all + Casual Partial Sequence deviations that are realizable at the information set. """ prior_actions_in_memory = history external_memory_weights = [None] @@ -536,15 +603,18 @@ def return_cs_partial_sequence_orginal(num_actions, history, prior_legal_actions possible_memory_weight[0:i] = np.full(i, 1.0) external_memory_weights.append(possible_memory_weight) - external = return_all_external_modified_deviations(num_actions, external_memory_weights, prior_legal_actions,prior_actions_in_memory, history) + external = return_all_external_modified_deviations( + num_actions, external_memory_weights, prior_legal_actions, prior_actions_in_memory, history) internal = return_informed_action(num_actions, history, None) cf_ext = return_informed_CF(num_actions, history, None) return np.concatenate((external, internal, cf_ext)) + def return_twice_informed_partial_sequence(num_actions, history, prior_legal_actions): """ - Returns an array of all Twice Informed Partial Sequence (TIPS) deviations with respect to an information set. + Returns an array of all Twice Informed Partial Sequence (TIPS) deviations + with respect to an information set. Args: num_actions: the integer of all actions that can be taken at that information set history: an array containing the prior @@ -561,11 +631,13 @@ def return_twice_informed_partial_sequence(num_actions, history, prior_legal_act possible_memory_weight[0:i] = np.full(i, 1.0) memory_weights.append(possible_memory_weight) - internal = return_all_internal_modified_deviations(num_actions, memory_weights, prior_legal_actions, prior_actions_in_memory, history) + internal = return_all_internal_modified_deviations( + num_actions, memory_weights, prior_legal_actions, prior_actions_in_memory, history) cf_int = return_informed_CF(num_actions, history, None) return np.concatenate((internal, cf_int)) + def generate_all_action_permutations(current_stem, remaining_actions): if len(remaining_actions) == 0: return [np.array(current_stem)] @@ -576,27 +648,35 @@ def generate_all_action_permutations(current_stem, remaining_actions): next_stem = current_stem.copy() next_stem.append(action) next_remaining_actions = remaining_actions[1:] - prev_permutations = generate_all_action_permutations(next_stem ,next_remaining_actions) + prev_permutations = generate_all_action_permutations( + next_stem, next_remaining_actions) for i in prev_permutations: permutations.append(i) return permutations -#Includes identity +# Includes identity + + def return_behavourial(num_actions, history, prior_legal_actions): deviations = [] if len(history) == 0: - internal = return_all_non_identity_internal_deviations(num_actions,[None], [None], history) + internal = return_all_non_identity_internal_deviations( + num_actions, [None], [None], history) for i in internal: deviations.append(i) else: for deviation_info in range(len(history)): - prior_possible_memory_actions = generate_all_action_permutations([],prior_legal_actions[:deviation_info+1]) - memory_weights = np.concatenate((np.ones(deviation_info), np.zeros(len(history) - deviation_info))) + prior_possible_memory_actions = generate_all_action_permutations( + [], prior_legal_actions[:deviation_info+1]) + memory_weights = np.concatenate( + (np.ones(deviation_info), np.zeros(len(history) - deviation_info))) for prior_memory_actions in 
prior_possible_memory_actions: - prior_memory_actions = np.concatenate((prior_memory_actions, np.zeros(len(history) - len(prior_memory_actions)))) + prior_memory_actions = np.concatenate( + (prior_memory_actions, np.zeros(len(history) - len(prior_memory_actions)))) for i in range(len(history) - len(prior_memory_actions)): prior_memory_actions.append(0) prior_memory_actions_cp = prior_memory_actions.copy() - internal = return_all_non_identity_internal_deviations(num_actions, [memory_weights], prior_memory_actions_cp, prior_memory_actions_cp) + internal = return_all_non_identity_internal_deviations( + num_actions, [memory_weights], prior_memory_actions_cp, prior_memory_actions_cp) for i in internal: deviations.append(i) @@ -604,17 +684,18 @@ def return_behavourial(num_actions, history, prior_legal_actions): class LocalDeviationWithTimeSelection(object): - localSwapTransform = attr.ib() + local_swap_transform = attr.ib() - #Which actions have been forgotten (0) or remembered (1) according to the memory state + # Which actions have been forgotten (0) or remembered (1) according to the memory state prior_actions_weight = attr.ib() - #Which actions have been take according to the memory state + # Which actions have been take according to the memory state prior_memory_actions = attr.ib() use_unmodified_history = attr.ib() - - def __init__(self, target, source, num_actions, prior_actions_weight, prior_memory_actions, is_external, use_unmodified_history = True): + + def __init__(self, target, source, num_actions, prior_actions_weight, prior_memory_actions, + is_external, use_unmodified_history=True): """" Args: target: the action that will be played when the deviation is triggered @@ -624,148 +705,175 @@ def __init__(self, target, source, num_actions, prior_actions_weight, prior_memo is_external: a boolean use to determine whether to create an internal or external type deviation use_unmodified_history: """ - self.localSwapTransform = LocalSwapTransform(target, source, num_actions, is_external = is_external) + self.local_swap_transform = LocalSwapTransform( + target, source, num_actions, is_external=is_external) self.prior_actions_weight = prior_actions_weight self.prior_memory_actions = prior_memory_actions self.use_unmodified_history = use_unmodified_history - #If a pure strategy, a pure strategy will be returned (aka function works for both actions and strategies as input) - def deviate(self,strategy): - return self.localSwapTransform.deviate(strategy) + # If a pure strategy, a pure strategy will be returned (aka function works for both actions and strategies as input) + def deviate(self, strategy): + return self.local_swap_transform.deviate(strategy) + def return_transform_matrix(self): - return self.localSwapTransform.matrix_transform + return self.local_swap_transform.matrix_transform + def player_deviation_reach_probability(self, prior_possible_action_probabilities): - try: - if self.prior_actions_weight == None or self.prior_memory_actions == None or prior_possible_action_probabilities: - return 1.0 - except: - return 1.0 - + if self.prior_actions_weight is None or self.prior_memory_actions is None or prior_possible_action_probabilities is None: + return 1.0 + memory_action_probabilities = np.ones(len(self.prior_actions_weight)) - #Reconstruct memory probabilities from history provided to the deviation to reach info set and the current memory probs + # Reconstruct memory probabilities from history provided to the deviation to reach info set and the current memory probs memory_weightings = 
self.prior_actions_weight.copy() if self.use_unmodified_history: for state in range(len(self.prior_memory_actions)): if not self.prior_actions_weight[state] == 0: - memory_action_probabilities[state] = (prior_possible_action_probabilities[state][self.prior_memory_actions[state]]) + memory_action_probabilities[state] = ( + prior_possible_action_probabilities[state][self.prior_memory_actions[state]]) else: memory_action_probabilities[state] = 1 memory_weightings[state] = 1 - path_probability = np.multiply(memory_weightings, memory_action_probabilities) + path_probability = np.multiply( + memory_weightings, memory_action_probabilities) memory_reach_probability = np.prod(path_probability) return memory_reach_probability - + def __eq__(self, other): - if self.localSwapTransform == other.localSwapTransform: + if self.local_swap_transform == other.local_swap_transform: return True else: return False + def __hash__(self): - return hash(self.localSwapTransform) + return hash(self.local_swap_transform) + +# Methods to return all + -#Methods to return all def return_all_non_identity_internal_deviations(num_actions, possible_prior_weights, prior_memory_actions, _): deviations = [] for prior_actions_weight in possible_prior_weights: for target in range(num_actions): for source in range(num_actions): if not source == target: - deviations.append(LocalDeviationWithTimeSelection(target, source, num_actions, prior_actions_weight, prior_memory_actions, False)) + deviations.append(LocalDeviationWithTimeSelection( + target, source, num_actions, prior_actions_weight, prior_memory_actions, False)) return deviations -#EXCLUDES IDENTITY +# EXCLUDES IDENTITY + + def return_all_internal_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions, _): deviations = [] for prior_actions_weight in possible_prior_weights: try: - modificationIndex = np.where(prior_actions_weight == 0)[0][0] - except: - modificationIndex = 0 - if modificationIndex == len(prior_memory_actions): + modification_index = np.where(prior_actions_weight == 0)[0][0] + except IndexError: + modification_index = 0 + if modification_index == len(prior_memory_actions): for target in range(num_actions): for source in range(num_actions): if not source == target: - deviations.append(LocalDeviationWithTimeSelection(target, source, num_actions, prior_actions_weight, prior_memory_actions, False)) + deviations.append(LocalDeviationWithTimeSelection( + target, source, num_actions, prior_actions_weight, prior_memory_actions, False)) else: - previous_action = prior_memory_actions[modificationIndex] - for alt_action in possible_prior_memory_actions[modificationIndex]: - prior_memory_actions[modificationIndex] = alt_action + previous_action = prior_memory_actions[modification_index] + for alt_action in possible_prior_memory_actions[modification_index]: + prior_memory_actions[modification_index] = alt_action for target in range(num_actions): for source in range(num_actions): if not source == target: - deviations.append(LocalDeviationWithTimeSelection(target, source, num_actions, prior_actions_weight, prior_memory_actions.copy(), False)) - prior_memory_actions[modificationIndex] = previous_action + deviations.append(LocalDeviationWithTimeSelection( + target, source, num_actions, prior_actions_weight, prior_memory_actions.copy(), False)) + prior_memory_actions[modification_index] = previous_action return deviations + def return_all_external_deviations(num_actions, possible_prior_weights, prior_memory_actions, _): 
deviations = [] for prior_actions_weight in possible_prior_weights: for target in range(num_actions): - deviations.append(LocalDeviationWithTimeSelection(target, target, num_actions, prior_actions_weight, prior_memory_actions, True)) + deviations.append(LocalDeviationWithTimeSelection( + target, target, num_actions, prior_actions_weight, prior_memory_actions, True)) return deviations -#Modify last action as required +# Modify last action as required + + def return_all_external_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions, _): deviations = [] for prior_actions_weight in possible_prior_weights: try: - modificationIndex = np.where(prior_actions_weight == 0)[0][0] - except: - modificationIndex = 0 - if modificationIndex == len(prior_memory_actions): + modification_index = np.where(prior_actions_weight == 0)[0][0] + except IndexError: + modification_index = 0 + if modification_index == len(prior_memory_actions): for target in range(num_actions): - deviations.append(LocalDeviationWithTimeSelection(target, target, num_actions, prior_actions_weight, prior_memory_actions, True)) + deviations.append(LocalDeviationWithTimeSelection( + target, target, num_actions, prior_actions_weight, prior_memory_actions, True)) else: - previous_action = prior_memory_actions[modificationIndex] - for alt_action in possible_prior_memory_actions[modificationIndex]: - prior_memory_actions[modificationIndex] = alt_action + previous_action = prior_memory_actions[modification_index] + for alt_action in possible_prior_memory_actions[modification_index]: + prior_memory_actions[modification_index] = alt_action for target in range(num_actions): - deviations.append(LocalDeviationWithTimeSelection(target, target, num_actions, prior_actions_weight, prior_memory_actions.copy(), True)) - prior_memory_actions[modificationIndex] = previous_action + deviations.append(LocalDeviationWithTimeSelection( + target, target, num_actions, prior_actions_weight, prior_memory_actions.copy(), True)) + prior_memory_actions[modification_index] = previous_action return deviations + def return_identity_deviation(num_actions, possible_prior_weights, prior_memory_actions, _): deviations = [] for prior_actions_weight in possible_prior_weights: - deviations.append(LocalDeviationWithTimeSelection(0, 0, num_actions, prior_actions_weight, prior_memory_actions, False)) + deviations.append(LocalDeviationWithTimeSelection( + 0, 0, num_actions, prior_actions_weight, prior_memory_actions, False)) return deviations -#A swap transformation given by the matrix_transform for an information state of +# A swap transformation given by the matrix_transform for an information state of class LocalSwapTransform(object): - sourceAction = attr.ib() - targetAction = attr.ib() + """ + TODO + """ + source_action = attr.ib() + target_action = attr.ib() matrix_transform = attr.ib() - actionsNum = attr.ib() + actions_num = attr.ib() is_external = attr.ib() - - def __init__(self, target,source,actionsNum, is_external = True): - self.sourceAction = source - self.targetAction = target - self.actionsNum = actionsNum - #A + + def __init__(self, target, source, actions_num, is_external=True): + self.source_action = source + self.target_action = target + self.actions_num = actions_num if is_external: - self.sourceAction = None - self.matrix_transform = np.zeros((actionsNum,actionsNum)) - self.matrix_transform[target] = np.ones(actionsNum) + self.source_action = None + self.matrix_transform = np.zeros((actions_num, 
actions_num)) + self.matrix_transform[target] = np.ones(actions_num) else: - self.matrix_transform = np.eye(actionsNum) + self.matrix_transform = np.eye(actions_num) self.matrix_transform[target][source] = 1 self.matrix_transform[source][source] = 0 + def __repr__(self) -> str: - return "Shifting probabilty from Action: "+str(self.sourceAction) +" to Action: "+str(self.targetAction) + return "Shifting probabilty from Action: "+str(self.source_action) + " to Action: "+str(self.target_action) + def __eq__(self, __o: object) -> bool: - if self.sourceAction == __o.sourceAction and self.targetAction == __o.targetAction and self.actionsNum == __o.actionsNum: + if self.source_action == __o.source_action and self.target_action == __o.target_action and self.actions_num == __o.actions_num: return True else: return False + def __hash__(self): separator = " " - return hash(str(self.sourceAction)+separator+str(self.targetAction)+separator+str(self.actionsNum)+ separator +str(self.is_external)) - #If a pure strategy, a pure strategy will be returned (aka function works for both actions and strategies as input) - def deviate(self,strategy): + return hash(str(self.source_action)+separator+str(self.target_action)+separator+str(self.actions_num) + separator + str(self.is_external)) + + # If a pure strategy, a pure strategy will be returned (aka function works for both actions and strategies as input) + def deviate(self, strategy): """ Returns the deviation strategy + Args: + strategy: the strategy array to multiply the deviation matrix by. + Returns: """ return np.matmul(self.matrix_transform, strategy) From c17ae89f6d4a74b07f201555d600017f44566d8a Mon Sep 17 00:00:00 2001 From: James Flynn Date: Tue, 19 Sep 2023 22:17:05 +0100 Subject: [PATCH 06/18] Added initial tests --- open_spiel/python/algorithms/efr.py | 11 ++-- open_spiel/python/algorithms/efr_test.py | 66 +++++------------------- 2 files changed, 19 insertions(+), 58 deletions(-) diff --git a/open_spiel/python/algorithms/efr.py b/open_spiel/python/algorithms/efr.py index 59a325d069..b066011484 100644 --- a/open_spiel/python/algorithms/efr.py +++ b/open_spiel/python/algorithms/efr.py @@ -69,7 +69,7 @@ def __init__(self, game, _deviation_gen): self._info_state_nodes = {} hist = {player: [] for player in range(self._num_players)} - unif_probs = [[] for _ in range(self._num_players)], + unif_probs = [[] for _ in range(self._num_players)] empty_path_indices = [[] for _ in range(self._num_players)] self._initialize_info_state_nodes( self._root_node, hist, unif_probs, empty_path_indices) @@ -121,8 +121,7 @@ def _initialize_info_state_nodes(self, state, history, uniform_probs_to_state, p legal_actions = state.legal_actions(current_player) new_uniform_probs_to_state = copy.deepcopy(uniform_probs_to_state) - assert len(new_uniform_probs_to_state[current_player]) == len( - history[current_player]) + assert len(new_uniform_probs_to_state[current_player]) == len(history[current_player]) new_uniform_probs_to_state[current_player].append( {legal_actions[i]: 1/len(legal_actions) for i in range(len(legal_actions))}) @@ -719,11 +718,11 @@ def return_transform_matrix(self): return self.local_swap_transform.matrix_transform def player_deviation_reach_probability(self, prior_possible_action_probabilities): - if self.prior_actions_weight is None or self.prior_memory_actions is None or prior_possible_action_probabilities is None: - return 1.0 + if self.prior_actions_weight is None or self.prior_memory_actions is None or prior_possible_action_probabilities is None: + 
return 1.0 memory_action_probabilities = np.ones(len(self.prior_actions_weight)) - # Reconstruct memory probabilities from history provided to the deviation to reach info set and the current memory probs + # Reconstruct memory probabilities from history provided to the deviation to reach info set and the current memory probs memory_weightings = self.prior_actions_weight.copy() if self.use_unmodified_history: for state in range(len(self.prior_memory_actions)): diff --git a/open_spiel/python/algorithms/efr_test.py b/open_spiel/python/algorithms/efr_test.py index 8cfa3a7628..fb77a01932 100644 --- a/open_spiel/python/algorithms/efr_test.py +++ b/open_spiel/python/algorithms/efr_test.py @@ -21,7 +21,7 @@ import numpy as np from open_spiel.python import policy -from open_spiel.python.algorithms import efr +import efr from open_spiel.python.algorithms import expected_game_score from open_spiel.python.algorithms import exploitability import pyspiel @@ -31,62 +31,21 @@ _KUHN_UNIFORM_POLICY = policy.TabularPolicy(_KUHN_GAME) _LEDUC_UNIFORM_POLICY = policy.TabularPolicy(_LEDUC_GAME) - - -class ModuleLevelFunctionTest(absltest.TestCase): - - def test__update_current_policy(self): - game = pyspiel.load_game("kuhn_poker") - tabular_policy = policy.TabularPolicy(game) - - cumulative_regrets = np.arange(0, 12 * 2).reshape((12, 2)) - expected_policy = cumulative_regrets / np.sum( - cumulative_regrets, axis=-1, keepdims=True) - nodes_indices = { - u"0": 0, - u"0pb": 1, - u"1": 2, - u"1pb": 3, - u"2": 4, - u"2pb": 5, - u"1p": 6, - u"1b": 7, - u"2p": 8, - u"2b": 9, - u"0p": 10, - u"0b": 11, - } - # pylint: disable=g-complex-comprehension - info_state_nodes = { - key: efr._InfoStateNode( - legal_actions=[0, 1], - index_in_tabular_policy=None, - cumulative_regret=dict(enumerate(cumulative_regrets[index])), - cumulative_policy=None) for key, index in nodes_indices.items() - } - available_deviations = ["blind action", "informed action", "blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"] - - # pylint: enable=g-complex-comprehension - - efr._update_current_policy(tabular_policy, info_state_nodes) - - np.testing.assert_array_equal(expected_policy, - tabular_policy.action_probability_array) - +_DEVIATIONS_ = ["blind action", "informed action", "blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"] class EFRTest(parameterized.TestCase, absltest.TestCase): - @parameterized.parameters( - ["blind action", "informed action", "blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"]) - def test_policy_zero_is_uniform(self): + @parameterized.parameters(_DEVIATIONS_) + def test_policy_zero_is_uniform(self, deviations_name): # We use Leduc and not Kuhn, because Leduc has illegal actions and Kuhn does # not. 
game = pyspiel.load_game("leduc_poker") - cfr_solver = efr._EFRSolver( - game, + cfr_solver = efr.EFRSolver( + game=game, deviations_name=deviations_name ) - + print(cfr_solver.current_policy().action_probability_array) + print(_LEDUC_UNIFORM_POLICY.action_probability_array) np.testing.assert_array_equal( _LEDUC_UNIFORM_POLICY.action_probability_array, cfr_solver.current_policy().action_probability_array) @@ -96,9 +55,12 @@ def test_policy_zero_is_uniform(self): @parameterized.parameters( ["blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"]) - def test_cfr_kuhn_poker(self): + def test_cfr_kuhn_poker(self, deviations_name): game = pyspiel.load_game("kuhn_poker") - efr_solver = efr.EFRSolver(game) + efr_solver = efr.EFRSolver( + game=game, + deviations_name=deviations_name + ) for _ in range(300): efr_solver.evaluate_and_update_policy() average_policy = efr_solver.average_policy() @@ -106,7 +68,7 @@ def test_cfr_kuhn_poker(self): game.new_initial_state(), [average_policy] * 2) # 1/18 is the Nash value. See https://en.wikipedia.org/wiki/Kuhn_poker np.testing.assert_allclose( - average_policy_values, [-1 / 18, 1 / 18], atol=1e-3) + average_policy_values, [-1 / 18, 1 / 18,0], atol=1e-3) if __name__ == "__main__": absltest.main() From 1ce44e724e9f46b652ab131d96e2015354dbe232 Mon Sep 17 00:00:00 2001 From: James Flynn Date: Tue, 19 Sep 2023 22:45:27 +0100 Subject: [PATCH 07/18] Removed print statements --- open_spiel/python/algorithms/efr_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/open_spiel/python/algorithms/efr_test.py b/open_spiel/python/algorithms/efr_test.py index fb77a01932..a76db1125e 100644 --- a/open_spiel/python/algorithms/efr_test.py +++ b/open_spiel/python/algorithms/efr_test.py @@ -44,8 +44,6 @@ def test_policy_zero_is_uniform(self, deviations_name): game=game, deviations_name=deviations_name ) - print(cfr_solver.current_policy().action_probability_array) - print(_LEDUC_UNIFORM_POLICY.action_probability_array) np.testing.assert_array_equal( _LEDUC_UNIFORM_POLICY.action_probability_array, cfr_solver.current_policy().action_probability_array) @@ -68,7 +66,7 @@ def test_cfr_kuhn_poker(self, deviations_name): game.new_initial_state(), [average_policy] * 2) # 1/18 is the Nash value. 
See https://en.wikipedia.org/wiki/Kuhn_poker np.testing.assert_allclose( - average_policy_values, [-1 / 18, 1 / 18,0], atol=1e-3) + average_policy_values, [-1 / 18, 1 / 18], atol=1e-3) if __name__ == "__main__": absltest.main() From 36f302b99281a90709285aa1479905bab88c6dcb Mon Sep 17 00:00:00 2001 From: James Flynn Date: Tue, 19 Sep 2023 22:45:39 +0100 Subject: [PATCH 08/18] Removed print statements --- open_spiel/python/algorithms/efr_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/open_spiel/python/algorithms/efr_test.py b/open_spiel/python/algorithms/efr_test.py index a76db1125e..766998d050 100644 --- a/open_spiel/python/algorithms/efr_test.py +++ b/open_spiel/python/algorithms/efr_test.py @@ -21,7 +21,7 @@ import numpy as np from open_spiel.python import policy -import efr +from open_spiel.python.algorithms import efr from open_spiel.python.algorithms import expected_game_score from open_spiel.python.algorithms import exploitability import pyspiel From f185dbabc8c90affd8a832b7f5c3048e9f64a4a8 Mon Sep 17 00:00:00 2001 From: James Flynn Date: Wed, 20 Sep 2023 02:16:26 +0100 Subject: [PATCH 09/18] More comments --- open_spiel/python/algorithms/efr.py | 111 +++++++++++++++++++++++++--- 1 file changed, 100 insertions(+), 11 deletions(-) diff --git a/open_spiel/python/algorithms/efr.py b/open_spiel/python/algorithms/efr.py index b066011484..bca0c43f0b 100644 --- a/open_spiel/python/algorithms/efr.py +++ b/open_spiel/python/algorithms/efr.py @@ -55,8 +55,31 @@ class _InfoStateNode(object): class _EFRSolverBase(object): - def __init__(self, game, _deviation_gen): - assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, () + """The base EFR solver class + + The main iteration loop is implemented in `evaluate_and_update_policy`: + ```python + game = pyspiel.load_game("game_name") + initial_state = game.new_initial_state() + solver = Solver(game) + for i in range(num_iterations): + solver.evaluate_and_update_policy() + solver.current_policy() # Access the current policy + solver.average_policy() # Access the average policy + ``` + """ + def __init__(self, game, deviation_gen): + """Initializer. + Args: + game: The `pyspiel.Game` to run on. + deviation_gen: a function that accepts (num_actions : int, history : , prior_legal_actions) and returns a list containing `LocalDeviationWithTimeSelection` objects of the + the realisable deviations of a described type (e.g blind causal deviations) and given the information state described by the function parameters. + """ + # pyformat: enable + assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, ( + "EFR requires sequential games. If you're trying to run it " + + "on a simultaneous (or normal-form) game, please first transform it " + + "using turn_based_simultaneous_game.") self._game = game self._num_players = game.num_players() @@ -65,7 +88,7 @@ def __init__(self, game, _deviation_gen): # This is for returning the current policy and average policy to a caller self._current_policy = policy.TabularPolicy(game) self._average_policy = self._current_policy.__copy__() - self._deviation_gen = _deviation_gen + self._deviation_gen = deviation_gen self._info_state_nodes = {} hist = {player: [] for player in range(self._num_players)} @@ -77,16 +100,52 @@ def __init__(self, game, _deviation_gen): self._iteration = 1 # For possible linear-averaging. 
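As a companion to the usage snippet in the class docstring above, the following sketch runs the solver end to end on Kuhn poker. It is illustrative only and assumes the `efr.EFRSolver` constructor arguments (`game`, `deviations_name`) and the deviation names exercised in `efr_test.py`; the expected values come from the Nash-value check in that test.

```python
# Minimal sketch, assuming the efr.EFRSolver interface used in efr_test.py
# (constructor arguments `game` and `deviations_name`, plus the
# evaluate_and_update_policy / average_policy methods documented above).
import pyspiel
from open_spiel.python.algorithms import efr
from open_spiel.python.algorithms import expected_game_score

game = pyspiel.load_game("kuhn_poker")
solver = efr.EFRSolver(game=game, deviations_name="blind cf")

for _ in range(300):
  # One EFR iteration: accumulate regrets under the current strategy, then
  # regret-match over the chosen deviation set to obtain the next strategy.
  solver.evaluate_and_update_policy()

# The average policy (not the current one) carries the convergence guarantee
# for the chosen deviation type.
average_policy = solver.average_policy()
values = expected_game_score.policy_value(
    game.new_initial_state(), [average_policy] * 2)
print(values)  # Should approach [-1/18, 1/18], the Nash value of Kuhn poker.
```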
def return_cumulative_regret(self): - return {list(self._info_state_nodes.keys())[i]: list(self._info_state_nodes.values())[i].cumulative_regret for i in range(len(self._info_state_nodes.keys()))} + """Returns a dictionary mapping every information state to its associated regret (accumulated over all iterations). + """ + return {list(self._info_state_nodes.keys())[i]: list(self._info_state_nodes.values())[i].cumulative_regret + for i in range(len(self._info_state_nodes.keys()))} def current_policy(self): + """Returns the current policy as a TabularPolicy. + + WARNING: The same object, updated in-place will be returned! You can copy + it (or its `action_probability_array` field). + + For EFR, this policy does not necessarily have to converge. + """ return self._current_policy def average_policy(self): + """Returns the average of all policies iterated. + WARNING: The same object, updated in-place will be returned! You can copy + it (or its `action_probability_array` field). + + This average policy converges to a equilibrium policy as the number of iterations + increases (equilibrium type depends on learning deviations used). + + The policy is computed using the accumulated policy probabilities computed + using `evaluate_and_update_policy`. + + Returns: + A `policy.TabularPolicy` object (shared between calls) giving the (linear) + time averaged policy (weighted by player reach probabilities) for all + players. + """ _update_average_policy(self._average_policy, self._info_state_nodes) return self._average_policy def _initialize_info_state_nodes(self, state, history, uniform_probs_to_state, path_indices): + """Initializes info_state_nodes. + Create one _InfoStateNode per infoset. We could also initialize the node + when we try to access it and it does not exist. [todo] + + Args: + state: The current state in the tree walk. This should be the root node + when we call this function from the EFR solver. + history: [todo] + uniform_probs_to_state: [todo] + path_indices: [todo] + """ if state.is_terminal(): return @@ -126,11 +185,9 @@ def _initialize_info_state_nodes(self, state, history, uniform_probs_to_state, p new_uniform_probs_to_state[current_player].append( {legal_actions[i]: 1/len(legal_actions) for i in range(len(legal_actions))}) for action in info_state_node.legal_actions: - # Speedup new_path_indices = copy.deepcopy(path_indices) new_path_indices[current_player].append( [legal_actions, info_state_node.index_in_tabular_policy]) - # Speedup new_history = copy.deepcopy(history) new_history[current_player].append(action) assert len(new_history[current_player]) == len( @@ -141,6 +198,10 @@ def _initialize_info_state_nodes(self, state, history, uniform_probs_to_state, p def _update_current_policy(self, state, current_policy): """Updated in order so that memory reach probs are defined wrt to the new strategy + + Args: + state: [todo] + current_policy: [todo] """ if state.is_terminal(): @@ -177,6 +238,24 @@ def _update_current_policy(self, state, current_policy): def _compute_cumulative_immediate_regret_for_player(self, state, policies, reach_probabilities, player): + """Increments the cumulative regrets and policy for `player`. [todo] + Args: + state: The initial game state to analyze from. + policies: A list of `num_players` callables taking as input an + `info_state_node` and returning a {action: prob} dictionary. For CFR, + this is simply returning the current policy, but this can be used in + the CFR-BR solver, to prevent code duplication. If None, + `_get_infostate_policy` is used. 
+ reach_probabilities: The probability for each player of reaching `state` + as a numpy array [prob for player 0, for player 1,..., for chance]. [todo] + `player_reach_probabilities[player]` will work in all cases. + player: The 0-indexed player to update the values for. If `None`, the + update for all players will be performed. + + Returns: + The utility of `state` for all players, assuming all players follow the + current policy defined by `self.Policy`. + """ if state.is_terminal(): return np.asarray(state.returns()) @@ -240,7 +319,6 @@ def _compute_cumulative_immediate_regret_for_player(self, state, policies, state_value_for_player = state_value[current_player] deviations = info_state_node.relizable_deviations for deviation_index in range(len(deviations)): - # FIX ADD DICT TO ARRAY CONVERSION FUNCTION deviation = deviations[deviation_index] deviation_strategy = deviation.deviate( strat_dict_to_array(self._get_infostate_policy(info_state))) @@ -295,8 +373,20 @@ def evaluate_and_update_policy(self): class EFRSolver(_EFRSolver): def __init__(self, game, deviations_name): + """Initializer. + Args: + game: The `pyspiel.Game` to run on. + deviation_name: the name of the deviation type to use for accumulating regrets and calculating the strategy at the next timestep. + + Deviation types implemented are "blind action", "informed action", "blind cf", + "informed counterfactual", "blind partial sequence", "counterfactual partial sequence", + "casual partial sequence", "twice informed partial sequence", "single target behavioural". + See "Efficient Deviation Types and Learning for Hindsight Rationality in Extensive-Form Games" by D. Morrill et al. 2021b + for the full definition of each type. + + """ - # Takes the deviation sets used for learning from Deviation_Sets + #external_only = True leads to a shortcut in the external_only = False deviation_sets = None @@ -324,8 +414,7 @@ def __init__(self, game, deviations_name): or deviations_name == "behavioural": deviation_sets = return_behavourial else: - print("Unsupported Deviation Set") - return None + raise(ValueError("Unsupported Deviation Set Passed As Constructor Argument")) super(EFRSolver, self).__init__(game, _deviation_gen=deviation_sets) self._external_only = external_only @@ -854,7 +943,7 @@ def __init__(self, target, source, actions_num, is_external=True): self.matrix_transform[source][source] = 0 def __repr__(self) -> str: - return "Shifting probabilty from Action: "+str(self.source_action) + " to Action: "+str(self.target_action) + return "Diverting from Action: "+str(self.source_action) + " to Action: "+str(self.target_action) def __eq__(self, __o: object) -> bool: if self.source_action == __o.source_action and self.target_action == __o.target_action and self.actions_num == __o.actions_num: From 2c47a50a2a05f781941ccd278eaafff6381d2430 Mon Sep 17 00:00:00 2001 From: James Flynn Date: Wed, 20 Sep 2023 20:55:48 +0100 Subject: [PATCH 10/18] Added paper reference --- open_spiel/python/algorithms/efr.py | 59 +++++++++++++++++++---------- 1 file changed, 39 insertions(+), 20 deletions(-) diff --git a/open_spiel/python/algorithms/efr.py b/open_spiel/python/algorithms/efr.py index bca0c43f0b..c75cbb62bc 100644 --- a/open_spiel/python/algorithms/efr.py +++ b/open_spiel/python/algorithms/efr.py @@ -13,14 +13,19 @@ # limitations under the License. 
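The next-strategy computation described later in this file (`_regret_matching`) solves a least-squares fixed point over the weighted deviation matrices when internal deviations are in play. The self-contained sketch below reproduces that computation with two hand-built matrices in the `LocalSwapTransform` style (an external deviation puts all mass on a target action, an internal deviation reroutes mass from a source action to a target action); the y-values are invented purely for illustration.

```python
# Standalone sketch of the least-squares fixed point used by _regret_matching.
import numpy as np
from scipy.linalg import lstsq

num_actions = 3

# Internal deviation: play action 1 whenever action 0 would have been played.
internal = np.eye(num_actions)
internal[1][0] = 1
internal[0][0] = 0

# External deviation: always play action 2.
external = np.zeros((num_actions, num_actions))
external[2] = np.ones(num_actions)

y_values = {0: 2.0, 1: 1.0}  # hypothetical positive-regret weights
transforms = {0: internal, 1: external}
z = sum(y_values.values())

# Build (sum_d (y_d / z) * A_d) - I and append a row of ones so that the
# least-squares solution is a probability distribution.
weighted = -np.eye(num_actions)
for dev, y in y_values.items():
  weighted += (y / z) * transforms[dev]
weighted = np.vstack([weighted, np.ones(num_actions)])

b = np.zeros((num_actions + 1, 1))
b[num_actions] = 1

strategy = lstsq(weighted, b)[0]
# Clip and renormalise, mirroring the clipping strategy used in the solver.
strategy = np.clip(strategy, 0, 1)
strategy = strategy / strategy.sum()
print(strategy.ravel())  # -> approximately [0, 0, 1]
```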
# Modified: 2023 James Flynn # Original: https://github.com/deepmind/open_spiel/blob/master/open_spiel/python/algorithms/cfr.py + """Python implementation of the extensive-form regret minimization algorithm. +See: "Efficient Deviation Types and Learning for Hindsight Rationality in Extensive-Form Games", +Morrill et al. 2021b, +https://arxiv.org/abs/2102.06973 + One iteration of EFR consists of: 1) Compute current strategy from regrets (e.g. using Regret Matching). 2) Compute values using the current strategy 3) Compute regrets from these values -The average policy converges to a Nash Equilibrium rather than the current policy as in CFR. +The average policy converges to a Nash Equilibrium rather than the current policy. """ import copy from collections import defaultdict @@ -785,13 +790,19 @@ class LocalDeviationWithTimeSelection(object): def __init__(self, target, source, num_actions, prior_actions_weight, prior_memory_actions, is_external, use_unmodified_history=True): """" + Represents a swap transformation (both external and internal) for a given memory state. Args: - target: the action that will be played when the deviation is triggered - source: the action that will trigger the target action if (used only by internal deviations, i.e is_external = False) - num_actions: the integer of actions - prior_actions_weight: - is_external: a boolean use to determine whether to create an internal or external type deviation - use_unmodified_history: + target: the action that will be played when the deviation is triggered. + source: the action that will trigger the target action if (used only by internal deviations, i.e is_external = False). + num_actions: the integer of actions that can be played for this information state + prior_actions_weight: an array the length of the history of the information state + actions have been forgotten (0) or remembered (1) according to the memory state. + This is represented numerically for possible experimentation with partially forgotten + actions (i.e in the range (0,1)). + prior_memory_actions: the preceeding actions upto the the information state + (which the LocalDeviationWithTimeSelection is defined with respect to). + is_external: a boolean use to determine whether this is an internal or external type deviation. + use_unmodified_history: a boolean used to """ self.local_swap_transform = LocalSwapTransform( target, source, num_actions, is_external=is_external) @@ -801,9 +812,16 @@ def __init__(self, target, source, num_actions, prior_actions_weight, prior_memo # If a pure strategy, a pure strategy will be returned (aka function works for both actions and strategies as input) def deviate(self, strategy): + """ + Args: + + """ return self.local_swap_transform.deviate(strategy) def return_transform_matrix(self): + """ + Returns the matrix_transform of the associated `LocalSwapTransform` object. 
+ """ return self.local_swap_transform.matrix_transform def player_deviation_reach_probability(self, prior_possible_action_probabilities): @@ -821,6 +839,8 @@ def player_deviation_reach_probability(self, prior_possible_action_probabilities else: memory_action_probabilities[state] = 1 memory_weightings[state] = 1 + + path_probability = np.multiply( memory_weightings, memory_action_probabilities) memory_reach_probability = np.prod(path_probability) @@ -835,9 +855,6 @@ def __eq__(self, other): def __hash__(self): return hash(self.local_swap_transform) -# Methods to return all - - def return_all_non_identity_internal_deviations(num_actions, possible_prior_weights, prior_memory_actions, _): deviations = [] for prior_actions_weight in possible_prior_weights: @@ -848,9 +865,6 @@ def return_all_non_identity_internal_deviations(num_actions, possible_prior_weig target, source, num_actions, prior_actions_weight, prior_memory_actions, False)) return deviations -# EXCLUDES IDENTITY - - def return_all_internal_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions, _): deviations = [] for prior_actions_weight in possible_prior_weights: @@ -886,8 +900,6 @@ def return_all_external_deviations(num_actions, possible_prior_weights, prior_m return deviations # Modify last action as required - - def return_all_external_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions, _): deviations = [] for prior_actions_weight in possible_prior_weights: @@ -921,7 +933,7 @@ def return_identity_deviation(num_actions, possible_prior_weights, prior_memory # A swap transformation given by the matrix_transform for an information state of class LocalSwapTransform(object): """ - TODO + Represents a swap transformation (both external and internal) for an information state for a certain number of actions. """ source_action = attr.ib() target_action = attr.ib() @@ -930,6 +942,14 @@ class LocalSwapTransform(object): is_external = attr.ib() def __init__(self, target, source, actions_num, is_external=True): + """" + Creates the matrix transformation that describes the transformation and initalises the other variables. + Args: + target: the action that will be played when the deviation is triggered + source: the action that will trigger the target action if (used only by internal deviations, i.e is_external = False) + num_actions: the integer of actions that can be played for this information state + is_external: a boolean used to determine whether to create an internal or external type deviation. + """ self.source_action = source self.target_action = target self.actions_num = actions_num @@ -955,13 +975,12 @@ def __hash__(self): separator = " " return hash(str(self.source_action)+separator+str(self.target_action)+separator+str(self.actions_num) + separator + str(self.is_external)) - # If a pure strategy, a pure strategy will be returned (aka function works for both actions and strategies as input) def deviate(self, strategy): """ - Returns the deviation strategy + Returns the strategy array given by deviating according to 'self.matrix_transform' matrix. Args: - strategy: the strategy array to multiply the deviation matrix by. + strategy: the strategy array to deviate from. Returns: - + the matrix product of the the matrix_transform and the provided strategy. 
""" return np.matmul(self.matrix_transform, strategy) From 423beebd6d3064fa266bf6a300addc5ba4ed5390 Mon Sep 17 00:00:00 2001 From: James Flynn Date: Thu, 21 Sep 2023 01:47:26 +0100 Subject: [PATCH 11/18] Linting changes and removed unused vars --- open_spiel/python/algorithms/efr.py | 1736 +++++++++++----------- open_spiel/python/algorithms/efr_test.py | 7 +- 2 files changed, 880 insertions(+), 863 deletions(-) diff --git a/open_spiel/python/algorithms/efr.py b/open_spiel/python/algorithms/efr.py index c75cbb62bc..ce4cfa0805 100644 --- a/open_spiel/python/algorithms/efr.py +++ b/open_spiel/python/algorithms/efr.py @@ -40,947 +40,967 @@ @attr.s class _InfoStateNode(object): - """An object wrapping values associated to an information state.""" - # The list of the legal actions. - legal_actions = attr.ib() - index_in_tabular_policy = attr.ib() - # The newly availible deviations + the old ones - relizable_deviations = attr.ib() - # Player -> state -> action -> prob - current_history_probs = attr.ib() + """An object wrapping values associated to an information state.""" + # The list of the legal actions. + legal_actions = attr.ib() + index_in_tabular_policy = attr.ib() + # The newly availible deviations + the old ones + relizable_deviations = attr.ib() + # Player -> state -> action -> prob + current_history_probs = attr.ib() - # An array representing - history = attr.ib() + # An array representing the preceeding actions played upto this information state + history = attr.ib() - cumulative_regret = attr.ib(factory=lambda: defaultdict(float)) - # Same as above for the cumulative of the policy probabilities computed - # during the policy iterations - cumulative_policy = attr.ib(factory=lambda: defaultdict(float)) - y_values = attr.ib(factory=lambda: defaultdict(float)) + cumulative_regret = attr.ib(factory=lambda: defaultdict(float)) + #The sum of all prior iteration's policies + cumulative_policy = attr.ib(factory=lambda: defaultdict(float)) + + #A dictionary mapping each deviation to their "y values" for the current iteration + y_values = attr.ib(factory=lambda: defaultdict(float)) class _EFRSolverBase(object): - """The base EFR solver class - - The main iteration loop is implemented in `evaluate_and_update_policy`: - ```python - game = pyspiel.load_game("game_name") - initial_state = game.new_initial_state() - solver = Solver(game) - for i in range(num_iterations): - solver.evaluate_and_update_policy() - solver.current_policy() # Access the current policy - solver.average_policy() # Access the average policy - ``` - """ - def __init__(self, game, deviation_gen): - """Initializer. - Args: - game: The `pyspiel.Game` to run on. - deviation_gen: a function that accepts (num_actions : int, history : , prior_legal_actions) and returns a list containing `LocalDeviationWithTimeSelection` objects of the - the realisable deviations of a described type (e.g blind causal deviations) and given the information state described by the function parameters. - """ - # pyformat: enable - assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, ( - "EFR requires sequential games. 
If you're trying to run it " + - "on a simultaneous (or normal-form) game, please first transform it " + - "using turn_based_simultaneous_game.") - - self._game = game - self._num_players = game.num_players() - self._root_node = self._game.new_initial_state() - - # This is for returning the current policy and average policy to a caller - self._current_policy = policy.TabularPolicy(game) - self._average_policy = self._current_policy.__copy__() - self._deviation_gen = deviation_gen - - self._info_state_nodes = {} - hist = {player: [] for player in range(self._num_players)} - unif_probs = [[] for _ in range(self._num_players)] - empty_path_indices = [[] for _ in range(self._num_players)] - self._initialize_info_state_nodes( - self._root_node, hist, unif_probs, empty_path_indices) - - self._iteration = 1 # For possible linear-averaging. - - def return_cumulative_regret(self): - """Returns a dictionary mapping every information state to its associated regret (accumulated over all iterations). - """ - return {list(self._info_state_nodes.keys())[i]: list(self._info_state_nodes.values())[i].cumulative_regret - for i in range(len(self._info_state_nodes.keys()))} - - def current_policy(self): - """Returns the current policy as a TabularPolicy. - - WARNING: The same object, updated in-place will be returned! You can copy - it (or its `action_probability_array` field). - - For EFR, this policy does not necessarily have to converge. - """ - return self._current_policy - - def average_policy(self): - """Returns the average of all policies iterated. - WARNING: The same object, updated in-place will be returned! You can copy - it (or its `action_probability_array` field). - - This average policy converges to a equilibrium policy as the number of iterations - increases (equilibrium type depends on learning deviations used). - - The policy is computed using the accumulated policy probabilities computed - using `evaluate_and_update_policy`. - - Returns: - A `policy.TabularPolicy` object (shared between calls) giving the (linear) - time averaged policy (weighted by player reach probabilities) for all - players. - """ - _update_average_policy(self._average_policy, self._info_state_nodes) - return self._average_policy - - def _initialize_info_state_nodes(self, state, history, uniform_probs_to_state, path_indices): - """Initializes info_state_nodes. - Create one _InfoStateNode per infoset. We could also initialize the node - when we try to access it and it does not exist. [todo] - - Args: - state: The current state in the tree walk. This should be the root node - when we call this function from the EFR solver. 
- history: [todo] - uniform_probs_to_state: [todo] - path_indices: [todo] - """ - if state.is_terminal(): - return - - if state.is_chance_node(): - for action, unused_action_prob in state.chance_outcomes(): - self._initialize_info_state_nodes(state.child( - action), history, uniform_probs_to_state, path_indices) - return - - current_player = state.current_player() - info_state = state.information_state_string(current_player) - info_state_node = self._info_state_nodes.get(info_state) - if info_state_node is None: - legal_actions = state.legal_actions(current_player) - info_state_node = _InfoStateNode( - legal_actions=legal_actions, - index_in_tabular_policy=self._current_policy.state_lookup[info_state], - relizable_deviations=None, - history=history[current_player].copy(), - current_history_probs=copy.deepcopy( - path_indices[current_player]) - ) - prior_possible_actions = [] - for i in range(len(info_state_node.current_history_probs)): - prior_possible_actions.append( - info_state_node.current_history_probs[i][0]) - prior_possible_actions.append(info_state_node.legal_actions) - - info_state_node.relizable_deviations = self._deviation_gen(len( - info_state_node.legal_actions), info_state_node.history, prior_possible_actions) - self._info_state_nodes[info_state] = info_state_node - - legal_actions = state.legal_actions(current_player) - new_uniform_probs_to_state = copy.deepcopy(uniform_probs_to_state) - assert len(new_uniform_probs_to_state[current_player]) == len(history[current_player]) - - new_uniform_probs_to_state[current_player].append( - {legal_actions[i]: 1/len(legal_actions) for i in range(len(legal_actions))}) - for action in info_state_node.legal_actions: - new_path_indices = copy.deepcopy(path_indices) - new_path_indices[current_player].append( - [legal_actions, info_state_node.index_in_tabular_policy]) - new_history = copy.deepcopy(history) - new_history[current_player].append(action) - assert len(new_history[current_player]) == len( - new_path_indices[current_player]) - - self._initialize_info_state_nodes(state.child( - action), new_history, new_uniform_probs_to_state, new_path_indices) - - def _update_current_policy(self, state, current_policy): - """Updated in order so that memory reach probs are defined wrt to the new strategy - - Args: - state: [todo] - current_policy: [todo] - """ - - if state.is_terminal(): - return - elif not state.is_chance_node(): - current_player = state.current_player() - info_state = state.information_state_string(current_player) - info_state_node = self._info_state_nodes[info_state] - deviations = info_state_node.relizable_deviations - # print(info_state) - for devation in range(len(deviations)): - # change too infostate - mem_reach_probs = create_probs_from_index( - info_state_node.current_history_probs, current_policy) - deviation_reach_prob = deviations[devation].player_deviation_reach_probability( - mem_reach_probs) - info_state_node.y_values[deviations[devation]] = info_state_node.y_values[deviations[devation]] + max( - 0, info_state_node.cumulative_regret[devation])*deviation_reach_prob - - # Might be incorrect - state_policy = current_policy.policy_for_key(info_state) - # print - for action, value in self._regret_matching(info_state_node.legal_actions, info_state_node).items(): - state_policy[action] = value - - for action in info_state_node.legal_actions: - new_state = state.child(action) - self._update_current_policy(new_state, current_policy) - else: - for action, _ in state.chance_outcomes(): - new_state = state.child(action) - 
self._update_current_policy(new_state, current_policy) - # Path to state probability ignores chance probabilty as this is stored as new_reach_probabilities[-1] - - def _compute_cumulative_immediate_regret_for_player(self, state, policies, - reach_probabilities, player): - """Increments the cumulative regrets and policy for `player`. [todo] - Args: - state: The initial game state to analyze from. - policies: A list of `num_players` callables taking as input an - `info_state_node` and returning a {action: prob} dictionary. For CFR, - this is simply returning the current policy, but this can be used in - the CFR-BR solver, to prevent code duplication. If None, - `_get_infostate_policy` is used. - reach_probabilities: The probability for each player of reaching `state` - as a numpy array [prob for player 0, for player 1,..., for chance]. [todo] - `player_reach_probabilities[player]` will work in all cases. - player: The 0-indexed player to update the values for. If `None`, the - update for all players will be performed. - - Returns: - The utility of `state` for all players, assuming all players follow the - current policy defined by `self.Policy`. - """ - if state.is_terminal(): - return np.asarray(state.returns()) - - if state.is_chance_node(): - state_value = 0.0 - for action, action_prob in state.chance_outcomes(): - assert action_prob > 0 - new_state = state.child(action) - new_reach_probabilities = reach_probabilities.copy() - new_reach_probabilities[-1] *= action_prob - - state_value += action_prob * self._compute_cumulative_immediate_regret_for_player( - new_state, policies, new_reach_probabilities, player) - return state_value - - current_player = state.current_player() - info_state = state.information_state_string(current_player) - - # No need to continue on this history branch as no update will be performed - # for any player. - # The value we return here is not used in practice. If the conditional - # statement is True, then the last taken action has probability 0 of - # occurring, so the returned value is not impacting the parent node value. - if all(reach_probabilities[:-1] == 0): - return np.zeros(self._num_players) - - state_value = np.zeros(self._num_players) - - # The utilities of the children states are computed recursively. As the - # regrets are added to the information state regrets for each state in that - # information state, the recursive call can only be made once per child - # state. Therefore, the utilities are cached. - children_utilities = {} - - info_state_node = self._info_state_nodes[info_state] - # Reset y values - info_state_node.y_values = defaultdict(float) - if policies is None: - info_state_policy = self._get_infostate_policy(info_state) - else: - info_state_policy = policies[current_player](info_state) - - reach_prob = reach_probabilities[current_player] - for action in state.legal_actions(): - action_prob = info_state_policy.get(action, 0.) 
- info_state_node.cumulative_policy[action] = info_state_node.cumulative_policy[action] + \ - action_prob * reach_prob - new_state = state.child(action) - new_reach_probabilities = reach_probabilities.copy() - assert action_prob <= 1 - new_reach_probabilities[current_player] *= action_prob - child_utility = self._compute_cumulative_immediate_regret_for_player( - new_state, policies=policies, reach_probabilities=new_reach_probabilities, player=player) - - state_value += action_prob * child_utility - children_utilities[action] = child_utility - - counterfactual_reach_prob = (np.prod( - reach_probabilities[:current_player]) * np.prod(reach_probabilities[current_player + 1:])) - - state_value_for_player = state_value[current_player] - deviations = info_state_node.relizable_deviations - for deviation_index in range(len(deviations)): - deviation = deviations[deviation_index] - deviation_strategy = deviation.deviate( - strat_dict_to_array(self._get_infostate_policy(info_state))) - - player_child_utilities = np.array(list(children_utilities.values()))[ - :, current_player] - devation_cf_value = np.inner(np.transpose( - deviation_strategy), player_child_utilities) - - memory_reach_probs = create_probs_from_index( - info_state_node.current_history_probs, self.current_policy()) - player_current_memory_reach_prob = deviation.player_deviation_reach_probability( - memory_reach_probs) - - deviation_regret = player_current_memory_reach_prob * \ - ((devation_cf_value*counterfactual_reach_prob) - - (counterfactual_reach_prob * state_value_for_player)) - - info_state_node.cumulative_regret[deviation_index] += deviation_regret - return state_value - - def _get_infostate_policy(self, info_state_str): - """Returns an {action: prob} dictionary for the policy on `info_state`.""" - info_state_node = self._info_state_nodes[info_state_str] - prob_vec = self._current_policy.action_probability_array[ - info_state_node.index_in_tabular_policy] - return { - action: prob_vec[action] for action in info_state_node.legal_actions - } - - -def __get_infostate_policy_array(self, info_state_str): - info_state_node = self._info_state_nodes[info_state_str] - return self._current_policy.action_probability_array[ - info_state_node.index_in_tabular_policy] + """The base EFR solver class + + The main iteration loop is implemented in `evaluate_and_update_policy`: + ```python + game = pyspiel.load_game("game_name") + initial_state = game.new_initial_state() + solver = Solver(game) + for i in range(num_iterations): + solver.evaluate_and_update_policy() + solver.current_policy() # Access the current policy + solver.average_policy() # Access the average policy + ``` + """ + def __init__(self, game, deviation_gen): + """Initializer. + Args: + game: The `pyspiel.Game` to run on. + deviation_gen: a function that accepts (num_actions : int, history : , prior_legal_actions) and returns a list containing `LocalDeviationWithTimeSelection` objects of the + the realisable deviations of a described type (e.g blind causal deviations) and given the information state described by the function parameters. + """ + # pyformat: enable + assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, ( + "EFR requires sequential games. 
If you're trying to run it " + + "on a simultaneous (or normal-form) game, please first transform it " + + "using turn_based_simultaneous_game.") + self._game = game + self._num_players = game.num_players() + self._root_node = self._game.new_initial_state() -class _EFRSolver(_EFRSolverBase): - def __init__(self, game, _deviation_gen): - super().__init__(game, _deviation_gen) + # This is for returning the current policy and average policy to a caller + self._current_policy = policy.TabularPolicy(game) + self._average_policy = self._current_policy.__copy__() + self._deviation_gen = deviation_gen - def evaluate_and_update_policy(self): - """Performs a single step of policy evaluation and policy improvement.""" - self._compute_cumulative_immediate_regret_for_player( - self._root_node, - policies=None, - reach_probabilities=np.ones(self._game.num_players() + 1), - player=None) - self._update_current_policy(self._root_node, self._current_policy) - self._iteration += 1 + self._info_state_nodes = {} + hist = {player: [] for player in range(self._num_players)} + empty_path_indices = [[] for _ in range(self._num_players)] + self._initialize_info_state_nodes(self._root_node, hist, empty_path_indices) -class EFRSolver(_EFRSolver): - def __init__(self, game, deviations_name): - """Initializer. - Args: - game: The `pyspiel.Game` to run on. - deviation_name: the name of the deviation type to use for accumulating regrets and calculating the strategy at the next timestep. - - Deviation types implemented are "blind action", "informed action", "blind cf", - "informed counterfactual", "blind partial sequence", "counterfactual partial sequence", - "casual partial sequence", "twice informed partial sequence", "single target behavioural". - See "Efficient Deviation Types and Learning for Hindsight Rationality in Extensive-Form Games" by D. Morrill et al. 2021b - for the full definition of each type. 
- - """ - - #external_only = True leads to a shortcut in the - external_only = False - deviation_sets = None - - if deviations_name == "blind action": - deviation_sets = return_blind_action - external_only = True - elif deviations_name == "informed action": - deviation_sets = return_informed_action - elif deviations_name == "blind cf" or deviations_name == "blind counterfactual": - deviation_sets = return_blind_CF - external_only = True - elif deviations_name == "informed cf" or deviations_name == "informed counterfactual": - deviation_sets = return_informed_CF - elif deviations_name == "bps" or deviations_name == "blind partial sequence": - deviation_sets = return_blind_partial_sequence - external_only = True - elif deviations_name == "cfps" or deviations_name == "cf partial sequence"\ - or deviations_name == "counterfactual partial sequence": - deviation_sets = return_cf_partial_sequence - elif deviations_name == "csps" or deviations_name == "casual partial sequence": - deviation_sets = return_cs_partial_sequence - elif deviations_name == "tips" or deviations_name == "twice informed partial sequence": - deviation_sets = return_twice_informed_partial_sequence - elif deviations_name == "bhv" or deviations_name == "single target behavioural"\ - or deviations_name == "behavioural": - deviation_sets = return_behavourial - else: - raise(ValueError("Unsupported Deviation Set Passed As Constructor Argument")) - super(EFRSolver, self).__init__(game, _deviation_gen=deviation_sets) - self._external_only = external_only - - def _regret_matching(self, legal_actions, info_set_node): - """Returns an info state policy by applying regret-matching function - over all deviations and time selection functions. - Args: - cumulative_regrets: A {deviation: y value} dictionary. - legal_actions: the list of legal actions at this state. - - Returns: - A dict of action -> prob for all legal actions. 
- """ - z = sum(info_set_node.y_values.values()) - info_state_policy = {} - - # The fixed point solution can be directly obtained through the weighted regret matrix - # if only external deviations are used - if self._external_only and z > 0: - weighted_deviation_matrix = np.zeros( - (len(legal_actions), len(legal_actions))) - for dev in list(info_set_node.y_values.keys()): - weighted_deviation_matrix += ( - info_set_node.y_values[dev]/z) * dev.return_transform_matrix() - new_strategy = weighted_deviation_matrix[:, 0] - for index in range(len(legal_actions)): - info_state_policy[legal_actions[index]] = new_strategy[index] - - # Full regret matching by finding the least squares solution to the fixed point - # Last row of matrix and the column entry ensures the solution is a strategy (otherwise would have to normalise) - elif z > 0: - num_actions = len(info_set_node.legal_actions) - weighted_deviation_matrix = -np.eye(num_actions) - - for dev in list(info_set_node.y_values.keys()): - weighted_deviation_matrix += ( - info_set_node.y_values[dev]/z) * dev.return_transform_matrix() - - normalisation_row = np.ones(num_actions) - weighted_deviation_matrix = np.vstack( - [weighted_deviation_matrix, normalisation_row]) - b = np.zeros(num_actions+1) - b[num_actions] = 1 - b = np.reshape(b, (num_actions+1, 1)) - - strategy = lstsq(weighted_deviation_matrix, b)[0] - - # Adopt same clipping strategy as paper author's code - strategy[np.where(strategy < 0)] = 0 - strategy[np.where(strategy > 1)] = 1 - - strategy = strategy/sum(strategy) - for index in range(len(strategy)): - info_state_policy[info_set_node.legal_actions[index] - ] = strategy[index] - # Use a uniform strategy as sum of all regrets is negative - else: - for index in range(len(legal_actions)): - info_state_policy[legal_actions[index]]\ - = 1.0 / len(legal_actions) - return info_state_policy + self._iteration = 1 # For possible linear-averaging. + def return_cumulative_regret(self): + """Returns a dictionary mapping every information state to its associated regret (accumulated over all iterations). + """ + return {list(self._info_state_nodes.keys())[i]: list(self._info_state_nodes.values())[i].cumulative_regret + for i in range(len(self._info_state_nodes.keys()))} -def _update_average_policy(average_policy, info_state_nodes): - """Updates in place `average_policy` to the average of all policies iterated. + def current_policy(self): + """Returns the current policy as a TabularPolicy. - This function is a module level function to be reused by both CFRSolver and - CFRBRSolver. + WARNING: The same object, updated in-place will be returned! You can copy + it (or its `action_probability_array` field). - Args: - average_policy: A `policy.TabularPolicy` to be updated in-place. - info_state_nodes: A dictionary {`info_state_str` -> `_InfoStateNode`}. + For EFR, this policy does not necessarily have to converge. 
""" - for info_state, info_state_node in info_state_nodes.items(): - info_state_policies_sum = info_state_node.cumulative_policy - state_policy = average_policy.policy_for_key(info_state) - probabilities_sum = sum(info_state_policies_sum.values()) - if probabilities_sum == 0: - num_actions = len(info_state_node.legal_actions) - for action in info_state_node.legal_actions: - state_policy[action] = 1 / num_actions - else: - for action, action_prob_sum in info_state_policies_sum.items(): - state_policy[action] = action_prob_sum / probabilities_sum + return self._current_policy + def average_policy(self): + """Returns the average of all policies iterated. + WARNING: The same object, updated in-place will be returned! You can copy + it (or its `action_probability_array` field). + + This average policy converges to a equilibrium policy as the number of iterations + increases (equilibrium type depends on learning deviations used). + + The policy is computed using the accumulated policy probabilities computed + using `evaluate_and_update_policy`. -def strat_dict_to_array(strategy_dictionary): - """ - A helper function to convert the strategy dictionary action -> prob value to an array. - Args: - strategy_dictionary: a dictionary action -> prob value. Returns: - strategy_array: an array with the ith action's value at the i-1th index. + A `policy.TabularPolicy` object (shared between calls) giving the (linear) + time averaged policy (weighted by player reach probabilities) for all + players. """ - actions = list(strategy_dictionary.keys()) - strategy_array = np.zeros((len(actions), 1)) - for action in range(len(actions)): - strategy_array[action][0] = strategy_dictionary[actions[action]] - return strategy_array + _update_average_policy(self._average_policy, self._info_state_nodes) + return self._average_policy + def _initialize_info_state_nodes(self, state, history, path_indices): + """Initializes info_state_nodes. + Create one _InfoStateNode per infoset. We could also initialize the node + when we try to access it and it does not exist. -def array_to_strat_dict(strategy_array, legal_actions): + Generates all deviations that are realisable at this state and stores + the history and preceeding state policy information to create memory states + and calculate the memory reach probability for each deviation. + + Args: + state: The current state in the tree traversal. This should be the root node + when we call this function from the EFR solver. + history: an arrays of the preceeding actions taken prior to the state for each player. + path_indices: a 3d array [player number]x[preceeding state]x[legal actions for state, + index of the policy for this state in TabularPolicy]. """ - A helper function to convert a strategy array to an action -> prob value dictionary. 
+ if state.is_terminal(): + return + + if state.is_chance_node(): + for action, unused_action_prob in state.chance_outcomes(): + self._initialize_info_state_nodes(state.child( + action), history, path_indices) + return + + current_player = state.current_player() + info_state = state.information_state_string(current_player) + info_state_node = self._info_state_nodes.get(info_state) + if info_state_node is None: + legal_actions = state.legal_actions(current_player) + info_state_node = _InfoStateNode( + legal_actions=legal_actions, + index_in_tabular_policy=self._current_policy.state_lookup[info_state], + relizable_deviations=None, + history=history[current_player].copy(), + current_history_probs=copy.deepcopy( + path_indices[current_player]) + ) + prior_possible_actions = [] + for i in range(len(info_state_node.current_history_probs)): + prior_possible_actions.append( + info_state_node.current_history_probs[i][0]) + prior_possible_actions.append(info_state_node.legal_actions) + + info_state_node.relizable_deviations = self._deviation_gen(len( + info_state_node.legal_actions), info_state_node.history, prior_possible_actions) + self._info_state_nodes[info_state] = info_state_node + + legal_actions = state.legal_actions(current_player) + + for action in info_state_node.legal_actions: + new_path_indices = copy.deepcopy(path_indices) + new_path_indices[current_player].append( + [legal_actions, info_state_node.index_in_tabular_policy]) + new_history = copy.deepcopy(history) + new_history[current_player].append(action) + assert len(new_history[current_player]) == len(new_path_indices[current_player]) + + self._initialize_info_state_nodes(state.child(action), new_history, new_path_indices) + + def _update_current_policy(self, state, current_policy): + """Updated in order so that memory reach probs are defined wrt to the new strategy + Note that the function is called recursively (first call should be the root). Additionally, + to update the strategy for a given state we require the (t+1)th strategy for all prior states. + Args: - strategy_array: an array with the ith action's value at the i-1th index. - legal_actions: the list of all legal actions at the current state. + state: the state of which to update the strategy. + current_policy: the (t+1)th strategy that is being recursively computed, see the function + description for more detail. 
+ """ + + if state.is_terminal(): + return + elif not state.is_chance_node(): + current_player = state.current_player() + info_state = state.information_state_string(current_player) + info_state_node = self._info_state_nodes[info_state] + deviations = info_state_node.relizable_deviations + for devation in range(len(deviations)): + mem_reach_probs = create_probs_from_index( + info_state_node.current_history_probs, current_policy) + deviation_reach_prob = deviations[devation].player_deviation_reach_probability( + mem_reach_probs) + info_state_node.y_values[deviations[devation]] = info_state_node.y_values[deviations[devation]] + max( + 0, info_state_node.cumulative_regret[devation])*deviation_reach_prob + + state_policy = current_policy.policy_for_key(info_state) + for action, value in self._regret_matching(info_state_node.legal_actions, info_state_node).items(): + state_policy[action] = value + + for action in info_state_node.legal_actions: + new_state = state.child(action) + self._update_current_policy(new_state, current_policy) + else: + for action, _ in state.chance_outcomes(): + new_state = state.child(action) + self._update_current_policy(new_state, current_policy) + + # Path to state probability ignores chance probabilty as this is stored as new_reach_probabilities[-1] + def _compute_cumulative_immediate_regret_for_player(self, state, policies, + reach_probabilities, player): + """Increments the immediate regrets and policy for `player` of + all realisable deviations at this state. + Args: + state: The initial game state to analyze from. + policies: A list of `num_players` callables taking as input an + `info_state_node` and returning a {action: prob} dictionary. + reach_probabilities: The probability for each player of reaching `state` + as a numpy array [prob for player 0, for player 1,..., for chance]. + `reach_probabilities[player]` will work in all cases. + player: The 0-indexed player to update the values for. If `None`, the + update for all players will be performed. + Returns: - strategy_dictionary: a dictionary action -> prob value. + The utility of `state` for all players, assuming all players follow the + current policy defined by `self.Policy`. """ - strategy_dictionary = {} - for action in legal_actions: - strategy_dictionary[action] = strategy_array[action] - return strategy_dictionary + if state.is_terminal(): + return np.asarray(state.returns()) + + if state.is_chance_node(): + state_value = 0.0 + for action, action_prob in state.chance_outcomes(): + assert action_prob > 0 + new_state = state.child(action) + new_reach_probabilities = reach_probabilities.copy() + new_reach_probabilities[-1] *= action_prob + + state_value += action_prob * self._compute_cumulative_immediate_regret_for_player( + new_state, policies, new_reach_probabilities, player) + return state_value + + current_player = state.current_player() + info_state = state.information_state_string(current_player) + + # No need to continue on this history branch as no update will be performed + # for any player. + # The value we return here is not used in practice. If the conditional + # statement is True, then the last taken action has probability 0 of + # occurring, so the returned value is not impacting the parent node value. + if all(reach_probabilities[:-1] == 0): + return np.zeros(self._num_players) + + state_value = np.zeros(self._num_players) + + # The utilities of the children states are computed recursively. 
As the + # regrets are added to the information state regrets for each state in that + # information state, the recursive call can only be made once per child + # state. Therefore, the utilities are cached. + children_utilities = {} + + info_state_node = self._info_state_nodes[info_state] + # Reset y values + info_state_node.y_values = defaultdict(float) + if policies is None: + info_state_policy = self._get_infostate_policy(info_state) + else: + info_state_policy = policies[current_player](info_state) + + reach_prob = reach_probabilities[current_player] + for action in state.legal_actions(): + action_prob = info_state_policy.get(action, 0.) + info_state_node.cumulative_policy[action] = info_state_node.cumulative_policy[action] + \ + action_prob * reach_prob + new_state = state.child(action) + new_reach_probabilities = reach_probabilities.copy() + assert action_prob <= 1 + new_reach_probabilities[current_player] *= action_prob + child_utility = self._compute_cumulative_immediate_regret_for_player( + new_state, policies=policies, reach_probabilities=new_reach_probabilities, player=player) + + state_value += action_prob * child_utility + children_utilities[action] = child_utility + + counterfactual_reach_prob = (np.prod( + reach_probabilities[:current_player]) * np.prod(reach_probabilities[current_player + 1:])) + + state_value_for_player = state_value[current_player] + deviations = info_state_node.relizable_deviations + for deviation_index in range(len(deviations)): + deviation = deviations[deviation_index] + deviation_strategy = deviation.deviate( + strat_dict_to_array(self._get_infostate_policy(info_state))) + + player_child_utilities = np.array(list(children_utilities.values()))[ + :, current_player] + devation_cf_value = np.inner(np.transpose( + deviation_strategy), player_child_utilities) + + memory_reach_probs = create_probs_from_index( + info_state_node.current_history_probs, self.current_policy()) + player_current_memory_reach_prob = deviation.player_deviation_reach_probability( + memory_reach_probs) + + deviation_regret = player_current_memory_reach_prob * \ + ((devation_cf_value*counterfactual_reach_prob) - + (counterfactual_reach_prob * state_value_for_player)) + + info_state_node.cumulative_regret[deviation_index] += deviation_regret + return state_value + + def _get_infostate_policy(self, info_state_str): + """Returns an {action: prob} dictionary for the policy on `info_state`.""" + info_state_node = self._info_state_nodes[info_state_str] + prob_vec = self._current_policy.action_probability_array[ + info_state_node.index_in_tabular_policy] + return { + action: prob_vec[action] for action in info_state_node.legal_actions + } +class _EFRSolver(_EFRSolverBase): + def evaluate_and_update_policy(self): + """Performs a single step of policy evaluation and policy improvement.""" + self._compute_cumulative_immediate_regret_for_player( + self._root_node, + policies=None, + reach_probabilities=np.ones(self._game.num_players() + 1), + player=None) + self._update_current_policy(self._root_node, self._current_policy) + self._iteration += 1 -def create_probs_from_index(indices, current_policy): - path_to_state = [] - if indices is None or len(indices) == 0: - return [] - for index in indices: - strat_dict = array_to_strat_dict( - current_policy.action_probability_array[index[1]], index[0]) - path_to_state.append(strat_dict) - return path_to_state +class EFRSolver(_EFRSolver): + """ + Implements the EFR algorithm. 
-# Deviation set definitions -def return_blind_action(num_actions, history, _): - """ - Returns an array of all Blind Action deviations with respect to an information set. + See: https://arxiv.org/abs/2102.06973 + """ + def __init__(self, game, deviations_name): + """Initializer. Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior - Returns: - an array of LocalDeviationWithTimeSelection objects that represent all Blind Action deviations - that are realizable at the - information set. + game: The `pyspiel.Game` to run on. + deviation_name: the name of the deviation type to use for accumulating regrets and calculating the strategy at the next timestep. + + Deviation types implemented are "blind action", "informed action", "blind cf", + "informed counterfactual", "blind partial sequence", "counterfactual partial sequence", + "casual partial sequence", "twice informed partial sequence", "single target behavioural". + See "Efficient Deviation Types and Learning for Hindsight Rationality in Extensive-Form Games" by D. Morrill et al. 2021b + for the full definition of each type. """ - memory_weights = [np.full(len(history), 1)] - prior_actions_in_memory = history - return return_all_external_deviations(num_actions, memory_weights, - prior_actions_in_memory, history) + #external_only = True leads to a shortcut in the computation of the next timesteps strategy from the regrets + external_only = False + deviation_sets = None + + if deviations_name == "blind action": + deviation_sets = return_blind_action + external_only = True + elif deviations_name == "informed action": + deviation_sets = return_informed_action + elif deviations_name == "blind cf" or deviations_name == "blind counterfactual": + deviation_sets = return_blind_cf + external_only = True + elif deviations_name == "informed cf" or deviations_name == "informed counterfactual": + deviation_sets = return_informed_cf + elif deviations_name == "bps" or deviations_name == "blind partial sequence": + deviation_sets = return_blind_partial_sequence + external_only = True + elif deviations_name == "cfps" or deviations_name == "cf partial sequence"\ + or deviations_name == "counterfactual partial sequence": + deviation_sets = return_cf_partial_sequence + elif deviations_name == "csps" or deviations_name == "casual partial sequence": + deviation_sets = return_cs_partial_sequence + elif deviations_name == "tips" or deviations_name == "twice informed partial sequence": + deviation_sets = return_twice_informed_partial_sequence + elif deviations_name == "bhv" or deviations_name == "single target behavioural"\ + or deviations_name == "behavioural": + deviation_sets = return_behavourial + else: + raise ValueError("Unsupported Deviation Set Passed As Constructor Argument") + super(EFRSolver, self).__init__(game, deviation_sets) + self._external_only = external_only -def return_informed_action(num_actions, history, _): - """ - Returns an array of all Informed Action deviations with respect to an information set. + def _regret_matching(self, legal_actions, info_set_node): + """Returns an info state policy by applying regret-matching function + over all deviations and time selection functions. Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior - Returns: - an array of LocalDeviationWithTimeSelection objects that represent all Informed Action deviations that are realizable at the - information set. 
- """ - memory_weights = [np.full(len(history), 1)] - prior_actions_in_memory = history - return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + legal_actions: the list of legal actions at this state. -def return_blind_CF(num_actions, history, _): - """ - Returns an array of all Blind Counterfactual deviations with respect to an information set. - Note: EFR using only Blind Counterfactual deviations is equivalent to vanilla Counterfactual - Regret Minimisation (CFR). - Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior Returns: - an array of LocalDeviationWithTimeSelection objects that represent all Blind CF deviations - that are realizable at the information set. + A dict of action -> prob for all legal actions. """ - memory_weights = [None] - prior_actions_in_memory = np.zeros(len(history)) - return return_all_external_deviations(num_actions, memory_weights, prior_actions_in_memory, history) + z = sum(info_set_node.y_values.values()) + info_state_policy = {} + + # The fixed point solution can be directly obtained through the weighted regret matrix + # if only external deviations are used + if self._external_only and z > 0: + weighted_deviation_matrix = np.zeros( + (len(legal_actions), len(legal_actions))) + for dev in list(info_set_node.y_values.keys()): + weighted_deviation_matrix += ( + info_set_node.y_values[dev]/z) * dev.return_transform_matrix() + new_strategy = weighted_deviation_matrix[:, 0] + for index in range(len(legal_actions)): + info_state_policy[legal_actions[index]] = new_strategy[index] + + # Full regret matching by finding the least squares solution to the fixed point + # Last row of matrix and the column entry ensures the solution is a strategy (otherwise would have to normalise) + elif z > 0: + num_actions = len(info_set_node.legal_actions) + weighted_deviation_matrix = -np.eye(num_actions) + + for dev in list(info_set_node.y_values.keys()): + weighted_deviation_matrix += ( + info_set_node.y_values[dev]/z) * dev.return_transform_matrix() + + normalisation_row = np.ones(num_actions) + weighted_deviation_matrix = np.vstack( + [weighted_deviation_matrix, normalisation_row]) + b = np.zeros(num_actions+1) + b[num_actions] = 1 + b = np.reshape(b, (num_actions+1, 1)) + + strategy = lstsq(weighted_deviation_matrix, b)[0] + + # Adopt same clipping strategy as paper author's code + strategy[np.where(strategy < 0)] = 0 + strategy[np.where(strategy > 1)] = 1 + + strategy = strategy/sum(strategy) + for index in range(len(strategy)): + info_state_policy[info_set_node.legal_actions[index] + ] = strategy[index] + # Use a uniform strategy as sum of all regrets is negative + else: + for index in range(len(legal_actions)): + info_state_policy[legal_actions[index]]\ + = 1.0 / len(legal_actions) + return info_state_policy -def return_informed_CF(num_actions, history, _): - memory_weights = [None] - prior_actions_in_memory = history - return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory, history) +def _update_average_policy(average_policy, info_state_nodes): + """Updates in place `average_policy` to the average of all policies iterated. + + This function is a module level function to be reused by both CFRSolver and + CFRBRSolver. + + Args: + average_policy: A `policy.TabularPolicy` to be updated in-place. + info_state_nodes: A dictionary {`info_state_str` -> `_InfoStateNode`}. 
+ """ + for info_state, info_state_node in info_state_nodes.items(): + info_state_policies_sum = info_state_node.cumulative_policy + state_policy = average_policy.policy_for_key(info_state) + probabilities_sum = sum(info_state_policies_sum.values()) + if probabilities_sum == 0: + num_actions = len(info_state_node.legal_actions) + for action in info_state_node.legal_actions: + state_policy[action] = 1 / num_actions + else: + for action, action_prob_sum in info_state_policies_sum.items(): + state_policy[action] = action_prob_sum / probabilities_sum -def return_blind_partial_sequence(num_actions, history, _): - """ - Returns an array of all Blind Partial Sequence deviations (BPS) - with respect to an information set - Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior - Returns: - an array of LocalDeviationWithTimeSelection objects that represent all BPS deviations - that are realizable at the information set. - """ - prior_actions_in_memory = history - memory_weights = [None] - if len(history) > 0: - memory_weights.append(np.ones(len(history))) - for i in range(len(history)): - possible_memory_weight = np.zeros(len(history)) - possible_memory_weight[0:i] = np.full(i, 1.0) - memory_weights.append(possible_memory_weight) - return return_all_external_deviations(num_actions, memory_weights, prior_actions_in_memory, history) +def strat_dict_to_array(strategy_dictionary): + """ + A helper function to convert the strategy dictionary action -> prob value to an array. + Args: + strategy_dictionary: a dictionary action -> prob value. + Returns: + strategy_array: an array with the ith action's value at the i-1th index. + """ + actions = list(strategy_dictionary.keys()) + strategy_array = np.zeros((len(actions), 1)) + for action in range(len(actions)): + strategy_array[action][0] = strategy_dictionary[actions[action]] + return strategy_array -def return_cf_partial_sequence(num_actions, history, _): - """ - Returns an array of all Counterfactual Partial Sequence deviations (CFPS) - with respect to an information set - Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior - Returns: - an array of LocalDeviationWithTimeSelection objects that represent all CFPS deviations - that are realizable at the information set. - """ - prior_actions_in_memory = history - memory_weights = [None] - if len(history) > 0: - memory_weights.append(np.ones(len(history))) - for i in range(len(history)): - possible_memory_weight = np.zeros(len(history)) - possible_memory_weight[0:i] = np.full(i, 1.0) - memory_weights.append(possible_memory_weight) - return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory, history) +def array_to_strat_dict(strategy_array, legal_actions): + """ + A helper function to convert a strategy array to an action -> prob value dictionary. + Args: + strategy_array: an array with the ith action's value at the i-1th index. + legal_actions: the list of all legal actions at the current state. + Returns: + strategy_dictionary: a dictionary action -> prob value. + """ + strategy_dictionary = {} + for action in legal_actions: + strategy_dictionary[action] = strategy_array[action] + return strategy_dictionary -def return_cs_partial_sequence(num_actions, history, prior_legal_actions): - """ - Returns an array of all Casual Partial Sequence deviations with respect to an information set. 
- Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior - prior_legal_actions: an array containing the index in .... that - Returns: - an array of LocalDeviationWithTimeSelection objects that represent all - Casual Partial Sequence deviations that are realizable at the - information set. - """ - prior_actions_in_memory = history - external_memory_weights = [None] +def create_probs_from_index(indices, current_policy): + path_to_state = [] + if indices is None or len(indices) == 0: + return [] + for index in indices: + strat_dict = array_to_strat_dict( + current_policy.action_probability_array[index[1]], index[0]) + path_to_state.append(strat_dict) + return path_to_state - for i in range(len(history)): - possible_memory_weight = np.zeros(len(history)) - possible_memory_weight[0:i] = np.full(i, 1.0) - external_memory_weights.append(possible_memory_weight) - external = return_all_external_modified_deviations( - num_actions, external_memory_weights, prior_legal_actions, prior_actions_in_memory, history) - internal = return_blind_action(num_actions, history, None) +# Deviation set definitions +def return_blind_action(num_actions, history, _): + """ + Returns an array of all Blind Action deviations with respect to an information set. + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior actions played by the `player` to reach the information set. + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all Blind Action deviations + that are realizable at the + information set. + """ + memory_weights = [np.full(len(history), 1)] + prior_actions_in_memory = history + return return_all_external_deviations(num_actions, memory_weights, + prior_actions_in_memory) - cf_ext = return_informed_CF(num_actions, history, None) - cf_int = return_blind_CF(num_actions, history, None) - return np.concatenate((external, internal, cf_ext, cf_int)) +def return_informed_action(num_actions, history, _): + """ + Returns an array of all Informed Action deviations with respect to an information set. + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior actions played by the `player` to reach the information set. + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all Informed Action deviations that are realizable at the + information set. + """ + memory_weights = [np.full(len(history), 1)] + prior_actions_in_memory = history + return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory) + + +def return_blind_cf(num_actions, history, _): + """ + Returns an array of all Blind Counterfactual deviations with respect to an information set. + Note: EFR using only Blind Counterfactual deviations is equivalent to vanilla Counterfactual + Regret Minimisation (CFR). + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior actions played by the `player` to reach the information set. + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all Blind CF deviations + that are realizable at the information set. 
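As a quick concrete check of the blind counterfactual generator described above (the deviation set under which EFR reduces to vanilla CFR), the sketch below assumes the efr module from this patch is importable under its post-linting name return_blind_cf, and uses a toy three-action state with two prior actions:

from open_spiel.python.algorithms import efr

devs = efr.return_blind_cf(3, [0, 1], None)
print(len(devs))                                         # 3: one external deviation per target action
print(devs[0].return_transform_matrix())                 # zeros except a row of ones at target action 0
print(devs[0].player_deviation_reach_probability(None))  # 1.0: blind deviations ignore memory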
+ """ + memory_weights = [None] + prior_actions_in_memory = np.zeros(len(history)) + return return_all_external_deviations(num_actions, memory_weights, prior_actions_in_memory) + + +def return_informed_cf(num_actions, history, _): + memory_weights = [None] + prior_actions_in_memory = history + return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory) -def return_cs_partial_sequence_orginal(num_actions, history, prior_legal_actions): - """ - Returns an array of all Casual Partial Sequence deviations with respect to an information set. - Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior - prior_legal_actions: an array containing the index in .... that - Returns: - an array of LocalDeviationWithTimeSelection objects that represent all - Casual Partial Sequence deviations that are realizable at the information set. - """ - prior_actions_in_memory = history - external_memory_weights = [None] +def return_blind_partial_sequence(num_actions, history, _): + """ + Returns an array of all Blind Partial Sequence deviations (BPS) + with respect to an information set + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior actions played by the `player` to reach the information set. + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all BPS deviations + that are realizable at the information set. + """ + prior_actions_in_memory = history + memory_weights = [None] + if len(history) > 0: + memory_weights.append(np.ones(len(history))) + for i in range(len(history)): + possible_memory_weight = np.zeros(len(history)) + possible_memory_weight[0:i] = np.full(i, 1.0) + memory_weights.append(possible_memory_weight) + return return_all_external_deviations(num_actions, memory_weights, prior_actions_in_memory) - for i in range(len(history)): - possible_memory_weight = np.zeros(len(history)) - possible_memory_weight[0:i] = np.full(i, 1.0) - external_memory_weights.append(possible_memory_weight) - external = return_all_external_modified_deviations( - num_actions, external_memory_weights, prior_legal_actions, prior_actions_in_memory, history) - internal = return_informed_action(num_actions, history, None) +def return_cf_partial_sequence(num_actions, history, _): + """ + Returns an array of all Counterfactual Partial Sequence deviations (CFPS) + with respect to an information set + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior actions played by the `player` to reach the information set. + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all CFPS deviations + that are realizable at the information set. 
+ """ + prior_actions_in_memory = history + memory_weights = [None] + if len(history) > 0: + memory_weights.append(np.ones(len(history))) + for i in range(len(history)): + possible_memory_weight = np.zeros(len(history)) + possible_memory_weight[0:i] = np.full(i, 1.0) + memory_weights.append(possible_memory_weight) + return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory) - cf_ext = return_informed_CF(num_actions, history, None) - return np.concatenate((external, internal, cf_ext)) +def return_cs_partial_sequence(num_actions, history, prior_legal_actions): + """ + Returns an array of all Casual Partial Sequence deviations with respect to an information set. + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior actions played by the `player` to reach the information set. + prior_legal_actions: a 2d array containing the legal actions for each preceeding state. + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all + Casual Partial Sequence deviations that are realizable at the + information set. + """ + prior_actions_in_memory = history + external_memory_weights = [None] + + for i in range(len(history)): + possible_memory_weight = np.zeros(len(history)) + possible_memory_weight[0:i] = np.full(i, 1.0) + external_memory_weights.append(possible_memory_weight) + + external = return_all_external_modified_deviations( + num_actions, external_memory_weights, prior_legal_actions, prior_actions_in_memory) + internal = return_blind_action(num_actions, history, None) + + cf_ext = return_informed_cf(num_actions, history, None) + cf_int = return_blind_cf(num_actions, history, None) + + return np.concatenate((external, internal, cf_ext, cf_int)) -def return_twice_informed_partial_sequence(num_actions, history, prior_legal_actions): - """ - Returns an array of all Twice Informed Partial Sequence (TIPS) deviations - with respect to an information set. - Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior - prior_legal_actions: an array containing the index in .... that - Returns: - an array of LocalDeviationWithTimeSelection objects that represent all TIPS deviations that are realizable at the - information set. - """ - prior_actions_in_memory = history - memory_weights = [None] - for i in range(len(history)): - possible_memory_weight = np.zeros(len(history)) - possible_memory_weight[0:i] = np.full(i, 1.0) - memory_weights.append(possible_memory_weight) +def return_cs_partial_sequence_orginal(num_actions, history, prior_legal_actions): + """ + Returns an array of all Casual Partial Sequence deviations with respect to an information set. + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior actions played by the `player` to reach the information set. + prior_legal_actions: a 2d array containing the legal actions for each preceeding state. + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all + Casual Partial Sequence deviations that are realizable at the information set. 
+ """ + prior_actions_in_memory = history + external_memory_weights = [None] + + for i in range(len(history)): + possible_memory_weight = np.zeros(len(history)) + possible_memory_weight[0:i] = np.full(i, 1.0) + external_memory_weights.append(possible_memory_weight) + + external = return_all_external_modified_deviations( + num_actions, external_memory_weights, prior_legal_actions, prior_actions_in_memory) + internal = return_informed_action(num_actions, history, None) + + cf_ext = return_informed_cf(num_actions, history, None) + return np.concatenate((external, internal, cf_ext)) - internal = return_all_internal_modified_deviations( - num_actions, memory_weights, prior_legal_actions, prior_actions_in_memory, history) - cf_int = return_informed_CF(num_actions, history, None) - return np.concatenate((internal, cf_int)) +def return_twice_informed_partial_sequence(num_actions, history, prior_legal_actions): + """ + Returns an array of all Twice Informed Partial Sequence (TIPS) deviations + with respect to an information set. + Args: + num_actions: the integer of all actions that can be taken at that information set + history: an array containing the prior actions played by the `player` to reach the information set. + prior_legal_actions: a 2d array containing the legal actions for each preceeding state. + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all TIPS deviations that are realizable at the + information set. + """ + prior_actions_in_memory = history + memory_weights = [None] + + for i in range(len(history)): + possible_memory_weight = np.zeros(len(history)) + possible_memory_weight[0:i] = np.full(i, 1.0) + memory_weights.append(possible_memory_weight) + + internal = return_all_internal_modified_deviations( + num_actions, memory_weights, prior_legal_actions, prior_actions_in_memory) + + cf_int = return_informed_cf(num_actions, history, None) + return np.concatenate((internal, cf_int)) def generate_all_action_permutations(current_stem, remaining_actions): - if len(remaining_actions) == 0: - return [np.array(current_stem)] - else: - next_actions = remaining_actions[0] - permutations = [] - for action in next_actions: - next_stem = current_stem.copy() - next_stem.append(action) - next_remaining_actions = remaining_actions[1:] - prev_permutations = generate_all_action_permutations( - next_stem, next_remaining_actions) - for i in prev_permutations: - permutations.append(i) - return permutations + """ + Args: + current_stem: the prior sequence of actions to be completed by the remaining actions + remaining_actions: a 2d array of [subsequent states]x[possible actions] + Returns: + An array with each element being the current stem joined with a possible permuation of remaining actions + """ + if len(remaining_actions) == 0: + return [np.array(current_stem)] + else: + next_actions = remaining_actions[0] + permutations = [] + for action in next_actions: + next_stem = current_stem.copy() + next_stem.append(action) + next_remaining_actions = remaining_actions[1:] + prev_permutations = generate_all_action_permutations( + next_stem, next_remaining_actions) + for i in prev_permutations: + permutations.append(i) + return permutations # Includes identity def return_behavourial(num_actions, history, prior_legal_actions): - deviations = [] - if len(history) == 0: + """ + [TODO] + """ + deviations = [] + if len(history) == 0: + internal = return_all_non_identity_internal_deviations( + num_actions, [None], history) + for i in internal: + deviations.append(i) + else: + for 
deviation_info in range(len(history)): + prior_possible_memory_actions = generate_all_action_permutations( + [], prior_legal_actions[:deviation_info+1]) + memory_weights = np.concatenate( + (np.ones(deviation_info), np.zeros(len(history) - deviation_info))) + for prior_memory_actions in prior_possible_memory_actions: + prior_memory_actions = np.concatenate( + (prior_memory_actions, np.zeros(len(history) - len(prior_memory_actions)))) + for i in range(len(history) - len(prior_memory_actions)): + prior_memory_actions.append(0) + prior_memory_actions_cp = prior_memory_actions.copy() internal = return_all_non_identity_internal_deviations( - num_actions, [None], [None], history) + num_actions, [memory_weights], prior_memory_actions_cp) for i in internal: - deviations.append(i) - else: - for deviation_info in range(len(history)): - prior_possible_memory_actions = generate_all_action_permutations( - [], prior_legal_actions[:deviation_info+1]) - memory_weights = np.concatenate( - (np.ones(deviation_info), np.zeros(len(history) - deviation_info))) - for prior_memory_actions in prior_possible_memory_actions: - prior_memory_actions = np.concatenate( - (prior_memory_actions, np.zeros(len(history) - len(prior_memory_actions)))) - for i in range(len(history) - len(prior_memory_actions)): - prior_memory_actions.append(0) - prior_memory_actions_cp = prior_memory_actions.copy() - internal = return_all_non_identity_internal_deviations( - num_actions, [memory_weights], prior_memory_actions_cp, prior_memory_actions_cp) - for i in internal: - deviations.append(i) - - return deviations + deviations.append(i) + + return deviations class LocalDeviationWithTimeSelection(object): - local_swap_transform = attr.ib() - - # Which actions have been forgotten (0) or remembered (1) according to the memory state - prior_actions_weight = attr.ib() - - # Which actions have been take according to the memory state - prior_memory_actions = attr.ib() - - use_unmodified_history = attr.ib() - - def __init__(self, target, source, num_actions, prior_actions_weight, prior_memory_actions, - is_external, use_unmodified_history=True): - """" - Represents a swap transformation (both external and internal) for a given memory state. - Args: - target: the action that will be played when the deviation is triggered. - source: the action that will trigger the target action if (used only by internal deviations, i.e is_external = False). - num_actions: the integer of actions that can be played for this information state - prior_actions_weight: an array the length of the history of the information state - actions have been forgotten (0) or remembered (1) according to the memory state. - This is represented numerically for possible experimentation with partially forgotten - actions (i.e in the range (0,1)). - prior_memory_actions: the preceeding actions upto the the information state - (which the LocalDeviationWithTimeSelection is defined with respect to). - is_external: a boolean use to determine whether this is an internal or external type deviation. 
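return_behavourial above relies on generate_all_action_permutations to enumerate every action sequence the player could have remembered. A tiny illustration with hypothetical legal-action sets, assuming the efr module from this patch is importable:

from open_spiel.python.algorithms import efr

stems = efr.generate_all_action_permutations([], [[0, 1], [0, 1, 2]])
print([s.tolist() for s in stems])
# [[0, 0], [0, 1], [0, 2], [1, 0], [1, 1], [1, 2]]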
- use_unmodified_history: a boolean used to - """ - self.local_swap_transform = LocalSwapTransform( - target, source, num_actions, is_external=is_external) - self.prior_actions_weight = prior_actions_weight - self.prior_memory_actions = prior_memory_actions - self.use_unmodified_history = use_unmodified_history - - # If a pure strategy, a pure strategy will be returned (aka function works for both actions and strategies as input) - def deviate(self, strategy): - """ - Args: - - """ - return self.local_swap_transform.deviate(strategy) - - def return_transform_matrix(self): - """ - Returns the matrix_transform of the associated `LocalSwapTransform` object. - """ - return self.local_swap_transform.matrix_transform - - def player_deviation_reach_probability(self, prior_possible_action_probabilities): - if self.prior_actions_weight is None or self.prior_memory_actions is None or prior_possible_action_probabilities is None: - return 1.0 - - memory_action_probabilities = np.ones(len(self.prior_actions_weight)) - # Reconstruct memory probabilities from history provided to the deviation to reach info set and the current memory probs - memory_weightings = self.prior_actions_weight.copy() - if self.use_unmodified_history: - for state in range(len(self.prior_memory_actions)): - if not self.prior_actions_weight[state] == 0: - memory_action_probabilities[state] = ( - prior_possible_action_probabilities[state][self.prior_memory_actions[state]]) - else: - memory_action_probabilities[state] = 1 - memory_weightings[state] = 1 - - - path_probability = np.multiply( - memory_weightings, memory_action_probabilities) - memory_reach_probability = np.prod(path_probability) - return memory_reach_probability - - def __eq__(self, other): - if self.local_swap_transform == other.local_swap_transform: - return True - else: - return False + """" + Comprised of a swap transformation that will be applied at the current information state, a memory weighting + which describes the which actions are remembered and the memory action history (prior_memory_actions) that is remembered. + Note that the "memory action history" might not equal the history in the case of some deviation types (e.g tips deviations). + """ + #The swap transformation that will be compared to the unmodified strategy. + #The transformation is applied at the memory state. + local_swap_transform = attr.ib() + + # Which actions have been forgotten (0) or remembered (1) according to the memory state + prior_actions_weight = attr.ib() + + # Which actions have been take according to the memory state + prior_memory_actions = attr.ib() + + use_unmodified_history = attr.ib() + + def __init__(self, target, source, num_actions, prior_actions_weight, prior_memory_actions, + is_external, use_unmodified_history=True): + """" + Represents a swap transformation (both external and internal) for a given memory state. + Args: + target: the action that will be played when the deviation is triggered. + source: the action that will trigger the target action if (used only by internal deviations, i.e is_external = False). + num_actions: the integer of actions that can be played for this information state + prior_actions_weight: an array the length of the history of the information state + actions have been forgotten (0) or remembered (1) according to the memory state. + This is represented numerically for possible experimentation with partially forgotten + actions (i.e in the range (0,1)). 
+ prior_memory_actions: the preceeding actions upto the the information state + (which the LocalDeviationWithTimeSelection is defined with respect to). + is_external: a boolean use to determine whether this is an internal or external type deviation. + use_unmodified_history: a boolean used to indicate whether the provided memory_actions are the same as + the information state it was derived from. + """ + self.local_swap_transform = LocalSwapTransform( + target, source, num_actions, is_external=is_external) + self.prior_actions_weight = prior_actions_weight + self.prior_memory_actions = prior_memory_actions + self.use_unmodified_history = use_unmodified_history + + # If a pure strategy, a pure strategy will be returned (aka function works for both actions and strategies as input) + def deviate(self, strategy): + """ + Returns the strategy array given by deviating according to the 'self.local_swap_transform.matrix_transform' matrix. + Args: + strategy: the strategy array to deviate from. + Returns: + the matrix product of the the matrix_transform and the provided strategy. + """ + return self.local_swap_transform.deviate(strategy) - def __hash__(self): - return hash(self.local_swap_transform) + def return_transform_matrix(self): + """ + Returns the matrix_transform of the associated `LocalSwapTransform` object. + """ + return self.local_swap_transform.matrix_transform -def return_all_non_identity_internal_deviations(num_actions, possible_prior_weights, prior_memory_actions, _): - deviations = [] - for prior_actions_weight in possible_prior_weights: - for target in range(num_actions): - for source in range(num_actions): - if not source == target: - deviations.append(LocalDeviationWithTimeSelection( - target, source, num_actions, prior_actions_weight, prior_memory_actions, False)) - return deviations - -def return_all_internal_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions, _): - deviations = [] - for prior_actions_weight in possible_prior_weights: - try: - modification_index = np.where(prior_actions_weight == 0)[0][0] - except IndexError: - modification_index = 0 - if modification_index == len(prior_memory_actions): - for target in range(num_actions): - for source in range(num_actions): - if not source == target: - deviations.append(LocalDeviationWithTimeSelection( - target, source, num_actions, prior_actions_weight, prior_memory_actions, False)) + def player_deviation_reach_probability(self, prior_possible_action_probabilities): + """ + Calculate the probability of reaching the current memory state provided the + player played from the start of the game to this state. This is assuming that they play + with their current strategy with the deviation applied. + Args: + prior_possible_action_probabilities: a 2d array of length + [player's history]x[number of actions at that state]. These are the current strategies of + the player, from start to end of their history. + Returns: + The reach probability of the current memory state. 
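A hand-worked sketch (with assumed probabilities) of the memory reach probability computed by player_deviation_reach_probability below: remembered actions (weight 1) contribute the probability the current strategy assigns to the remembered action, while forgotten actions (weight 0) contribute a factor of one:

import numpy as np

prior_actions_weight = np.array([1, 0, 1])  # remember the 1st and 3rd prior action
prior_memory_actions = [0, 1, 2]            # actions recorded in the memory state
prior_action_probs = [                      # current strategy at each prior state, action -> prob
    {0: 0.5, 1: 0.5},
    {0: 0.1, 1: 0.9},
    {0: 0.2, 1: 0.3, 2: 0.5},
]

factors = [
    prior_action_probs[i][prior_memory_actions[i]] if prior_actions_weight[i] else 1.0
    for i in range(len(prior_memory_actions))
]
print(np.prod(factors))  # 0.5 * 1.0 * 0.5 = 0.25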
+ """ + if self.prior_actions_weight is None or self.prior_memory_actions is None or prior_possible_action_probabilities is None: + return 1.0 + + memory_action_probabilities = np.ones(len(self.prior_actions_weight)) + #Reconstruct memory probabilities from history provided to the deviation to reach info set and the current memory probs + memory_weightings = self.prior_actions_weight.copy() + if self.use_unmodified_history: + for state in range(len(self.prior_memory_actions)): + if not self.prior_actions_weight[state] == 0: + memory_action_probabilities[state] = ( + prior_possible_action_probabilities[state][self.prior_memory_actions[state]]) else: - previous_action = prior_memory_actions[modification_index] - for alt_action in possible_prior_memory_actions[modification_index]: - prior_memory_actions[modification_index] = alt_action - for target in range(num_actions): - for source in range(num_actions): - if not source == target: - deviations.append(LocalDeviationWithTimeSelection( - target, source, num_actions, prior_actions_weight, prior_memory_actions.copy(), False)) - prior_memory_actions[modification_index] = previous_action - return deviations - - -def return_all_external_deviations(num_actions, possible_prior_weights, prior_memory_actions, _): - deviations = [] - for prior_actions_weight in possible_prior_weights: - for target in range(num_actions): + memory_action_probabilities[state] = 1 + memory_weightings[state] = 1 + + path_probability = np.multiply( + memory_weightings, memory_action_probabilities) + memory_reach_probability = np.prod(path_probability) + return memory_reach_probability + + def __eq__(self, other): + if self.local_swap_transform == other.local_swap_transform: + return True + else: + return False + + def __hash__(self): + return hash(self.local_swap_transform) + +def return_all_non_identity_internal_deviations(num_actions, possible_prior_weights, prior_memory_actions): + deviations = [] + for prior_actions_weight in possible_prior_weights: + for target in range(num_actions): + for source in range(num_actions): + if not source == target: + deviations.append(LocalDeviationWithTimeSelection( + target, source, num_actions, prior_actions_weight, prior_memory_actions, False)) + return deviations + +def return_all_internal_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions): + deviations = [] + for prior_actions_weight in possible_prior_weights: + try: + modification_index = np.where(prior_actions_weight == 0)[0][0] + except IndexError: + modification_index = 0 + if modification_index == len(prior_memory_actions): + for target in range(num_actions): + for source in range(num_actions): + if not source == target: deviations.append(LocalDeviationWithTimeSelection( - target, target, num_actions, prior_actions_weight, prior_memory_actions, True)) - return deviations + target, source, num_actions, prior_actions_weight, prior_memory_actions, False)) + else: + previous_action = prior_memory_actions[modification_index] + for alt_action in possible_prior_memory_actions[modification_index]: + prior_memory_actions[modification_index] = alt_action + for target in range(num_actions): + for source in range(num_actions): + if not source == target: + deviations.append(LocalDeviationWithTimeSelection( + target, source, num_actions, prior_actions_weight, prior_memory_actions.copy(), False)) + prior_memory_actions[modification_index] = previous_action + return deviations + + +def return_all_external_deviations(num_actions, 
possible_prior_weights, prior_memory_actions): + deviations = [] + for prior_actions_weight in possible_prior_weights: + for target in range(num_actions): + deviations.append(LocalDeviationWithTimeSelection( + target, target, num_actions, prior_actions_weight, prior_memory_actions, True)) + return deviations # Modify last action as required -def return_all_external_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions, _): - deviations = [] - for prior_actions_weight in possible_prior_weights: - try: - modification_index = np.where(prior_actions_weight == 0)[0][0] - except IndexError: - modification_index = 0 - if modification_index == len(prior_memory_actions): - for target in range(num_actions): - deviations.append(LocalDeviationWithTimeSelection( - target, target, num_actions, prior_actions_weight, prior_memory_actions, True)) - else: - previous_action = prior_memory_actions[modification_index] - for alt_action in possible_prior_memory_actions[modification_index]: - prior_memory_actions[modification_index] = alt_action - for target in range(num_actions): - deviations.append(LocalDeviationWithTimeSelection( - target, target, num_actions, prior_actions_weight, prior_memory_actions.copy(), True)) - prior_memory_actions[modification_index] = previous_action - return deviations - - -def return_identity_deviation(num_actions, possible_prior_weights, prior_memory_actions, _): - deviations = [] - for prior_actions_weight in possible_prior_weights: +def return_all_external_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions): + deviations = [] + for prior_actions_weight in possible_prior_weights: + try: + modification_index = np.where(prior_actions_weight == 0)[0][0] + except IndexError: + modification_index = 0 + if modification_index == len(prior_memory_actions): + for target in range(num_actions): deviations.append(LocalDeviationWithTimeSelection( - 0, 0, num_actions, prior_actions_weight, prior_memory_actions, False)) - return deviations + target, target, num_actions, prior_actions_weight, prior_memory_actions, True)) + else: + previous_action = prior_memory_actions[modification_index] + for alt_action in possible_prior_memory_actions[modification_index]: + prior_memory_actions[modification_index] = alt_action + for target in range(num_actions): + deviations.append(LocalDeviationWithTimeSelection( + target, target, num_actions, prior_actions_weight, prior_memory_actions.copy(), True)) + prior_memory_actions[modification_index] = previous_action + return deviations + + +def return_identity_deviation(num_actions, possible_prior_weights, prior_memory_actions): + deviations = [] + for prior_actions_weight in possible_prior_weights: + deviations.append(LocalDeviationWithTimeSelection( + 0, 0, num_actions, prior_actions_weight, prior_memory_actions, False)) + return deviations # A swap transformation given by the matrix_transform for an information state of class LocalSwapTransform(object): + """ + Represents a swap transformation (both external and internal) for an information state for a certain number of actions. + """ + source_action = attr.ib() + target_action = attr.ib() + matrix_transform = attr.ib() + actions_num = attr.ib() + is_external = attr.ib() + + def __init__(self, target, source, actions_num, is_external=True): + """" + Creates the matrix transformation that describes the transformation and initalises the other variables. 
+ Args: + target: the action that will be played when the deviation is triggered + source: the action that will trigger the target action if (used only by internal deviations, i.e is_external = False) + num_actions: the integer of actions that can be played for this information state + is_external: a boolean used to determine whether to create an internal or external type deviation. """ - Represents a swap transformation (both external and internal) for an information state for a certain number of actions. - """ - source_action = attr.ib() - target_action = attr.ib() - matrix_transform = attr.ib() - actions_num = attr.ib() - is_external = attr.ib() - - def __init__(self, target, source, actions_num, is_external=True): - """" - Creates the matrix transformation that describes the transformation and initalises the other variables. - Args: - target: the action that will be played when the deviation is triggered - source: the action that will trigger the target action if (used only by internal deviations, i.e is_external = False) - num_actions: the integer of actions that can be played for this information state - is_external: a boolean used to determine whether to create an internal or external type deviation. - """ - self.source_action = source - self.target_action = target - self.actions_num = actions_num - if is_external: - self.source_action = None - self.matrix_transform = np.zeros((actions_num, actions_num)) - self.matrix_transform[target] = np.ones(actions_num) - else: - self.matrix_transform = np.eye(actions_num) - self.matrix_transform[target][source] = 1 - self.matrix_transform[source][source] = 0 + self.source_action = source + self.target_action = target + self.actions_num = actions_num + if is_external: + self.source_action = None + self.matrix_transform = np.zeros((actions_num, actions_num)) + self.matrix_transform[target] = np.ones(actions_num) + else: + self.matrix_transform = np.eye(actions_num) + self.matrix_transform[target][source] = 1 + self.matrix_transform[source][source] = 0 - def __repr__(self) -> str: - return "Diverting from Action: "+str(self.source_action) + " to Action: "+str(self.target_action) + def __repr__(self) -> str: + return "Diverting from Action: "+str(self.source_action) + " to Action: "+str(self.target_action) - def __eq__(self, __o: object) -> bool: - if self.source_action == __o.source_action and self.target_action == __o.target_action and self.actions_num == __o.actions_num: - return True - else: - return False - - def __hash__(self): - separator = " " - return hash(str(self.source_action)+separator+str(self.target_action)+separator+str(self.actions_num) + separator + str(self.is_external)) - - def deviate(self, strategy): - """ - Returns the strategy array given by deviating according to 'self.matrix_transform' matrix. - Args: - strategy: the strategy array to deviate from. - Returns: - the matrix product of the the matrix_transform and the provided strategy. - """ - return np.matmul(self.matrix_transform, strategy) + def __eq__(self, other: object) -> bool: + if self.source_action == other.source_action and self.target_action == other.target_action and self.actions_num == other.actions_num: + return True + else: + return False + + def __hash__(self): + separator = " " + return hash(str(self.source_action)+separator+str(self.target_action)+separator+str(self.actions_num) + separator + str(self.is_external)) + + def deviate(self, strategy): + """ + Returns the strategy array given by deviating according to 'self.matrix_transform' matrix. 
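Writing the two matrix shapes built in __init__ above out for a toy three-action state (indices chosen arbitrarily): an external transform maps every strategy to the pure target action, while an internal transform reroutes only the source action's probability onto the target:

import numpy as np

n, target, source = 3, 0, 2

external = np.zeros((n, n))
external[target] = np.ones(n)  # row of ones at the target action

internal = np.eye(n)
internal[target][source] = 1   # send source's probability to target...
internal[source][source] = 0   # ...and remove it from source

strategy = np.array([0.2, 0.3, 0.5])
print(external @ strategy)  # [1. 0. 0.]   -> always play action 0
print(internal @ strategy)  # [0.7 0.3 0.] -> action 2's mass moved onto action 0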
+ Args: + strategy: the strategy array to deviate from. + Returns: + the matrix product of the the matrix_transform and the provided strategy. + """ + return np.matmul(self.matrix_transform, strategy) diff --git a/open_spiel/python/algorithms/efr_test.py b/open_spiel/python/algorithms/efr_test.py index 766998d050..195f7152d0 100644 --- a/open_spiel/python/algorithms/efr_test.py +++ b/open_spiel/python/algorithms/efr_test.py @@ -14,8 +14,6 @@ """Tests for open_spiel.python.algorithms.efr.""" -import itertools - from absl.testing import absltest from absl.testing import parameterized import numpy as np @@ -23,7 +21,6 @@ from open_spiel.python import policy from open_spiel.python.algorithms import efr from open_spiel.python.algorithms import expected_game_score -from open_spiel.python.algorithms import exploitability import pyspiel _KUHN_GAME = pyspiel.load_game("kuhn_poker") @@ -50,7 +47,7 @@ def test_policy_zero_is_uniform(self, deviations_name): np.testing.assert_array_equal( _LEDUC_UNIFORM_POLICY.action_probability_array, cfr_solver.average_policy().action_probability_array) - + @parameterized.parameters( ["blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"]) def test_cfr_kuhn_poker(self, deviations_name): @@ -58,7 +55,7 @@ def test_cfr_kuhn_poker(self, deviations_name): efr_solver = efr.EFRSolver( game=game, deviations_name=deviations_name - ) + ) for _ in range(300): efr_solver.evaluate_and_update_policy() average_policy = efr_solver.average_policy() From e5b8ed4ff3a3e464072271e655ebb24f090baf05 Mon Sep 17 00:00:00 2001 From: James Flynn Date: Tue, 14 Nov 2023 00:25:34 +0000 Subject: [PATCH 12/18] Added CCE test for EFR (3 player Kuhn Poker) --- open_spiel/python/algorithms/efr_test.py | 46 ++++++++++++++++++++++-- 1 file changed, 44 insertions(+), 2 deletions(-) diff --git a/open_spiel/python/algorithms/efr_test.py b/open_spiel/python/algorithms/efr_test.py index 195f7152d0..ec74a9a63f 100644 --- a/open_spiel/python/algorithms/efr_test.py +++ b/open_spiel/python/algorithms/efr_test.py @@ -19,9 +19,11 @@ import numpy as np from open_spiel.python import policy -from open_spiel.python.algorithms import efr from open_spiel.python.algorithms import expected_game_score +from open_spiel.python.algorithms import cfr import pyspiel +import efr + _KUHN_GAME = pyspiel.load_game("kuhn_poker") _LEDUC_GAME = pyspiel.load_game("leduc_poker") @@ -50,7 +52,7 @@ def test_policy_zero_is_uniform(self, deviations_name): @parameterized.parameters( ["blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"]) - def test_cfr_kuhn_poker(self, deviations_name): + def test_efr_kuhn_poker(self, deviations_name): game = pyspiel.load_game("kuhn_poker") efr_solver = efr.EFRSolver( game=game, @@ -65,5 +67,45 @@ def test_cfr_kuhn_poker(self, deviations_name): np.testing.assert_allclose( average_policy_values, [-1 / 18, 1 / 18], atol=1e-3) + @parameterized.parameters( + ["blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"]) + def test_efr_kuhn_poker_3p(self, deviations_name): + game = pyspiel.load_game("kuhn_poker(players=3)") + efr_solver = efr.EFRSolver( + game=game, + deviations_name=deviations_name + ) + strategies = [] + corr_dist_values = [] + for _ in range(10): + efr_solver.evaluate_and_update_policy() + # Convert the policy to a pyspiel.TabularPolicy, needed by the CorrDist + # functions on the C++ side. 
+ strategies.append(policy.python_policy_to_pyspiel_policy( + efr_solver.current_policy())) + corr_dev = pyspiel.uniform_correlation_device(strategies) + cce_dist_info = pyspiel.cce_dist(game, corr_dev) + corr_dist_values.append(cce_dist_info.dist_value) + self.assertLess(corr_dist_values[-1], corr_dist_values[0]) + + + @parameterized.parameters( + ["blind cf", "informed cf", "bps", "cfps", "csps"]) + def test_efr_cce_dist_sheriff(self, deviations_name): + game = pyspiel.load_game("sheriff") + efr_solver = efr.EFRSolver( + game=game, + deviations_name=deviations_name + ) + strategies = [] + corr_dist_values = [] + for _ in range(3): + efr_solver.evaluate_and_update_policy() + strategies.append(policy.python_policy_to_pyspiel_policy( + efr_solver.current_policy())) + corr_dev = pyspiel.uniform_correlation_device(strategies) + cce_dist_info = pyspiel.cce_dist(game, corr_dev) + corr_dist_values.append(cce_dist_info.dist_value) + self.assertLess(corr_dist_values[-1], corr_dist_values[0]) if __name__ == "__main__": absltest.main() From c4cd0926dcd9e408167090878782fffc8ddffb08 Mon Sep 17 00:00:00 2001 From: James Flynn Date: Wed, 15 Nov 2023 00:58:22 +0000 Subject: [PATCH 13/18] Added Sheriff tests(Look into initial iterations) --- open_spiel/python/algorithms/efr_test.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/open_spiel/python/algorithms/efr_test.py b/open_spiel/python/algorithms/efr_test.py index ec74a9a63f..8f860b4eb3 100644 --- a/open_spiel/python/algorithms/efr_test.py +++ b/open_spiel/python/algorithms/efr_test.py @@ -90,16 +90,18 @@ def test_efr_kuhn_poker_3p(self, deviations_name): @parameterized.parameters( - ["blind cf", "informed cf", "bps", "cfps", "csps"]) + ["blind cf", "bps", "tips"]) def test_efr_cce_dist_sheriff(self, deviations_name): game = pyspiel.load_game("sheriff") efr_solver = efr.EFRSolver( game=game, deviations_name=deviations_name - ) + ) + #efr_solver = cfr.CFRSolver(game) + strategies = [] corr_dist_values = [] - for _ in range(3): + for _ in range(5): efr_solver.evaluate_and_update_policy() strategies.append(policy.python_policy_to_pyspiel_policy( efr_solver.current_policy())) From 65d82f27ca9caea414e32dd5b30e7a2395b91654 Mon Sep 17 00:00:00 2001 From: James Flynn Date: Wed, 15 Nov 2023 01:08:16 +0000 Subject: [PATCH 14/18] Moved test variables to setUp --- open_spiel/python/algorithms/efr_test.py | 43 +++++++++++------------- 1 file changed, 20 insertions(+), 23 deletions(-) diff --git a/open_spiel/python/algorithms/efr_test.py b/open_spiel/python/algorithms/efr_test.py index 8f860b4eb3..2f2cbb4296 100644 --- a/open_spiel/python/algorithms/efr_test.py +++ b/open_spiel/python/algorithms/efr_test.py @@ -20,49 +20,50 @@ from open_spiel.python import policy from open_spiel.python.algorithms import expected_game_score -from open_spiel.python.algorithms import cfr +from open_spiel.python.algorithms import efr import pyspiel -import efr -_KUHN_GAME = pyspiel.load_game("kuhn_poker") -_LEDUC_GAME = pyspiel.load_game("leduc_poker") -_KUHN_UNIFORM_POLICY = policy.TabularPolicy(_KUHN_GAME) -_LEDUC_UNIFORM_POLICY = policy.TabularPolicy(_LEDUC_GAME) -_DEVIATIONS_ = ["blind action", "informed action", "blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"] class EFRTest(parameterized.TestCase, absltest.TestCase): - @parameterized.parameters(_DEVIATIONS_) + def setUp(self): + self._KUHN_GAME = pyspiel.load_game("kuhn_poker") + self._LEDUC_GAME = pyspiel.load_game("leduc_poker") + self._KUHN_3P_GAME = 
pyspiel.load_game("kuhn_poker(players=3)") + self._SHERIFF_GAME = pyspiel.load_game("sheriff") + + self._KUHN_UNIFORM_POLICY = policy.TabularPolicy(self._KUHN_GAME) + self._LEDUC_UNIFORM_POLICY = policy.TabularPolicy(self._LEDUC_GAME) + + @parameterized.parameters(["blind action", "informed action", "blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"]) def test_policy_zero_is_uniform(self, deviations_name): # We use Leduc and not Kuhn, because Leduc has illegal actions and Kuhn does # not. - game = pyspiel.load_game("leduc_poker") cfr_solver = efr.EFRSolver( - game=game, + game=self._LEDUC_GAME, deviations_name=deviations_name ) np.testing.assert_array_equal( - _LEDUC_UNIFORM_POLICY.action_probability_array, + self._LEDUC_UNIFORM_POLICY.action_probability_array, cfr_solver.current_policy().action_probability_array) np.testing.assert_array_equal( - _LEDUC_UNIFORM_POLICY.action_probability_array, + self._LEDUC_UNIFORM_POLICY.action_probability_array, cfr_solver.average_policy().action_probability_array) @parameterized.parameters( ["blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"]) def test_efr_kuhn_poker(self, deviations_name): - game = pyspiel.load_game("kuhn_poker") efr_solver = efr.EFRSolver( - game=game, + game=self._KUHN_GAME, deviations_name=deviations_name ) for _ in range(300): efr_solver.evaluate_and_update_policy() average_policy = efr_solver.average_policy() average_policy_values = expected_game_score.policy_value( - game.new_initial_state(), [average_policy] * 2) + self._KUHN_GAME.new_initial_state(), [average_policy] * 2) # 1/18 is the Nash value. See https://en.wikipedia.org/wiki/Kuhn_poker np.testing.assert_allclose( average_policy_values, [-1 / 18, 1 / 18], atol=1e-3) @@ -70,9 +71,8 @@ def test_efr_kuhn_poker(self, deviations_name): @parameterized.parameters( ["blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"]) def test_efr_kuhn_poker_3p(self, deviations_name): - game = pyspiel.load_game("kuhn_poker(players=3)") efr_solver = efr.EFRSolver( - game=game, + game=self._KUHN_3P_GAME, deviations_name=deviations_name ) strategies = [] @@ -84,7 +84,7 @@ def test_efr_kuhn_poker_3p(self, deviations_name): strategies.append(policy.python_policy_to_pyspiel_policy( efr_solver.current_policy())) corr_dev = pyspiel.uniform_correlation_device(strategies) - cce_dist_info = pyspiel.cce_dist(game, corr_dev) + cce_dist_info = pyspiel.cce_dist(self._KUHN_3P_GAME, corr_dev) corr_dist_values.append(cce_dist_info.dist_value) self.assertLess(corr_dist_values[-1], corr_dist_values[0]) @@ -92,13 +92,10 @@ def test_efr_kuhn_poker_3p(self, deviations_name): @parameterized.parameters( ["blind cf", "bps", "tips"]) def test_efr_cce_dist_sheriff(self, deviations_name): - game = pyspiel.load_game("sheriff") efr_solver = efr.EFRSolver( - game=game, + game=self._SHERIFF_GAME, deviations_name=deviations_name ) - #efr_solver = cfr.CFRSolver(game) - strategies = [] corr_dist_values = [] for _ in range(5): @@ -106,7 +103,7 @@ def test_efr_cce_dist_sheriff(self, deviations_name): strategies.append(policy.python_policy_to_pyspiel_policy( efr_solver.current_policy())) corr_dev = pyspiel.uniform_correlation_device(strategies) - cce_dist_info = pyspiel.cce_dist(game, corr_dev) + cce_dist_info = pyspiel.cce_dist(self._SHERIFF_GAME, corr_dev) corr_dist_values.append(cce_dist_info.dist_value) self.assertLess(corr_dist_values[-1], corr_dist_values[0]) if __name__ == "__main__": From 3654a8dca74c86821fb9537a21f731c455a0230e Mon Sep 17 00:00:00 2001 From: jameswflynn Date: Wed, 15 Nov 
2023 01:20:59 +0000 Subject: [PATCH 15/18] Linted efr_test --- open_spiel/python/algorithms/efr_test.py | 39 ++++++++++++------------ 1 file changed, 20 insertions(+), 19 deletions(-) diff --git a/open_spiel/python/algorithms/efr_test.py b/open_spiel/python/algorithms/efr_test.py index 2f2cbb4296..9ef99bd455 100644 --- a/open_spiel/python/algorithms/efr_test.py +++ b/open_spiel/python/algorithms/efr_test.py @@ -29,41 +29,43 @@ class EFRTest(parameterized.TestCase, absltest.TestCase): def setUp(self): - self._KUHN_GAME = pyspiel.load_game("kuhn_poker") - self._LEDUC_GAME = pyspiel.load_game("leduc_poker") - self._KUHN_3P_GAME = pyspiel.load_game("kuhn_poker(players=3)") - self._SHERIFF_GAME = pyspiel.load_game("sheriff") + self.kuhn_game = pyspiel.load_game("kuhn_poker") + self.leduc_game = pyspiel.load_game("leduc_poker") + self.kuhn_3p_game = pyspiel.load_game("kuhn_poker(players=3)") + self.sheriff_game = pyspiel.load_game("sheriff") - self._KUHN_UNIFORM_POLICY = policy.TabularPolicy(self._KUHN_GAME) - self._LEDUC_UNIFORM_POLICY = policy.TabularPolicy(self._LEDUC_GAME) + self.kuhn_uniform_policy = policy.TabularPolicy(self.kuhn_game) + self.leduc_uniform_policy = policy.TabularPolicy(self.leduc_game) - @parameterized.parameters(["blind action", "informed action", "blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"]) + @parameterized.parameters(["blind action", "informed action", "blind cf", + "informed cf","bps", "cfps", "csps", + "tips", "bhv"]) def test_policy_zero_is_uniform(self, deviations_name): # We use Leduc and not Kuhn, because Leduc has illegal actions and Kuhn does # not. cfr_solver = efr.EFRSolver( - game=self._LEDUC_GAME, + game=self.leduc_game, deviations_name=deviations_name ) np.testing.assert_array_equal( - self._LEDUC_UNIFORM_POLICY.action_probability_array, + self.leduc_uniform_policy.action_probability_array, cfr_solver.current_policy().action_probability_array) np.testing.assert_array_equal( - self._LEDUC_UNIFORM_POLICY.action_probability_array, + self.leduc_uniform_policy.action_probability_array, cfr_solver.average_policy().action_probability_array) @parameterized.parameters( ["blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"]) def test_efr_kuhn_poker(self, deviations_name): efr_solver = efr.EFRSolver( - game=self._KUHN_GAME, + game=self.kuhn_game, deviations_name=deviations_name ) for _ in range(300): efr_solver.evaluate_and_update_policy() average_policy = efr_solver.average_policy() average_policy_values = expected_game_score.policy_value( - self._KUHN_GAME.new_initial_state(), [average_policy] * 2) + self.kuhn_game.new_initial_state(), [average_policy] * 2) # 1/18 is the Nash value. 
See https://en.wikipedia.org/wiki/Kuhn_poker np.testing.assert_allclose( average_policy_values, [-1 / 18, 1 / 18], atol=1e-3) @@ -72,7 +74,7 @@ def test_efr_kuhn_poker(self, deviations_name): ["blind cf", "informed cf", "bps", "cfps", "csps", "tips", "bhv"]) def test_efr_kuhn_poker_3p(self, deviations_name): efr_solver = efr.EFRSolver( - game=self._KUHN_3P_GAME, + game=self.kuhn_3p_game, deviations_name=deviations_name ) strategies = [] @@ -84,26 +86,25 @@ def test_efr_kuhn_poker_3p(self, deviations_name): strategies.append(policy.python_policy_to_pyspiel_policy( efr_solver.current_policy())) corr_dev = pyspiel.uniform_correlation_device(strategies) - cce_dist_info = pyspiel.cce_dist(self._KUHN_3P_GAME, corr_dev) + cce_dist_info = pyspiel.cce_dist(self.kuhn_3p_game, corr_dev) corr_dist_values.append(cce_dist_info.dist_value) self.assertLess(corr_dist_values[-1], corr_dist_values[0]) - @parameterized.parameters( ["blind cf", "bps", "tips"]) def test_efr_cce_dist_sheriff(self, deviations_name): efr_solver = efr.EFRSolver( - game=self._SHERIFF_GAME, + game=self.sheriff_game, deviations_name=deviations_name - ) + ) strategies = [] corr_dist_values = [] for _ in range(5): efr_solver.evaluate_and_update_policy() strategies.append(policy.python_policy_to_pyspiel_policy( - efr_solver.current_policy())) + efr_solver.current_policy())) corr_dev = pyspiel.uniform_correlation_device(strategies) - cce_dist_info = pyspiel.cce_dist(self._SHERIFF_GAME, corr_dev) + cce_dist_info = pyspiel.cce_dist(self.sheriff_game, corr_dev) corr_dist_values.append(cce_dist_info.dist_value) self.assertLess(corr_dist_values[-1], corr_dist_values[0]) if __name__ == "__main__": From bf15f16a536c9a7936d528329c9e413badc02746 Mon Sep 17 00:00:00 2001 From: jameswflynn Date: Sun, 14 Jan 2024 01:38:45 +0000 Subject: [PATCH 16/18] EFR linting --- open_spiel/python/algorithms/efr.py | 513 ++++++++++++++++++---------- 1 file changed, 327 insertions(+), 186 deletions(-) diff --git a/open_spiel/python/algorithms/efr.py b/open_spiel/python/algorithms/efr.py index ce4cfa0805..3ab0149e80 100644 --- a/open_spiel/python/algorithms/efr.py +++ b/open_spiel/python/algorithms/efr.py @@ -12,20 +12,23 @@ # See the License for the specific language governing permissions and # limitations under the License. # Modified: 2023 James Flynn -# Original: https://github.com/deepmind/open_spiel/blob/master/open_spiel/python/algorithms/cfr.py +# Original: +# https://github.com/deepmind/open_spiel/blob/master/open_spiel/python/algorithms/cfr.py """Python implementation of the extensive-form regret minimization algorithm. -See: "Efficient Deviation Types and Learning for Hindsight Rationality in Extensive-Form Games", -Morrill et al. 2021b, -https://arxiv.org/abs/2102.06973 +See: "Efficient Deviation Types and Learning + for Hindsight Rationality in Extensive-Form Games", + Morrill et al. 2021b, + https://arxiv.org/abs/2102.06973 One iteration of EFR consists of: 1) Compute current strategy from regrets (e.g. using Regret Matching). 2) Compute values using the current strategy 3) Compute regrets from these values -The average policy converges to a Nash Equilibrium rather than the current policy. +The average policy converges to a Nash Equilibrium +rather than the current policy. 
""" import copy from collections import defaultdict @@ -49,20 +52,21 @@ class _InfoStateNode(object): # Player -> state -> action -> prob current_history_probs = attr.ib() - # An array representing the preceeding actions played upto this information state + # An array representing the preceeding actions played + # upto this information state history = attr.ib() cumulative_regret = attr.ib(factory=lambda: defaultdict(float)) - #The sum of all prior iteration's policies + # The sum of all prior iteration's policies cumulative_policy = attr.ib(factory=lambda: defaultdict(float)) - #A dictionary mapping each deviation to their "y values" for the current iteration + # A dictionary mapping each deviation to their "y values" + # for the current iteration y_values = attr.ib(factory=lambda: defaultdict(float)) class _EFRSolverBase(object): - """The base EFR solver class - + """The base EFR solver class The main iteration loop is implemented in `evaluate_and_update_policy`: ```python game = pyspiel.load_game("game_name") @@ -78,8 +82,12 @@ def __init__(self, game, deviation_gen): """Initializer. Args: game: The `pyspiel.Game` to run on. - deviation_gen: a function that accepts (num_actions : int, history : , prior_legal_actions) and returns a list containing `LocalDeviationWithTimeSelection` objects of the - the realisable deviations of a described type (e.g blind causal deviations) and given the information state described by the function parameters. + deviation_gen: a function that accepts (num_actions : int, + history : , prior_legal_actions) + and returns a list containing`LocalDeviationWithTimeSelection` objects + of the realisable deviations of a described type + (e.g blind causal deviations) and given the information state described + by the function parameters. """ # pyformat: enable assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, ( @@ -100,15 +108,19 @@ def __init__(self, game, deviation_gen): hist = {player: [] for player in range(self._num_players)} empty_path_indices = [[] for _ in range(self._num_players)] - self._initialize_info_state_nodes(self._root_node, hist, empty_path_indices) + self._initialize_info_state_nodes(self._root_node, + hist, empty_path_indices) self._iteration = 1 # For possible linear-averaging. def return_cumulative_regret(self): - """Returns a dictionary mapping every information state to its associated regret (accumulated over all iterations). + """Returns a dictionary mapping every information state + to its associated regret (accumulated over all iterations). """ - return {list(self._info_state_nodes.keys())[i]: list(self._info_state_nodes.values())[i].cumulative_regret - for i in range(len(self._info_state_nodes.keys()))} + return {list(self._info_state_nodes.keys())[i]: + list(self._info_state_nodes.values())[i].cumulative_regret + for i in range(len(self._info_state_nodes.keys())) + } def current_policy(self): """Returns the current policy as a TabularPolicy. @@ -125,8 +137,9 @@ def average_policy(self): WARNING: The same object, updated in-place will be returned! You can copy it (or its `action_probability_array` field). - This average policy converges to a equilibrium policy as the number of iterations - increases (equilibrium type depends on learning deviations used). + This average policy converges to a equilibrium policy as the number + of iterations increases (equilibrium type depends on learning + deviations used). The policy is computed using the accumulated policy probabilities computed using `evaluate_and_update_policy`. 
@@ -141,7 +154,7 @@ def average_policy(self): def _initialize_info_state_nodes(self, state, history, path_indices): """Initializes info_state_nodes. - Create one _InfoStateNode per infoset. We could also initialize the node + Create one _InfoStateNode per infoset. We could also initialize the node when we try to access it and it does not exist. Generates all deviations that are realisable at this state and stores @@ -149,11 +162,12 @@ def _initialize_info_state_nodes(self, state, history, path_indices): and calculate the memory reach probability for each deviation. Args: - state: The current state in the tree traversal. This should be the root node - when we call this function from the EFR solver. - history: an arrays of the preceeding actions taken prior to the state for each player. - path_indices: a 3d array [player number]x[preceeding state]x[legal actions for state, - index of the policy for this state in TabularPolicy]. + state: The current state in the tree traversal. This should be the + root node when we call this function from the EFR solver. + history: an arrays of the preceeding actions taken prior to the state + for each player. + path_indices: a 3d array [player number]x[preceeding state]x[legal actions + for state, index of the policy for this state in TabularPolicy]. """ if state.is_terminal(): return @@ -171,7 +185,8 @@ def _initialize_info_state_nodes(self, state, history, path_indices): legal_actions = state.legal_actions(current_player) info_state_node = _InfoStateNode( legal_actions=legal_actions, - index_in_tabular_policy=self._current_policy.state_lookup[info_state], + index_in_tabular_policy=\ + self._current_policy.state_lookup[info_state], relizable_deviations=None, history=history[current_player].copy(), current_history_probs=copy.deepcopy( @@ -184,7 +199,8 @@ def _initialize_info_state_nodes(self, state, history, path_indices): prior_possible_actions.append(info_state_node.legal_actions) info_state_node.relizable_deviations = self._deviation_gen(len( - info_state_node.legal_actions), info_state_node.history, prior_possible_actions) + info_state_node.legal_actions), info_state_node.history, + prior_possible_actions) self._info_state_nodes[info_state] = info_state_node legal_actions = state.legal_actions(current_player) @@ -195,19 +211,24 @@ def _initialize_info_state_nodes(self, state, history, path_indices): [legal_actions, info_state_node.index_in_tabular_policy]) new_history = copy.deepcopy(history) new_history[current_player].append(action) - assert len(new_history[current_player]) == len(new_path_indices[current_player]) + assert (len(new_history[current_player]) == + len(new_path_indices[current_player])) - self._initialize_info_state_nodes(state.child(action), new_history, new_path_indices) + self._initialize_info_state_nodes(state.child(action), new_history, + new_path_indices) def _update_current_policy(self, state, current_policy): - """Updated in order so that memory reach probs are defined wrt to the new strategy - Note that the function is called recursively (first call should be the root). Additionally, - to update the strategy for a given state we require the (t+1)th strategy for all prior states. + """Updated in order so that memory reach probs are defined wrt + to the new strategy. + Note that the function is called recursively (first call should + be the root). + Additionally, to update the strategy for a given state we require + the (t+1)th strategy for all prior states. Args: state: the state of which to update the strategy. 
- current_policy: the (t+1)th strategy that is being recursively computed, see the function - description for more detail. + current_policy: the (t+1)th strategy that is being recursively computed, + see the function description for more detail. """ if state.is_terminal(): @@ -220,13 +241,18 @@ def _update_current_policy(self, state, current_policy): for devation in range(len(deviations)): mem_reach_probs = create_probs_from_index( info_state_node.current_history_probs, current_policy) - deviation_reach_prob = deviations[devation].player_deviation_reach_probability( - mem_reach_probs) - info_state_node.y_values[deviations[devation]] = info_state_node.y_values[deviations[devation]] + max( - 0, info_state_node.cumulative_regret[devation])*deviation_reach_prob + #TODO + deviation_reach_prob =\ + deviations[devation].\ + player_deviation_reach_probability(mem_reach_probs) + info_state_node.y_values[deviations[devation]] =\ + info_state_node.y_values[deviations[devation]] +\ + max(0, info_state_node.cumulative_regret[devation])*\ + deviation_reach_prob state_policy = current_policy.policy_for_key(info_state) - for action, value in self._regret_matching(info_state_node.legal_actions, info_state_node).items(): + for action, value in self._regret_matching(info_state_node.legal_actions, + info_state_node).items(): state_policy[action] = value for action in info_state_node.legal_actions: @@ -237,10 +263,12 @@ def _update_current_policy(self, state, current_policy): new_state = state.child(action) self._update_current_policy(new_state, current_policy) - # Path to state probability ignores chance probabilty as this is stored as new_reach_probabilities[-1] + # Path to state probability ignores chance probabilty as this is stored as + # new_reach_probabilities[-1] def _compute_cumulative_immediate_regret_for_player(self, state, policies, - reach_probabilities, player): - """Increments the immediate regrets and policy for `player` of + reach_probabilities, + player): + """Increments the immediate regrets and policy for `player` of all realisable deviations at this state. Args: state: The initial game state to analyze from. @@ -267,8 +295,9 @@ def _compute_cumulative_immediate_regret_for_player(self, state, policies, new_reach_probabilities = reach_probabilities.copy() new_reach_probabilities[-1] *= action_prob - state_value += action_prob * self._compute_cumulative_immediate_regret_for_player( - new_state, policies, new_reach_probabilities, player) + state_value += action_prob *\ + self._compute_cumulative_immediate_regret_for_player( + new_state, policies, new_reach_probabilities, player) return state_value current_player = state.current_player() @@ -301,20 +330,23 @@ def _compute_cumulative_immediate_regret_for_player(self, state, policies, reach_prob = reach_probabilities[current_player] for action in state.legal_actions(): action_prob = info_state_policy.get(action, 0.) 
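The name dispatch that follows is a long `if`/`elif` chain. As a design note, the same mapping can be expressed as a lookup table; the sketch below is hypothetical (the names `_DEVIATION_GENERATORS`, `_EXTERNAL_ONLY` and `lookup_deviation_generator` are not part of the patch), but the generator functions and accepted strings are the ones defined in this file:

```python
from open_spiel.python.algorithms import efr

# Hypothetical table equivalent of the if/elif dispatch in EFRSolver.__init__.
_DEVIATION_GENERATORS = {
    "blind action": efr.return_blind_action,
    "informed action": efr.return_informed_action,
    "blind cf": efr.return_blind_cf,
    "blind counterfactual": efr.return_blind_cf,
    "informed cf": efr.return_informed_cf,
    "informed counterfactual": efr.return_informed_cf,
    "bps": efr.return_blind_partial_sequence,
    "blind partial sequence": efr.return_blind_partial_sequence,
    "cfps": efr.return_cf_partial_sequence,
    "cf partial sequence": efr.return_cf_partial_sequence,
    "counterfactual partial sequence": efr.return_cf_partial_sequence,
    "csps": efr.return_cs_partial_sequence,
    "casual partial sequence": efr.return_cs_partial_sequence,
    "tips": efr.return_twice_informed_partial_sequence,
    "twice informed partial sequence": efr.return_twice_informed_partial_sequence,
    "bhv": efr.return_behavourial,
    "single target behavioural": efr.return_behavourial,
    "behavioural": efr.return_behavourial,
}
# Names whose deviation sets are purely external, enabling the fast path.
_EXTERNAL_ONLY = {"blind action", "blind cf", "blind counterfactual",
                  "bps", "blind partial sequence"}


def lookup_deviation_generator(deviations_name):
  """Returns (deviation generator, external_only flag) for a given name."""
  generator = _DEVIATION_GENERATORS.get(deviations_name)
  if generator is None:
    raise ValueError(f"Unsupported deviation set: {deviations_name!r}")
  return generator, deviations_name in _EXTERNAL_ONLY
```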
- info_state_node.cumulative_policy[action] = info_state_node.cumulative_policy[action] + \ - action_prob * reach_prob + info_state_node.cumulative_policy[action] =\ + info_state_node.cumulative_policy[action] + action_prob * reach_prob new_state = state.child(action) new_reach_probabilities = reach_probabilities.copy() assert action_prob <= 1 new_reach_probabilities[current_player] *= action_prob child_utility = self._compute_cumulative_immediate_regret_for_player( - new_state, policies=policies, reach_probabilities=new_reach_probabilities, player=player) + new_state, policies=policies, + reach_probabilities=new_reach_probabilities, + player=player) state_value += action_prob * child_utility children_utilities[action] = child_utility counterfactual_reach_prob = (np.prod( - reach_probabilities[:current_player]) * np.prod(reach_probabilities[current_player + 1:])) + reach_probabilities[:current_player]) * + np.prod(reach_probabilities[current_player + 1:])) state_value_for_player = state_value[current_player] deviations = info_state_node.relizable_deviations @@ -330,8 +362,8 @@ def _compute_cumulative_immediate_regret_for_player(self, state, policies, memory_reach_probs = create_probs_from_index( info_state_node.current_history_probs, self.current_policy()) - player_current_memory_reach_prob = deviation.player_deviation_reach_probability( - memory_reach_probs) + player_current_memory_reach_prob =\ + deviation.player_deviation_reach_probability(memory_reach_probs) deviation_regret = player_current_memory_reach_prob * \ ((devation_cf_value*counterfactual_reach_prob) - @@ -371,16 +403,20 @@ def __init__(self, game, deviations_name): """Initializer. Args: game: The `pyspiel.Game` to run on. - deviation_name: the name of the deviation type to use for accumulating regrets and calculating the strategy at the next timestep. - - Deviation types implemented are "blind action", "informed action", "blind cf", - "informed counterfactual", "blind partial sequence", "counterfactual partial sequence", - "casual partial sequence", "twice informed partial sequence", "single target behavioural". - See "Efficient Deviation Types and Learning for Hindsight Rationality in Extensive-Form Games" by D. Morrill et al. 2021b + deviation_name: the name of the deviation type to use for + accumulating regrets and calculating the strategy at the next timestep. + + Deviation types implemented are "blind action", "informed action", + "blind cf", "informed counterfactual", "blind partial sequence", + "counterfactual partial sequence", "casual partial sequence", + "twice informed partial sequence", "single target behavioural". + See "Efficient Deviation Types and Learning for Hindsight Rationality in + Extensive-Form Games" by D. Morrill et al. 2021b for the full definition of each type. 
""" - #external_only = True leads to a shortcut in the computation of the next timesteps strategy from the regrets + #external_only = True leads to a shortcut in the computation of the next + # timesteps strategy from the regrets external_only = False deviation_sets = None @@ -389,26 +425,34 @@ def __init__(self, game, deviations_name): external_only = True elif deviations_name == "informed action": deviation_sets = return_informed_action - elif deviations_name == "blind cf" or deviations_name == "blind counterfactual": + elif (deviations_name == "blind cf" or + deviations_name == "blind counterfactual"): deviation_sets = return_blind_cf external_only = True - elif deviations_name == "informed cf" or deviations_name == "informed counterfactual": + elif (deviations_name == "informed cf" or + deviations_name == "informed counterfactual"): deviation_sets = return_informed_cf - elif deviations_name == "bps" or deviations_name == "blind partial sequence": + elif (deviations_name == "bps" or + deviations_name == "blind partial sequence"): deviation_sets = return_blind_partial_sequence external_only = True - elif deviations_name == "cfps" or deviations_name == "cf partial sequence"\ - or deviations_name == "counterfactual partial sequence": + elif (deviations_name == "cfps" or + deviations_name == "cf partial sequence" or + deviations_name == "counterfactual partial sequence"): deviation_sets = return_cf_partial_sequence - elif deviations_name == "csps" or deviations_name == "casual partial sequence": + elif (deviations_name == "csps" or + deviations_name == "casual partial sequence"): deviation_sets = return_cs_partial_sequence - elif deviations_name == "tips" or deviations_name == "twice informed partial sequence": + elif (deviations_name == "tips" or + deviations_name == "twice informed partial sequence"): deviation_sets = return_twice_informed_partial_sequence - elif deviations_name == "bhv" or deviations_name == "single target behavioural"\ - or deviations_name == "behavioural": + elif (deviations_name == "bhv" or + deviations_name == "single target behavioural" or + deviations_name == "behavioural"): deviation_sets = return_behavourial else: - raise ValueError("Unsupported Deviation Set Passed As Constructor Argument") + raise ValueError("Unsupported Deviation Set Passed As\ + Constructor Argument") super(EFRSolver, self).__init__(game, deviation_sets) self._external_only = external_only @@ -425,8 +469,8 @@ def _regret_matching(self, legal_actions, info_set_node): z = sum(info_set_node.y_values.values()) info_state_policy = {} - # The fixed point solution can be directly obtained through the weighted regret matrix - # if only external deviations are used + # The fixed point solution can be directly obtained through the + # weighted regret matrix if only external deviations are used. if self._external_only and z > 0: weighted_deviation_matrix = np.zeros( (len(legal_actions), len(legal_actions))) @@ -437,8 +481,10 @@ def _regret_matching(self, legal_actions, info_set_node): for index in range(len(legal_actions)): info_state_policy[legal_actions[index]] = new_strategy[index] - # Full regret matching by finding the least squares solution to the fixed point - # Last row of matrix and the column entry ensures the solution is a strategy (otherwise would have to normalise) + # Full regret matching by finding the least squares solution to the + # fixed point of the EFR regret matching function. + # Last row of matrix and the column entry minimises the solution + # towards a strategy. 
elif z > 0: num_actions = len(info_set_node.legal_actions) weighted_deviation_matrix = -np.eye(num_actions) @@ -497,7 +543,8 @@ def _update_average_policy(average_policy, info_state_nodes): def strat_dict_to_array(strategy_dictionary): """ - A helper function to convert the strategy dictionary action -> prob value to an array. + A helper function to convert the strategy dictionary mapping + action -> prob value to an array. Args: strategy_dictionary: a dictionary action -> prob value. Returns: @@ -512,7 +559,8 @@ def strat_dict_to_array(strategy_dictionary): def array_to_strat_dict(strategy_array, legal_actions): """ - A helper function to convert a strategy array to an action -> prob value dictionary. + A helper function to convert a strategy array to an + action -> prob value dictionary. Args: strategy_array: an array with the ith action's value at the i-1th index. legal_actions: the list of all legal actions at the current state. @@ -539,14 +587,16 @@ def create_probs_from_index(indices, current_policy): # Deviation set definitions def return_blind_action(num_actions, history, _): """ - Returns an array of all Blind Action deviations with respect to an information set. + Returns an array of all Blind Action deviations with respect to an + information set. Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior actions played by the `player` to reach the information set. + num_actions: the integer of all actions that can be taken at that + information set. + history: an array containing the prior actions played by the `player` + to reach the information set. Returns: - an array of LocalDeviationWithTimeSelection objects that represent all Blind Action deviations - that are realizable at the - information set. + an array of LocalDeviationWithTimeSelection objects that represent all + Blind Action deviations that are realizable at the information set. """ memory_weights = [np.full(len(history), 1)] prior_actions_in_memory = history @@ -556,52 +606,77 @@ def return_blind_action(num_actions, history, _): def return_informed_action(num_actions, history, _): """ - Returns an array of all Informed Action deviations with respect to an information set. + Returns an array of all Informed Action deviations with respect to an + information set. Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior actions played by the `player` to reach the information set. + num_actions: the integer of all actions that can be taken at that + information set. + history: an array containing the prior actions played by the `player` + to reach the information set. Returns: - an array of LocalDeviationWithTimeSelection objects that represent all Informed Action deviations that are realizable at the - information set. + an array of LocalDeviationWithTimeSelection objects that represent all + Informed Action deviations that are realizable at the information set. """ memory_weights = [np.full(len(history), 1)] prior_actions_in_memory = history - return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory) + return return_all_non_identity_internal_deviations(num_actions, + memory_weights, + prior_actions_in_memory) def return_blind_cf(num_actions, history, _): """ - Returns an array of all Blind Counterfactual deviations with respect to an information set. 
- Note: EFR using only Blind Counterfactual deviations is equivalent to vanilla Counterfactual - Regret Minimisation (CFR). + Returns an array of all Blind Counterfactual deviations with respect to an + information set. + Note: EFR using only Blind Counterfactual deviations is equivalent + to vanilla Counterfactual Regret Minimisation (CFR). Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior actions played by the `player` to reach the information set. + num_actions: the integer of all actions that can be taken at that + information set. + history: an array containing the prior actions played by the `player` + to reach the information set. Returns: - an array of LocalDeviationWithTimeSelection objects that represent all Blind CF deviations - that are realizable at the information set. + an array of LocalDeviationWithTimeSelection objects that represent all + Blind CF deviations that are realizable at the information set. """ memory_weights = [None] prior_actions_in_memory = np.zeros(len(history)) - return return_all_external_deviations(num_actions, memory_weights, prior_actions_in_memory) + return return_all_external_deviations(num_actions, memory_weights, + prior_actions_in_memory) def return_informed_cf(num_actions, history, _): + """ + Returns an array of all Informed Counterfactual deviations with respect + to an information set. + Args: + num_actions: the integer of all actions that can be taken at that + information set. + history: an array containing the prior actions played by the `player` + to reach the information set. + Returns: + an array of LocalDeviationWithTimeSelection objects that represent all + Informed CF deviations that are realizable at the information set. + """ memory_weights = [None] - prior_actions_in_memory = history - return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory) + prior_actions_in_memory = np.zeros(len(history)) + return return_all_non_identity_internal_deviations(num_actions, + memory_weights, + prior_actions_in_memory) def return_blind_partial_sequence(num_actions, history, _): """ - Returns an array of all Blind Partial Sequence deviations (BPS) - with respect to an information set + Returns an array of all Blind Partial Sequence deviations (BPS) + with respect to an information set. Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior actions played by the `player` to reach the information set. + num_actions: the integer of all actions that can be taken at that + information set. + history: an array containing the prior actions played by the `player` + to reach the information set. Returns: - an array of LocalDeviationWithTimeSelection objects that represent all BPS deviations - that are realizable at the information set. + an array of LocalDeviationWithTimeSelection objects that represent all + BPS deviations that are realizable at the information set. 
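The note above (EFR restricted to blind counterfactual deviations coincides with vanilla CFR) can be checked empirically. A sketch, assuming the standard `cfr` and `exploitability` modules; it compares convergence of the two average policies rather than asserting exact per-iteration equality:

```python
import pyspiel
from open_spiel.python.algorithms import cfr
from open_spiel.python.algorithms import efr
from open_spiel.python.algorithms import exploitability

game = pyspiel.load_game("kuhn_poker")
efr_solver = efr.EFRSolver(game=game, deviations_name="blind cf")
cfr_solver = cfr.CFRSolver(game)

for _ in range(100):
  efr_solver.evaluate_and_update_policy()
  cfr_solver.evaluate_and_update_policy()

print("EFR (blind cf):",
      exploitability.nash_conv(game, efr_solver.average_policy()))
print("CFR:           ",
      exploitability.nash_conv(game, cfr_solver.average_policy()))
```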
""" prior_actions_in_memory = history memory_weights = [None] @@ -611,7 +686,8 @@ def return_blind_partial_sequence(num_actions, history, _): possible_memory_weight = np.zeros(len(history)) possible_memory_weight[0:i] = np.full(i, 1.0) memory_weights.append(possible_memory_weight) - return return_all_external_deviations(num_actions, memory_weights, prior_actions_in_memory) + return return_all_external_deviations(num_actions, memory_weights, + prior_actions_in_memory) def return_cf_partial_sequence(num_actions, history, _): @@ -619,11 +695,13 @@ def return_cf_partial_sequence(num_actions, history, _): Returns an array of all Counterfactual Partial Sequence deviations (CFPS) with respect to an information set Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior actions played by the `player` to reach the information set. + num_actions: the integer of all actions that can be taken at that + information set. + history: an array containing the prior actions played by the `player` + to reach the information set. Returns: - an array of LocalDeviationWithTimeSelection objects that represent all CFPS deviations - that are realizable at the information set. + an array of LocalDeviationWithTimeSelection objects that represent + all CFPS deviations that are realizable at the information set. """ prior_actions_in_memory = history memory_weights = [None] @@ -633,20 +711,26 @@ def return_cf_partial_sequence(num_actions, history, _): possible_memory_weight = np.zeros(len(history)) possible_memory_weight[0:i] = np.full(i, 1.0) memory_weights.append(possible_memory_weight) - return return_all_non_identity_internal_deviations(num_actions, memory_weights, prior_actions_in_memory) + return return_all_non_identity_internal_deviations(num_actions, + memory_weights, + prior_actions_in_memory) def return_cs_partial_sequence(num_actions, history, prior_legal_actions): """ - Returns an array of all Casual Partial Sequence deviations with respect to an information set. + Returns an array of all Casual Partial Sequence deviations with respect to + an information set. Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior actions played by the `player` to reach the information set. - prior_legal_actions: a 2d array containing the legal actions for each preceeding state. + num_actions: the integer of all actions that can be taken at that + information set + history: an array containing the prior actions played by the `player` + to reach the information set. + prior_legal_actions: a 2d array containing the legal actions for each + preceeding state. Returns: - an array of LocalDeviationWithTimeSelection objects that represent all + an array of LocalDeviationWithTimeSelection objects that represent all Casual Partial Sequence deviations that are realizable at the - information set. + information set. 
""" prior_actions_in_memory = history external_memory_weights = [None] @@ -657,7 +741,8 @@ def return_cs_partial_sequence(num_actions, history, prior_legal_actions): external_memory_weights.append(possible_memory_weight) external = return_all_external_modified_deviations( - num_actions, external_memory_weights, prior_legal_actions, prior_actions_in_memory) + num_actions, external_memory_weights, prior_legal_actions, + prior_actions_in_memory) internal = return_blind_action(num_actions, history, None) cf_ext = return_informed_cf(num_actions, history, None) @@ -666,16 +751,22 @@ def return_cs_partial_sequence(num_actions, history, prior_legal_actions): return np.concatenate((external, internal, cf_ext, cf_int)) -def return_cs_partial_sequence_orginal(num_actions, history, prior_legal_actions): +def return_cs_partial_sequence_orginal(num_actions, history, + prior_legal_actions): """ - Returns an array of all Casual Partial Sequence deviations with respect to an information set. + Returns an array of all Casual Partial Sequence deviations with respect to + an information set. Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior actions played by the `player` to reach the information set. - prior_legal_actions: a 2d array containing the legal actions for each preceeding state. + num_actions: the integer of all actions that can be taken at that + information set + history: an array containing the prior actions played by the `player` + to reach the information set. + prior_legal_actions: a 2d array containing the legal actions for each + preceeding state. Returns: - an array of LocalDeviationWithTimeSelection objects that represent all - Casual Partial Sequence deviations that are realizable at the information set. + an array of LocalDeviationWithTimeSelection objects that represent all + Casual Partial Sequence deviations that are realizable at the + information set. """ prior_actions_in_memory = history external_memory_weights = [None] @@ -686,24 +777,29 @@ def return_cs_partial_sequence_orginal(num_actions, history, prior_legal_actions external_memory_weights.append(possible_memory_weight) external = return_all_external_modified_deviations( - num_actions, external_memory_weights, prior_legal_actions, prior_actions_in_memory) + num_actions, external_memory_weights, prior_legal_actions, + prior_actions_in_memory) internal = return_informed_action(num_actions, history, None) cf_ext = return_informed_cf(num_actions, history, None) return np.concatenate((external, internal, cf_ext)) -def return_twice_informed_partial_sequence(num_actions, history, prior_legal_actions): +def return_twice_informed_partial_sequence(num_actions, history, + prior_legal_actions): """ - Returns an array of all Twice Informed Partial Sequence (TIPS) deviations + Returns an array of all Twice Informed Partial Sequence (TIPS) deviations with respect to an information set. Args: - num_actions: the integer of all actions that can be taken at that information set - history: an array containing the prior actions played by the `player` to reach the information set. - prior_legal_actions: a 2d array containing the legal actions for each preceeding state. + num_actions: the integer of all actions that can be taken at that + information set + history: an array containing the prior actions played by the `player` + to reach the information set. + prior_legal_actions: a 2d array containing the legal actions for each + preceeding state. 
Returns: - an array of LocalDeviationWithTimeSelection objects that represent all TIPS deviations that are realizable at the - information set. + an array of LocalDeviationWithTimeSelection objects that represent + all TIPS deviations that are realizable at theinformation set. """ prior_actions_in_memory = history memory_weights = [None] @@ -714,7 +810,8 @@ def return_twice_informed_partial_sequence(num_actions, history, prior_legal_act memory_weights.append(possible_memory_weight) internal = return_all_internal_modified_deviations( - num_actions, memory_weights, prior_legal_actions, prior_actions_in_memory) + num_actions, memory_weights, prior_legal_actions, + prior_actions_in_memory) cf_int = return_informed_cf(num_actions, history, None) return np.concatenate((internal, cf_int)) @@ -723,10 +820,12 @@ def return_twice_informed_partial_sequence(num_actions, history, prior_legal_act def generate_all_action_permutations(current_stem, remaining_actions): """ Args: - current_stem: the prior sequence of actions to be completed by the remaining actions + current_stem: the prior sequence of actions to be completed by the + remaining actions remaining_actions: a 2d array of [subsequent states]x[possible actions] Returns: - An array with each element being the current stem joined with a possible permuation of remaining actions + An array with each element being the current stem joined with a possible + permuation of remaining actions """ if len(remaining_actions) == 0: return [np.array(current_stem)] @@ -763,7 +862,8 @@ def return_behavourial(num_actions, history, prior_legal_actions): (np.ones(deviation_info), np.zeros(len(history) - deviation_info))) for prior_memory_actions in prior_possible_memory_actions: prior_memory_actions = np.concatenate( - (prior_memory_actions, np.zeros(len(history) - len(prior_memory_actions)))) + (prior_memory_actions, np.zeros(len(history) - + len(prior_memory_actions)))) for i in range(len(history) - len(prior_memory_actions)): prior_memory_actions.append(0) prior_memory_actions_cp = prior_memory_actions.copy() @@ -777,15 +877,19 @@ def return_behavourial(num_actions, history, prior_legal_actions): class LocalDeviationWithTimeSelection(object): """" - Comprised of a swap transformation that will be applied at the current information state, a memory weighting - which describes the which actions are remembered and the memory action history (prior_memory_actions) that is remembered. - Note that the "memory action history" might not equal the history in the case of some deviation types (e.g tips deviations). + Comprised of a swap transformation that will be applied at the + current information state, a memory weighting which describes + the actions that are remembered and the memory action history + (prior_memory_actions) that is remembered. + Note that the "memory action history" might not equal the history in + the case of some deviation types (e.g tips deviations). """ - #The swap transformation that will be compared to the unmodified strategy. - #The transformation is applied at the memory state. + # The swap transformation that will be compared to the unmodified strategy. + # The transformation is applied at the memory state. local_swap_transform = attr.ib() - # Which actions have been forgotten (0) or remembered (1) according to the memory state + # Which actions have been forgotten (0) or remembered (1) according + # to the memory state. 
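The forgotten (0) / remembered (1) weighting just described is what `player_deviation_reach_probability` multiplies together further down: a remembered action contributes the probability the current strategy assigns to it, a forgotten one contributes 1. A hand-worked sketch of that product, not a call into the class:

```python
import numpy as np

prior_actions_weight = np.array([1, 0, 1])   # remembered, forgotten, remembered
prior_memory_actions = [0, 1, 1]             # actions recorded in the memory state
# Current strategy at each of the player's three prior infostates.
strategies = [np.array([0.6, 0.4]),
              np.array([0.5, 0.5]),
              np.array([0.2, 0.8])]

factors = [
    strategies[i][prior_memory_actions[i]] if prior_actions_weight[i] else 1.0
    for i in range(len(prior_actions_weight))
]
memory_reach_prob = np.prod(factors)
print(memory_reach_prob)                     # 0.6 * 1.0 * 0.8 -> 0.48
```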
prior_actions_weight = attr.ib() # Which actions have been take according to the memory state @@ -793,23 +897,28 @@ class LocalDeviationWithTimeSelection(object): use_unmodified_history = attr.ib() - def __init__(self, target, source, num_actions, prior_actions_weight, prior_memory_actions, - is_external, use_unmodified_history=True): + def __init__(self, target, source, num_actions, prior_actions_weight, + prior_memory_actions, is_external, use_unmodified_history=True): """" - Represents a swap transformation (both external and internal) for a given memory state. + Represents a swap transformation (either external and internal) + for a given memory state. Args: target: the action that will be played when the deviation is triggered. - source: the action that will trigger the target action if (used only by internal deviations, i.e is_external = False). - num_actions: the integer of actions that can be played for this information state - prior_actions_weight: an array the length of the history of the information state - actions have been forgotten (0) or remembered (1) according to the memory state. - This is represented numerically for possible experimentation with partially forgotten - actions (i.e in the range (0,1)). - prior_memory_actions: the preceeding actions upto the the information state + source: the action that will trigger the target action when suggested + (used only by internal deviations, i.e is_external = False). + num_actions: the number of actions that can be played for this + information state. + prior_actions_weight: an array (the length of the game history) + of the information state actions have been forgotten (0) + or remembered (1) wrt to the memory state. + This is represented numerically for possible experimentation with + "partially forgotten" actions (i.e in the range (0,1)). + prior_memory_actions: the preceeding actions upto the the information state (which the LocalDeviationWithTimeSelection is defined with respect to). - is_external: a boolean use to determine whether this is an internal or external type deviation. - use_unmodified_history: a boolean used to indicate whether the provided memory_actions are the same as - the information state it was derived from. + is_external: a boolean use to determine whether this is an + internal or external deviation. + use_unmodified_history: a boolean used to indicate whether the provided + memory_actions are the same as the information state it was derived from. """ self.local_swap_transform = LocalSwapTransform( target, source, num_actions, is_external=is_external) @@ -817,10 +926,12 @@ def __init__(self, target, source, num_actions, prior_actions_weight, prior_memo self.prior_memory_actions = prior_memory_actions self.use_unmodified_history = use_unmodified_history - # If a pure strategy, a pure strategy will be returned (aka function works for both actions and strategies as input) + # If a pure strategy, a pure strategy will be returned (aka function works + # for both actions and strategies as input). def deviate(self, strategy): """ - Returns the strategy array given by deviating according to the 'self.local_swap_transform.matrix_transform' matrix. + Returns the strategy array given by deviating according to the + 'self.local_swap_transform.matrix_transform' matrix. Args: strategy: the strategy array to deviate from. 
Returns: @@ -834,29 +945,34 @@ def return_transform_matrix(self): """ return self.local_swap_transform.matrix_transform - def player_deviation_reach_probability(self, prior_possible_action_probabilities): + def player_deviation_reach_probability(self, + prior_possible_action_probabilities): """ Calculate the probability of reaching the current memory state provided the - player played from the start of the game to this state. This is assuming that they play - with their current strategy with the deviation applied. + player played from the start of the game to this state. This is assuming + that they play with their current strategy with the deviation applied. Args: - prior_possible_action_probabilities: a 2d array of length - [player's history]x[number of actions at that state]. These are the current strategies of - the player, from start to end of their history. + prior_possible_action_probabilities: a 2d array of length + [player's history]x[number of actions at that state]. + These are the current strategies of the player, + from start to end of their history. Returns: The reach probability of the current memory state. """ - if self.prior_actions_weight is None or self.prior_memory_actions is None or prior_possible_action_probabilities is None: + if (self.prior_actions_weight is None or self.prior_memory_actions is None + or prior_possible_action_probabilities is None): return 1.0 memory_action_probabilities = np.ones(len(self.prior_actions_weight)) - #Reconstruct memory probabilities from history provided to the deviation to reach info set and the current memory probs + # Reconstruct memory probabilities from history provided to the deviation + # to reach info set and the current memory probs. memory_weightings = self.prior_actions_weight.copy() if self.use_unmodified_history: for state in range(len(self.prior_memory_actions)): if not self.prior_actions_weight[state] == 0: memory_action_probabilities[state] = ( - prior_possible_action_probabilities[state][self.prior_memory_actions[state]]) + prior_possible_action_probabilities[state] + [self.prior_memory_actions[state]]) else: memory_action_probabilities[state] = 1 memory_weightings[state] = 1 @@ -875,17 +991,23 @@ def __eq__(self, other): def __hash__(self): return hash(self.local_swap_transform) -def return_all_non_identity_internal_deviations(num_actions, possible_prior_weights, prior_memory_actions): +def return_all_non_identity_internal_deviations(num_actions, + possible_prior_weights, + prior_memory_actions): deviations = [] for prior_actions_weight in possible_prior_weights: for target in range(num_actions): for source in range(num_actions): if not source == target: deviations.append(LocalDeviationWithTimeSelection( - target, source, num_actions, prior_actions_weight, prior_memory_actions, False)) + target, source, num_actions, prior_actions_weight, + prior_memory_actions, False)) return deviations -def return_all_internal_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions): +def return_all_internal_modified_deviations(num_actions, + possible_prior_weights, + possible_prior_memory_actions, + prior_memory_actions): deviations = [] for prior_actions_weight in possible_prior_weights: try: @@ -897,7 +1019,8 @@ def return_all_internal_modified_deviations(num_actions, possible_prior_weights, for source in range(num_actions): if not source == target: deviations.append(LocalDeviationWithTimeSelection( - target, source, num_actions, prior_actions_weight, prior_memory_actions, 
False)) + target, source, num_actions, prior_actions_weight, + prior_memory_actions, False)) else: previous_action = prior_memory_actions[modification_index] for alt_action in possible_prior_memory_actions[modification_index]: @@ -906,21 +1029,27 @@ def return_all_internal_modified_deviations(num_actions, possible_prior_weights, for source in range(num_actions): if not source == target: deviations.append(LocalDeviationWithTimeSelection( - target, source, num_actions, prior_actions_weight, prior_memory_actions.copy(), False)) + target, source, num_actions, prior_actions_weight, + prior_memory_actions.copy(), False)) prior_memory_actions[modification_index] = previous_action return deviations -def return_all_external_deviations(num_actions, possible_prior_weights, prior_memory_actions): +def return_all_external_deviations(num_actions, possible_prior_weights, + prior_memory_actions): deviations = [] for prior_actions_weight in possible_prior_weights: for target in range(num_actions): deviations.append(LocalDeviationWithTimeSelection( - target, target, num_actions, prior_actions_weight, prior_memory_actions, True)) + target, target, num_actions, prior_actions_weight, + prior_memory_actions, True)) return deviations # Modify last action as required -def return_all_external_modified_deviations(num_actions, possible_prior_weights, possible_prior_memory_actions, prior_memory_actions): +def return_all_external_modified_deviations(num_actions, + possible_prior_weights, + possible_prior_memory_actions, + prior_memory_actions): deviations = [] for prior_actions_weight in possible_prior_weights: try: @@ -930,19 +1059,22 @@ def return_all_external_modified_deviations(num_actions, possible_prior_weights, if modification_index == len(prior_memory_actions): for target in range(num_actions): deviations.append(LocalDeviationWithTimeSelection( - target, target, num_actions, prior_actions_weight, prior_memory_actions, True)) + target, target, num_actions, prior_actions_weight, + prior_memory_actions, True)) else: previous_action = prior_memory_actions[modification_index] for alt_action in possible_prior_memory_actions[modification_index]: prior_memory_actions[modification_index] = alt_action for target in range(num_actions): deviations.append(LocalDeviationWithTimeSelection( - target, target, num_actions, prior_actions_weight, prior_memory_actions.copy(), True)) + target, target, num_actions, prior_actions_weight, + prior_memory_actions.copy(), True)) prior_memory_actions[modification_index] = previous_action return deviations -def return_identity_deviation(num_actions, possible_prior_weights, prior_memory_actions): +def return_identity_deviation(num_actions, possible_prior_weights, + prior_memory_actions): deviations = [] for prior_actions_weight in possible_prior_weights: deviations.append(LocalDeviationWithTimeSelection( @@ -950,10 +1082,12 @@ def return_identity_deviation(num_actions, possible_prior_weights, prior_memory_ return deviations -# A swap transformation given by the matrix_transform for an information state of +# A swap transformation given by the matrix_transform for an information state. +# Of actions_num size. class LocalSwapTransform(object): """ - Represents a swap transformation (both external and internal) for an information state for a certain number of actions. + Represents a swap transformation (both external and internal) + for an information state for a certain number of actions. 
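To make the two kinds of swap transformation concrete, the sketch below applies `deviate` to a strategy at a 3-action infostate using the class defined in this patch. The expected outputs assume the external transform sends all probability mass to the target action and the internal transform reroutes the source action's mass to the target; treat the printed values as illustrative:

```python
import numpy as np
from open_spiel.python.algorithms import efr

strategy = np.array([0.5, 0.3, 0.2])

# External deviation: always play action 1, whatever was suggested.
external = efr.LocalSwapTransform(target=1, source=1, actions_num=3,
                                  is_external=True)
print(external.deviate(strategy))   # [0. 1. 0.]

# Internal deviation: whenever action 0 is suggested, play action 2 instead.
internal = efr.LocalSwapTransform(target=2, source=0, actions_num=3,
                                  is_external=False)
print(internal.deviate(strategy))   # [0.  0.3 0.7]
```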
""" source_action = attr.ib() target_action = attr.ib() @@ -963,12 +1097,15 @@ class LocalSwapTransform(object): def __init__(self, target, source, actions_num, is_external=True): """" - Creates the matrix transformation that describes the transformation and initalises the other variables. + Creates the matrix transformation that describes the swap transformation + and initalises variables. Args: - target: the action that will be played when the deviation is triggered - source: the action that will trigger the target action if (used only by internal deviations, i.e is_external = False) - num_actions: the integer of actions that can be played for this information state - is_external: a boolean used to determine whether to create an internal or external type deviation. + target: the action that will be played when the deviation is triggered. + source: the action that triggers a swap to the target action + (used only by internal deviations, i.e is_external = False) + num_actions: the number of actions that can be played for this + information state. + is_external: determine whether to create an internal or external deviation. """ self.source_action = source self.target_action = target @@ -983,24 +1120,28 @@ def __init__(self, target, source, actions_num, is_external=True): self.matrix_transform[source][source] = 0 def __repr__(self) -> str: - return "Diverting from Action: "+str(self.source_action) + " to Action: "+str(self.target_action) + return ("Diverting from Action: "+str(self.source_action) + + " to Action: "+str(self.target_action)) def __eq__(self, other: object) -> bool: - if self.source_action == other.source_action and self.target_action == other.target_action and self.actions_num == other.actions_num: + if (self.source_action == other.source_action and + self.target_action == other.target_action and + self.actions_num == other.actions_num): return True else: return False def __hash__(self): - separator = " " - return hash(str(self.source_action)+separator+str(self.target_action)+separator+str(self.actions_num) + separator + str(self.is_external)) + return hash(f"{str(self.source_action)} {str(self.target_action)} \ + {str(self.actions_num)} {str(self.is_external)}") def deviate(self, strategy): """ - Returns the strategy array given by deviating according to 'self.matrix_transform' matrix. + Returns the strategy array given by deviating according to + 'self.matrix_transform' matrix. Args: strategy: the strategy array to deviate from. Returns: - the matrix product of the the matrix_transform and the provided strategy. + the matrix product of the the matrix_transform and the provided strategy. """ return np.matmul(self.matrix_transform, strategy) From be4acecc9f4b96972d3d70e0036128abd351ea91 Mon Sep 17 00:00:00 2001 From: jameswflynn Date: Sun, 14 Jan 2024 20:10:21 +0000 Subject: [PATCH 17/18] Further linting --- open_spiel/python/algorithms/efr.py | 180 +++++++++++++--------------- 1 file changed, 85 insertions(+), 95 deletions(-) diff --git a/open_spiel/python/algorithms/efr.py b/open_spiel/python/algorithms/efr.py index 3ab0149e80..704aa359f4 100644 --- a/open_spiel/python/algorithms/efr.py +++ b/open_spiel/python/algorithms/efr.py @@ -30,6 +30,7 @@ The average policy converges to a Nash Equilibrium rather than the current policy. 
""" + import copy from collections import defaultdict import attr @@ -53,7 +54,7 @@ class _InfoStateNode(object): current_history_probs = attr.ib() # An array representing the preceeding actions played - # upto this information state + # upto this information state. history = attr.ib() cumulative_regret = attr.ib(factory=lambda: defaultdict(float)) @@ -61,7 +62,7 @@ class _InfoStateNode(object): cumulative_policy = attr.ib(factory=lambda: defaultdict(float)) # A dictionary mapping each deviation to their "y values" - # for the current iteration + # for the current iteration. y_values = attr.ib(factory=lambda: defaultdict(float)) @@ -78,6 +79,7 @@ class _EFRSolverBase(object): solver.average_policy() # Access the average policy ``` """ + def __init__(self, game, deviation_gen): """Initializer. Args: @@ -92,8 +94,8 @@ def __init__(self, game, deviation_gen): # pyformat: enable assert game.get_type().dynamics == pyspiel.GameType.Dynamics.SEQUENTIAL, ( "EFR requires sequential games. If you're trying to run it " + - "on a simultaneous (or normal-form) game, please first transform it " + - "using turn_based_simultaneous_game.") + "on a simultaneous (or normal-form) game, please first transform it " + + "using turn_based_simultaneous_game.") self._game = game self._num_players = game.num_players() @@ -136,11 +138,11 @@ def average_policy(self): """Returns the average of all policies iterated. WARNING: The same object, updated in-place will be returned! You can copy it (or its `action_probability_array` field). - + This average policy converges to a equilibrium policy as the number of iterations increases (equilibrium type depends on learning deviations used). - + The policy is computed using the accumulated policy probabilities computed using `evaluate_and_update_policy`. @@ -160,7 +162,7 @@ def _initialize_info_state_nodes(self, state, history, path_indices): Generates all deviations that are realisable at this state and stores the history and preceeding state policy information to create memory states and calculate the memory reach probability for each deviation. - + Args: state: The current state in the tree traversal. This should be the root node when we call this function from the EFR solver. @@ -241,14 +243,15 @@ def _update_current_policy(self, state, current_policy): for devation in range(len(deviations)): mem_reach_probs = create_probs_from_index( info_state_node.current_history_probs, current_policy) - #TODO deviation_reach_prob =\ deviations[devation].\ player_deviation_reach_probability(mem_reach_probs) + y_increment = max(0, info_state_node.cumulative_regret[devation])*\ + deviation_reach_prob info_state_node.y_values[deviations[devation]] =\ - info_state_node.y_values[deviations[devation]] +\ - max(0, info_state_node.cumulative_regret[devation])*\ - deviation_reach_prob + info_state_node.y_values[deviations[devation]] +\ + y_increment + state_policy = current_policy.policy_for_key(info_state) for action, value in self._regret_matching(info_state_node.legal_actions, @@ -278,7 +281,7 @@ def _compute_cumulative_immediate_regret_for_player(self, state, policies, as a numpy array [prob for player 0, for player 1,..., for chance]. `reach_probabilities[player]` will work in all cases. player: The 0-indexed player to update the values for. If `None`, the - update for all players will be performed. + update for all players will be performed. 
Returns: The utility of `state` for all players, assuming all players follow the @@ -394,11 +397,11 @@ def evaluate_and_update_policy(self): class EFRSolver(_EFRSolver): - """ - Implements the EFR algorithm. + """Implements the EFR algorithm with several deviation types. See: https://arxiv.org/abs/2102.06973 """ + def __init__(self, game, deviations_name): """Initializer. Args: @@ -410,45 +413,41 @@ def __init__(self, game, deviations_name): "blind cf", "informed counterfactual", "blind partial sequence", "counterfactual partial sequence", "casual partial sequence", "twice informed partial sequence", "single target behavioural". + See "Efficient Deviation Types and Learning for Hindsight Rationality in Extensive-Form Games" by D. Morrill et al. 2021b for the full definition of each type. """ - #external_only = True leads to a shortcut in the computation of the next + # external_only = True leads to a shortcut in the computation of the next # timesteps strategy from the regrets external_only = False deviation_sets = None - if deviations_name == "blind action": + if deviations_name in {"blind action"}: deviation_sets = return_blind_action external_only = True - elif deviations_name == "informed action": + elif deviations_name in {"informed action"}: deviation_sets = return_informed_action - elif (deviations_name == "blind cf" or - deviations_name == "blind counterfactual"): + elif (deviations_name in {"blind cf", + "blind counterfactual"}): deviation_sets = return_blind_cf external_only = True - elif (deviations_name == "informed cf" or - deviations_name == "informed counterfactual"): + elif (deviations_name in {"informed cf", + "informed counterfactual"}): deviation_sets = return_informed_cf - elif (deviations_name == "bps" or - deviations_name == "blind partial sequence"): + elif (deviations_name in {"bps", "blind partial sequence"}): deviation_sets = return_blind_partial_sequence external_only = True - elif (deviations_name == "cfps" or - deviations_name == "cf partial sequence" or - deviations_name == "counterfactual partial sequence"): + elif (deviations_name in {"cfps", "cf partial sequence", + "counterfactual partial sequence"}): deviation_sets = return_cf_partial_sequence - elif (deviations_name == "csps" or - deviations_name == "casual partial sequence"): + elif (deviations_name in {"csps", "casual partial sequence"}): deviation_sets = return_cs_partial_sequence - elif (deviations_name == "tips" or - deviations_name == "twice informed partial sequence"): + elif (deviations_name in {"tips", "twice informed partial sequence"}): deviation_sets = return_twice_informed_partial_sequence - elif (deviations_name == "bhv" or - deviations_name == "single target behavioural" or - deviations_name == "behavioural"): + elif (deviations_name in {"bhv", "single target behavioural", + "behavioural"}): deviation_sets = return_behavourial else: raise ValueError("Unsupported Deviation Set Passed As\ @@ -460,7 +459,6 @@ def _regret_matching(self, legal_actions, info_set_node): """Returns an info state policy by applying regret-matching function over all deviations and time selection functions. Args: - legal_actions: the list of legal actions at this state. Returns: @@ -502,7 +500,7 @@ def _regret_matching(self, legal_actions, info_set_node): strategy = lstsq(weighted_deviation_matrix, b)[0] - # Adopt same clipping strategy as paper author's code + # Adopt same clipping strategy as paper author's code. 
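Separately from the least-squares branch above, when the deviation set is purely external (`self._external_only`) the fixed point needs no solve at all: every external transform sends all probability mass to its target action, so the y-weighted average of those matrices has identical columns and any single column is already the next strategy. A standalone sketch with hand-built matrices, illustrative only:

```python
import numpy as np

y_values = {0: 2.0, 1: 1.0, 2: 1.0}   # y-value per external deviation, by target
z = sum(y_values.values())
num_actions = 3

weighted = np.zeros((num_actions, num_actions))
for target, y in y_values.items():
  external = np.zeros((num_actions, num_actions))
  external[target] = 1.0              # "always play `target`"
  weighted += (y / z) * external

strategy = weighted[:, 0]             # all columns are equal, take the first
print(strategy)                       # [0.5  0.25 0.25]
print(weighted @ strategy)            # the same vector: a fixed point
```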
strategy[np.where(strategy < 0)] = 0 strategy[np.where(strategy > 1)] = 1 @@ -510,7 +508,7 @@ def _regret_matching(self, legal_actions, info_set_node): for index in range(len(strategy)): info_state_policy[info_set_node.legal_actions[index] ] = strategy[index] - # Use a uniform strategy as sum of all regrets is negative + # Use a uniform strategy as sum of all regrets is negative. else: for index in range(len(legal_actions)): info_state_policy[legal_actions[index]]\ @@ -542,8 +540,7 @@ def _update_average_policy(average_policy, info_state_nodes): def strat_dict_to_array(strategy_dictionary): - """ - A helper function to convert the strategy dictionary mapping + """A helper function to convert the strategy dictionary mapping action -> prob value to an array. Args: strategy_dictionary: a dictionary action -> prob value. @@ -558,8 +555,7 @@ def strat_dict_to_array(strategy_dictionary): def array_to_strat_dict(strategy_array, legal_actions): - """ - A helper function to convert a strategy array to an + """A helper function to convert a strategy array to an action -> prob value dictionary. Args: strategy_array: an array with the ith action's value at the i-1th index. @@ -586,8 +582,7 @@ def create_probs_from_index(indices, current_policy): # Deviation set definitions def return_blind_action(num_actions, history, _): - """ - Returns an array of all Blind Action deviations with respect to an + """Returns an array of all Blind Action deviations with respect to an information set. Args: num_actions: the integer of all actions that can be taken at that @@ -605,8 +600,7 @@ def return_blind_action(num_actions, history, _): def return_informed_action(num_actions, history, _): - """ - Returns an array of all Informed Action deviations with respect to an + """Returns an array of all Informed Action deviations with respect to an information set. Args: num_actions: the integer of all actions that can be taken at that @@ -625,8 +619,7 @@ def return_informed_action(num_actions, history, _): def return_blind_cf(num_actions, history, _): - """ - Returns an array of all Blind Counterfactual deviations with respect to an + """Returns an array of all Blind Counterfactual deviations with respect to an information set. Note: EFR using only Blind Counterfactual deviations is equivalent to vanilla Counterfactual Regret Minimisation (CFR). @@ -646,8 +639,7 @@ def return_blind_cf(num_actions, history, _): def return_informed_cf(num_actions, history, _): - """ - Returns an array of all Informed Counterfactual deviations with respect + """Returns an array of all Informed Counterfactual deviations with respect to an information set. Args: num_actions: the integer of all actions that can be taken at that @@ -666,8 +658,7 @@ def return_informed_cf(num_actions, history, _): def return_blind_partial_sequence(num_actions, history, _): - """ - Returns an array of all Blind Partial Sequence deviations (BPS) + """Returns an array of all Blind Partial Sequence deviations (BPS) with respect to an information set. 
Args: num_actions: the integer of all actions that can be taken at that @@ -691,8 +682,7 @@ def return_blind_partial_sequence(num_actions, history, _): def return_cf_partial_sequence(num_actions, history, _): - """ - Returns an array of all Counterfactual Partial Sequence deviations (CFPS) + """Returns an array of all Counterfactual Partial Sequence deviations (CFPS) with respect to an information set Args: num_actions: the integer of all actions that can be taken at that @@ -717,8 +707,7 @@ def return_cf_partial_sequence(num_actions, history, _): def return_cs_partial_sequence(num_actions, history, prior_legal_actions): - """ - Returns an array of all Casual Partial Sequence deviations with respect to + """Returns an array of all Casual Partial Sequence deviations with respect to an information set. Args: num_actions: the integer of all actions that can be taken at that @@ -753,8 +742,7 @@ def return_cs_partial_sequence(num_actions, history, prior_legal_actions): def return_cs_partial_sequence_orginal(num_actions, history, prior_legal_actions): - """ - Returns an array of all Casual Partial Sequence deviations with respect to + """Returns an array of all Casual Partial Sequence deviations with respect to an information set. Args: num_actions: the integer of all actions that can be taken at that @@ -787,8 +775,7 @@ def return_cs_partial_sequence_orginal(num_actions, history, def return_twice_informed_partial_sequence(num_actions, history, prior_legal_actions): - """ - Returns an array of all Twice Informed Partial Sequence (TIPS) deviations + """Returns an array of all Twice Informed Partial Sequence (TIPS) deviations with respect to an information set. Args: num_actions: the integer of all actions that can be taken at that @@ -818,7 +805,10 @@ def return_twice_informed_partial_sequence(num_actions, history, def generate_all_action_permutations(current_stem, remaining_actions): - """ + """ Return a List of all possible game continuations playing on from the + current stem and with playing from the set of remaining actions. + `current_stem` = "" generates all possible playthroughs from the current + information state. Args: current_stem: the prior sequence of actions to be completed by the remaining actions @@ -841,13 +831,23 @@ def generate_all_action_permutations(current_stem, remaining_actions): for i in prev_permutations: permutations.append(i) return permutations -# Includes identity def return_behavourial(num_actions, history, prior_legal_actions): - """ - [TODO] - """ + """Returns an array of all single target behavioural deviations + with respect to an information set. + Args: + num_actions: the integer of all actions that can be taken at that + information set + history: an array containing the prior actions played by the `player` + to reach the information set. + prior_legal_actions: a 2d array containing the legal actions for each + preceeding state. + Returns: + an array of LocalDeviationWithTimeSelection objects that represent + all (single target) behaviourial deviations that are realizable at the + information set. 
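The generators documented above share one interface: they take `(num_actions, history, prior_legal_actions)` and return `LocalDeviationWithTimeSelection` objects. A small sketch of exercising that interface for a toy two-action infostate; only the blind-CF and informed-action cases are spelled out, and the expected counts are illustrative:

```python
from open_spiel.python.algorithms import efr

num_actions = 2
history = [0, 1]      # the player acted twice before reaching this infostate

# Blind CF: one external deviation per action, with the whole history forgotten.
blind_cf = efr.return_blind_cf(num_actions, history, None)
print(len(blind_cf))                            # 2
print(blind_cf[0].return_transform_matrix())    # [[1. 1.]
                                                #  [0. 0.]]

# Informed action: one internal deviation per ordered (source, target) pair.
informed = efr.return_informed_action(num_actions, history, None)
print(len(informed))                            # 2
```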
+ """ deviations = [] if len(history) == 0: internal = return_all_non_identity_internal_deviations( @@ -875,15 +875,15 @@ def return_behavourial(num_actions, history, prior_legal_actions): return deviations -class LocalDeviationWithTimeSelection(object): - """" - Comprised of a swap transformation that will be applied at the +class LocalDeviationWithTimeSelection: + """" Comprised of a swap transformation that will be applied at the current information state, a memory weighting which describes the actions that are remembered and the memory action history (prior_memory_actions) that is remembered. Note that the "memory action history" might not equal the history in the case of some deviation types (e.g tips deviations). """ + # The swap transformation that will be compared to the unmodified strategy. # The transformation is applied at the memory state. local_swap_transform = attr.ib() @@ -899,8 +899,7 @@ class LocalDeviationWithTimeSelection(object): def __init__(self, target, source, num_actions, prior_actions_weight, prior_memory_actions, is_external, use_unmodified_history=True): - """" - Represents a swap transformation (either external and internal) + """" Represents a swap transformation (either external and internal) for a given memory state. Args: target: the action that will be played when the deviation is triggered. @@ -929,28 +928,27 @@ def __init__(self, target, source, num_actions, prior_actions_weight, # If a pure strategy, a pure strategy will be returned (aka function works # for both actions and strategies as input). def deviate(self, strategy): - """ - Returns the strategy array given by deviating according to the + """Returns the strategy array given by deviating according to the 'self.local_swap_transform.matrix_transform' matrix. Args: strategy: the strategy array to deviate from. Returns: - the matrix product of the the matrix_transform and the provided strategy. + the matrix product of the the matrix_transform and the provided strategy. """ return self.local_swap_transform.deviate(strategy) def return_transform_matrix(self): - """ - Returns the matrix_transform of the associated `LocalSwapTransform` object. + """Returns the matrix_transform of the associated `LocalSwapTransform` + object. """ return self.local_swap_transform.matrix_transform def player_deviation_reach_probability(self, prior_possible_action_probabilities): - """ - Calculate the probability of reaching the current memory state provided the - player played from the start of the game to this state. This is assuming - that they play with their current strategy with the deviation applied. + """Calculate the probability of reaching the current memory state + provided the player played from the start of the game to this state. + This is assuming that they play with their current strategy with the + deviation applied. Args: prior_possible_action_probabilities: a 2d array of length [player's history]x[number of actions at that state]. @@ -983,10 +981,7 @@ def player_deviation_reach_probability(self, return memory_reach_probability def __eq__(self, other): - if self.local_swap_transform == other.local_swap_transform: - return True - else: - return False + return self.local_swap_transform == other.local_swap_transform def __hash__(self): return hash(self.local_swap_transform) @@ -1084,11 +1079,11 @@ def return_identity_deviation(num_actions, possible_prior_weights, # A swap transformation given by the matrix_transform for an information state. # Of actions_num size. 
-class LocalSwapTransform(object): - """ - Represents a swap transformation (both external and internal) +class LocalSwapTransform: + """ Represents a swap transformation (both external and internal) for an information state for a certain number of actions. """ + source_action = attr.ib() target_action = attr.ib() matrix_transform = attr.ib() @@ -1096,8 +1091,7 @@ class LocalSwapTransform(object): is_external = attr.ib() def __init__(self, target, source, actions_num, is_external=True): - """" - Creates the matrix transformation that describes the swap transformation + """"Creates the matrix transformation describing the swap transformation and initalises variables. Args: target: the action that will be played when the deviation is triggered. @@ -1120,24 +1114,20 @@ def __init__(self, target, source, actions_num, is_external=True): self.matrix_transform[source][source] = 0 def __repr__(self) -> str: - return ("Diverting from Action: "+str(self.source_action) + + return ("Swapping from Action: "+str(self.source_action) + " to Action: "+str(self.target_action)) def __eq__(self, other: object) -> bool: - if (self.source_action == other.source_action and - self.target_action == other.target_action and - self.actions_num == other.actions_num): - return True - else: - return False + return (self.source_action == other.source_action and + self.target_action == other.target_action and + self.actions_num == other.actions_num) def __hash__(self): return hash(f"{str(self.source_action)} {str(self.target_action)} \ {str(self.actions_num)} {str(self.is_external)}") def deviate(self, strategy): - """ - Returns the strategy array given by deviating according to + """Returns the strategy array given by deviating according to 'self.matrix_transform' matrix. Args: strategy: the strategy array to deviate from. From 4847b6e06167aa408587487e9c86b410ae92d55c Mon Sep 17 00:00:00 2001 From: James Flynn Date: Mon, 13 May 2024 12:48:05 +0100 Subject: [PATCH 18/18] Updated CMakeLists.txt with efr_test.py --- open_spiel/python/CMakeLists.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/open_spiel/python/CMakeLists.txt b/open_spiel/python/CMakeLists.txt index 8e16fa86f6..2a070d6660 100644 --- a/open_spiel/python/CMakeLists.txt +++ b/open_spiel/python/CMakeLists.txt @@ -182,6 +182,7 @@ set(PYTHON_TESTS ${PYTHON_TESTS} algorithms/boltzmann_tabular_qlearner_test.py algorithms/cfr_br_test.py algorithms/cfr_test.py + algorithms/efr_test.py algorithms/evaluate_bots_test.py algorithms/expected_game_score_test.py algorithms/external_sampling_mccfr_test.py