We discussed the multi-armed bandit problem in class today. This is the starting point for reinforcement learning, or learning how to behave in some environment over time.
In the basic version of the multi-armed bandit problem, we have $N$ slot machines, each with a different, fixed probability distribution over the payout we receive each time we "pull" on it. We don't know how the machines behave going in, and our job is to maximize the payout over some number of pulls. In particular, we need to come up with a policy (a set of actions to take over time) that maximizes the reward (in this case, the cumulative sum of the money we get at each pull). The trick is, since we don't know anything about the bandits, we have to learn and update our policy over time as we explore and get more information about the value (long term behavior) of each machine.
Before we dive in, we should note that this is much more generally applicable than just slot machines! Here are some other examples where exploring an environment and learning a "multi-armed bandit" policy would apply, including some that people came up with in class:
| Scenario | Environment | Policy over time | Reward |
| --- | --- | --- | --- |
| Slot machines | Slot machines with different payouts | Which machine to pull | Money accrued |
| Fishing | Different densities of fish in different places | Where to fish | Total fish caught |
| Efficient driving | Performance of car | How fast to drive | Miles per gallon |
But for the rest of this example, we'll stick to the "one-armed bandit" analogy of a slot machine. First, let's define what a bandit is in code, and then we'll talk more about how to learn different policies for this problem.
We're going to use a class called "Bandit" to encapsulate what it means to be a bandit. For the probability distributions, we'll pick numbers uniformly at random between 0 and some maximum value for each bandit; the maximum value is the hidden parameter. We'll "seed" the maximum value so that it appears to be randomly different for each bandit, but so that we all get the same results each time we run the code. This is known as pseudorandomness; the short aside below demonstrates the idea, and then we'll define the class.
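As a quick illustration of seeding (this aside is not part of the bandit code, and the seed value 42 is arbitrary), re-seeding the generator with the same value reproduces exactly the same "random" draws:

import numpy as np

np.random.seed(42)
print(np.random.rand())  # Some number in [0, 1)
np.random.seed(42)
print(np.random.rand())  # Exactly the same number again, because we re-used the seed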
import numpy as np
import matplotlib.pyplot as plt
import IPython.display as ipd
class Bandit:
    def __init__(self, seed):
        """
        The constructor for bandit objects

        Parameters
        ----------
        seed: int
            A seed for the pseudorandom number generator.  Different seeds
            will give different values of the max this bandit can give at
            any pull (self.mx)
        """
        self.seed = seed
        np.random.seed(seed)
        self.mx = np.random.rand()*10

    def pull(self):
        """
        Returns
        -------
        One random pull of this bandit, which returns a number in [0, self.mx]
        """
        return np.random.rand()*self.mx

    def __repr__(self):
        return "Bandit seed {}, Max Payout {:.3f}".format(self.seed, self.mx)
# Setup a list of 10 bandits and print them out
bandits = [Bandit(i*2) for i in range(10)]
for b in bandits:
    print(b)
results = {}
Bandit seed 0, Max Payout 5.488
Bandit seed 2, Max Payout 4.360
Bandit seed 4, Max Payout 9.670
Bandit seed 6, Max Payout 8.929
Bandit seed 8, Max Payout 8.734
Bandit seed 10, Max Payout 7.713
Bandit seed 12, Max Payout 1.542
Bandit seed 14, Max Payout 5.139
Bandit seed 16, Max Payout 2.233
Bandit seed 18, Max Payout 6.504
In the above list of 10 bandits, we see the one at index 2 has the maximum payout. So ideally, our policy would be to just pull on that one over and over. But we don't necessarily know that information going in, so we're going to have to learn it online as we're pulling on the bandits.
Below I'll discuss several strategies for automating good policy discovery. In each case, the algorithm will trade off exploration vs exploitation. The more the algorithm tries out the different bandit arms, the more it learns about each one, but the less time it has to exploit (pull on) the one that seems to be doing the best. By contrast, we can explore very little (e.g. some people in class only pulled on each bandit once at the beginning and then went with the one that paid the most that one time). But we might then not have correct information about which bandit is the best.
Our first strategy is known as an $\epsilon$-greedy strategy. The idea is that an $\epsilon$ proportion of the time, we will explore by randomly choosing a bandit to pull on, and the rest of the time we will pull on the one that has the greatest average payout so far. The average payout gives our value estimate of the machine; that is, how much it rewards us over time, even if it has some better than average or worse than average pulls at any particular instant.
One strategy for figuring out when to explore is akin to throwing a dart uniformly at random into the unit interval $[0, 1]$ at each pull. If the dart lands in the interval $[0, \epsilon]$, then we choose a bandit to pull at random. Otherwise, we pull on the one that has the highest average payout so far. The picture below depicts this: the red dots mark pulls where we choose to explore at random, and the blue dots mark pulls where we exploit the bandit that looks best so far.
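In code, that choice rule only takes a few lines. Here's a minimal sketch (the helper name choose_bandit and its arguments are just for illustration; the full, general-purpose version appears below), assuming values holds the average payout of each bandit so far:

def choose_bandit(values, eps):
    dart = np.random.rand()  # Throw a "dart" uniformly at random into [0, 1]
    if dart <= eps:
        # Explore: pick any bandit uniformly at random
        return np.random.randint(len(values))
    # Exploit: pick the bandit with the highest average payout so far
    return np.argmax(values)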
Let's write some code to execute this strategy on some list of bandits for some number of pulls, and we'll test it out for $\epsilon = 0.1$, or exploring 10% of the time and exploiting 90% of the time. Actually, I'm going to write the code in a very general purpose way so we can use it for other strategies, as explained in the comments.

For a particular bandit that's been pulled $n$ times with payouts $x_1, x_2, ..., x_n$, the average payout $\mu_n$ is

$\mu_n = \frac{x_1 + x_2 + \ldots + x_n}{n}$

We could update this in constant time by keeping track of two variables: one that accumulates the sum of all pull rewards (the numerator), and one that counts how many pulls have been made (the denominator). This is the way I did it in class to save time. But there's a more elegant and illuminating way to compute this (which also avoids the divide-by-zero issue we saw when no pulls had happened yet). Let's re-arrange the average equation to understand the impact of a single new observation and how it "reinforces" our previous value estimate. Note that we can write the following recurrence:

$\mu_n = \frac{(n-1)\mu_{n-1} + x_n}{n}$

If we distribute through the $(n-1)$ in the numerator, we can simplify this to

$\mu_n = \mu_{n-1} + \frac{1}{n}(x_n - \mu_{n-1})$

An even more general way to write this is:

$\mu_n = \mu_{n-1} + \alpha(x_n - \mu_{n-1})$

where $\alpha$ is referred to as the "learning rate" in machine learning parlance, or the "step size" in optimization parlance. We will see this paradigm often in this class: we update some estimate we have given new data, and we have to decide how much importance to place on that data. If $\alpha$ is higher, then we pull our estimate towards the new data more rapidly. In our case, since we insisted on computing the mean over time, $\alpha$ decays as we accumulate more pulls; the more data we have, the less an additional observation can affect our value estimate. Eventually, in the limit, a single new observation barely moves our estimate at all, and the estimate converges to the bandit's true mean value; this is the law of large numbers.
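As a quick sanity check of this recurrence (a side computation, not part of the bandit code; the sequence x here is made up), the snippet below applies the update with $\alpha = 1/n$ to an arbitrary sequence of payouts and confirms it reproduces the ordinary mean:

np.random.seed(0)
x = np.random.rand(100)  # An arbitrary sequence of payouts
mu = 0.0                 # Running estimate, starting with no pulls
for n, xn in enumerate(x, start=1):
    mu = mu + (1/n)*(xn - mu)  # Incremental update with alpha = 1/n
print(mu, np.mean(x))    # The two agree up to floating point error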
In the code below, I will leave $\alpha$ open as a function, which defaults to $\frac{1}{n}$, but we'll see a reason later why we might want to change this.
def learn_bandit(bandits, total_pulls, eps_fn, alpha_fn=lambda n: 1/n, bias=0):
    """
    A general purpose method for learning policies for the multi-armed bandit problem

    Parameters
    ----------
    bandits: list of Bandit
        The bandits that we can pull on
    total_pulls: int
        Number of pulls to execute
    eps_fn: float OR method pull -> explore probability
        The probability that we will explore (rather than exploit) at a particular
        pull, where pull is the number of the pull we're on
        * If a float is passed along, this is a constant, and we refer to this
          as an "epsilon-greedy" strategy
        * If a method is passed along, then this can vary as additional pulls happen
    alpha_fn: method pull_num -> float
        A method that gives us the "step size" or "learning rate" when integrating a new
        pull's reward into our value estimate.  If we're taking an average of all pulls on
        that machine over time, this will simply be 1/pull_num
    bias: float
        The amount of bias to add to the initial estimate of each average value

    Returns
    -------
    total_reward: float
        The total reward we got after all pulls
    choices: ndarray(total_pulls, dtype=int)
        The index of the bandit chosen at each pull over time
    """
    from types import FunctionType
    if not isinstance(eps_fn, FunctionType):
        # If the user passed along a constant for the eps_fn, wrap
        # a method around it that always returns that constant
        eps = 1.0*eps_fn
        eps_fn = lambda x: eps
    if not isinstance(alpha_fn, FunctionType):
        # If the user passed along a constant for the alpha_fn, wrap
        # a method around it that always returns that constant
        alpha = 1.0*alpha_fn
        alpha_fn = lambda x: alpha
    n_bandits = len(bandits)
    total_reward = 0
    # Create an array of all 0's which stores how many pulls have been taken at each bandit
    pull_nums = np.zeros(n_bandits)
    # Create an array of all 0's which stores the average payout of each bandit so far,
    # which we consider our "value estimate" of that bandit
    values = np.zeros(n_bandits)
    # Create an array of all 0's which stores the index of the bandit chosen at each pull
    choices = np.zeros(total_pulls, dtype=int)
    # If there's a bias, pretend every bandit had one pull with this amount when
    # we start out
    if bias > 0:
        pull_nums[:] = 1
        values[:] = bias
    for pull in range(total_pulls):
        ## Step 1: Choose which bandit to pull
        dart = np.random.rand() # Choose number in [0, 1]
        if dart <= eps_fn(pull):
            # If this number is in [0, eps_fn(pull)], choose a random bandit
            idx = np.random.randint(n_bandits)
        else:
            # Otherwise, choose the bandit which has the maximum avg payout so far
            idx = np.argmax(values)
        ## Step 2: Pull the bandit that we intend to pull, and update our
        ## value estimate for that bandit
        this_pull = bandits[idx].pull()
        total_reward += this_pull
        pull_nums[idx] += 1
        # Our update equation: value = value + alpha*(this_pull - value)
        values[idx] = values[idx] + alpha_fn(pull_nums[idx])*(this_pull - values[idx])
        choices[pull] = idx
    return total_reward, choices
# We'll start by trying an "epsilon-greedy" strategy with a fixed epsilon of 0.1
plt.figure(figsize=(12, 4))
np.random.seed(0)
total_reward, choices = learn_bandit(bandits, total_pulls=1000, eps_fn=0.1)
print(total_reward)
plt.subplot(121)
plt.plot(choices)
plt.xlabel("Pull Number")
plt.ylabel("Bandit index")
plt.subplot(122)
plt.hist(choices)
plt.xticks(np.arange(len(bandits)))
plt.xlabel("Bandit Index")
plt.ylabel("Total Pulls on Bandit")
plt.title("Bandit Choices")
4547.278733183436
We see here that we got a pretty decent payout, and the strategy seems to have picked the bandit with the highest payout most of the time, but if you run this cell multiple times with different seeds, you'll see different results. What we should really do is set up an experiment where we repeat this many times for different numbers of pulls and different $\epsilon$ values, and then take the average for each $\epsilon$ at each number of pulls to get an idea of the overall behavior. Let's explore this below. We'll set up a dictionary to keep track of the results, where the key is the plot legend (which $\epsilon$ was used), and the value is an array of the average reward for different numbers of pulls. This will make it convenient to plot all of the results as we vary $\epsilon$.
# Try these different numbers of pulls. We'll space the numbers out more as we get larger
# to save time
total_pulls = np.array(np.round(20000*np.linspace(0.01, 1, 500)**2), dtype=int)
# How many experiments to run for each epsilon for each number of pulls.
# We'll take the average over all of these experiments
n_experiments = 200
for eps in [0, 0.01, 0.1, 0.4]:
    # Start out with an array of zeros to keep track of the average payout at each number of pulls
    avg_reward = np.zeros(len(total_pulls))
    # We'll also save the bandits that are chosen at each step
    all_choices = []
    for i, pulls in enumerate(total_pulls):
        ipd.clear_output()
        print(eps, total_pulls[i])
        # For each number of pulls, run some number of experiments
        reward = 0
        choices = []
        for j in range(n_experiments):
            reward_j, choices_j = learn_bandit(bandits, total_pulls=pulls, eps_fn=eps)
            reward += reward_j
            choices = np.concatenate((choices, choices_j))
        avg_reward[i] = reward/n_experiments
        all_choices.append(choices)
    results["Greedy $\\epsilon = {}$".format(eps)] = {"reward":avg_reward, "choices":all_choices}
Now let's plot the results! For the average reward, I'll plot the curves twice: once zoomed in on a smaller number of pulls (up to 1000), and once all the way up to 20,000 pulls.
def plot_avg_rewards(total_pulls, results, mn, mx):
    """
    Make a plot of the average rewards vs number of pulls in
    some range

    mn: min number of pulls
    mx: max number of pulls
    """
    for key in results.keys():
        idx = (total_pulls <= mx)*(total_pulls >= mn)
        plt.plot(total_pulls[idx], results[key]["reward"][idx])
    plt.legend(list(results.keys()))
    plt.xlabel("Number of pulls")
    plt.ylabel("Average reward over {} trials".format(n_experiments))
plt.figure(figsize=(12, 4))
plt.subplot(121)
plot_avg_rewards(total_pulls, results, 0, 1000)
plt.subplot(122)
plot_avg_rewards(total_pulls, results, 0, np.max(total_pulls))
Let's also look at the choices that were made over time
def plot_choices(total_pulls, results, key):
    """
    Make a plot of the proportion of times each bandit was chosen
    """
    prop = [[np.sum(results[key]["choices"][p] == k)
                for p in range(len(total_pulls))]
                    for k in range(len(bandits))]
    prop = np.array(prop)
    prop = prop/np.sum(prop, axis=0, keepdims=True)
    for k in range(len(bandits)):
        plt.plot(prop[k, :])
    plt.legend(["{}".format(i) for i in range(len(bandits))])
    plt.title(key)
    plt.xlabel("Number of pulls")
    plt.ylabel("Proportion chosen")
    plt.ylim([0, 1])
plt.figure(figsize=(12, 6))
plt.subplot(221)
plot_choices(total_pulls, results, "Greedy $\\epsilon = 0.01$")
plt.subplot(222)
plot_choices(total_pulls, results, "Greedy $\\epsilon = 0.1$")
plt.subplot(223)
plot_choices(total_pulls, results, "Greedy $\\epsilon = 0.4$")
plt.tight_layout()
We notice a few interesting things here in this particular example:
So we want to explore just enough, but not too much. What is the optimal $\epsilon$? It depends on the number of bandits and the number of pulls, but to get an idea for this specific example, we'll loop through a range of $\epsilon$ values and run a bunch of experiments for 1000 pulls, 5000 pulls, and 10000 pulls:
fixed_pulls = [1000, 5000, 10000]
epsilons = np.linspace(0, 0.5, 51)
n_experiments_eps = 400
avg_rewards_eps = {}
for pulls in fixed_pulls:
    avg_rewards = np.zeros_like(epsilons)
    for i, eps in enumerate(epsilons):
        ipd.clear_output()
        print(pulls, eps)
        reward = 0
        for j in range(n_experiments_eps):
            reward_j, _ = learn_bandit(bandits, total_pulls=pulls, eps_fn=eps)
            reward += reward_j
        avg_rewards[i] = reward/n_experiments_eps
    avg_rewards_eps[pulls] = avg_rewards
plt.figure(figsize=(12, 4))
for p in fixed_pulls:
    y = avg_rewards_eps[p]/np.max(avg_rewards_eps[p])
    plt.plot(epsilons, y)
plt.legend(["{} Pulls".format(p) for p in fixed_pulls])
plt.xlabel("$\\epsilon$")
plt.ylabel("Ratio of Average Reward To Max Reward")
plt.title("Normalized Rewards for $\\epsilon$-Greedy Strategies")
plt.xticks(np.linspace(0, 0.5, 11))
# Print the best epsilon for the last pull count we tried (10000 pulls)
print(epsilons[np.argmax(avg_rewards)])
0.04
A few theoretical notes:
Rather than having a fixed $\epsilon$, another strategy is to do a lot of exploration in the beginning but slowly back off and lock in our estimates as we move forward. We'll use the following equation for $\epsilon$ at the $t^{\text{th}}$ pull:

$\epsilon(t) = \frac{1}{1 + \beta t}$

where $\beta$ is a fixed constant, and a larger $\beta$ causes $\epsilon$ to decay faster. We'll do two experiments below with $\beta = 0.1$ and $\beta = 0.01$. We find that $\beta = 0.1$ is the best we've seen so far.
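Before running the full experiment, here's a quick feel for how fast this decays (an illustrative side computation only, not part of the experiment code):

# Illustrative only: epsilon at a few pull numbers for each beta
for b in [0.1, 0.01]:
    print(b, [round(1/(1 + b*t), 3) for t in [0, 10, 100, 1000]])
# With beta = 0.1, epsilon falls to 0.5 by pull 10 and to about 0.01 by pull 1000;
# with beta = 0.01 the decay is roughly 10x slower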
for beta in [0.1, 0.01]:
    #### New Code
    eps_fn = lambda t: 1/(1 + beta*t) ## Our decaying epsilon method
    ####
    # Start out with an array of zeros to keep track of the average payout at each number of pulls
    avg_reward = np.zeros(len(total_pulls))
    # We'll also save the bandits that are chosen at each step
    all_choices = []
    for i, pulls in enumerate(total_pulls):
        ipd.clear_output()
        print(beta, total_pulls[i])
        # For each number of pulls, run some number of experiments
        reward = 0
        choices = []
        for j in range(n_experiments):
            ### New Code
            reward_j, choices_j = learn_bandit(bandits, total_pulls=pulls, eps_fn=eps_fn)
            ###
            reward += reward_j
            choices = np.concatenate((choices, choices_j))
        avg_reward[i] = reward/n_experiments
        all_choices.append(choices)
    results["Decay $\\beta = {}$".format(beta)] = {"reward":avg_reward, "choices":all_choices}
plt.figure(figsize=(12, 4))
plt.subplot(121)
plot_avg_rewards(total_pulls, results, 0, 1000)
plt.subplot(122)
plot_avg_rewards(total_pulls, results, 0, np.max(total_pulls))
We can actually encourage some exploration even if we keep $\epsilon$ at 0 the whole time! The way we do this is by over-estimating the value of each machine from the beginning. We'll start off assuming that each machine has the same high value which is way above the maximum value any machine could have. Then, even if we pick the best machine, we will be a bit disappointed, and we'll explore others for a bit, until we get even more disappointed by them, eventually coming back to the best machine.
In code, I call the initial estimate the "bias," because it biases the estimates towards optimism, but this bias eventually goes to zero in the limit as the number of pulls goes to infinity, and, as with any nonzero $\epsilon$, we will converge to the true estimates of the value.
We'll do an experiment below with a bias of 20, which performs the best we've seen so far! So an even simpler strategy that always "exploits" can perform quite well, as long as we start off very optimistic. Humans are often quite like this in practice as well.
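As a toy illustration of that "disappointment" dynamic before the real experiment (the machine and numbers here are made up for the example), here's how the value estimate of a single machine paying uniformly in $[0, 10]$ falls from an optimistic initial estimate of 20 toward its true mean of 5:

np.random.seed(0)
value, n = 20.0, 1  # Optimistic initial estimate, counted as one "pretend" pull
for pull in range(5):
    reward = np.random.rand()*10  # One pull from a machine paying uniformly in [0, 10]
    n += 1
    value = value + (1/n)*(reward - value)
    print(round(value, 2))  # The estimate steadily drops from 20 toward the true mean of 5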
### New Code
bias = 20
###
# Start out with an array of zeros to keep track of the average payout at each number of pulls
avg_reward = np.zeros(len(total_pulls))
# We'll also save the bandits that are chosen at each step
all_choices = []
for i, pulls in enumerate(total_pulls):
    ipd.clear_output()
    print(total_pulls[i])
    # For each number of pulls, run some number of experiments
    reward = 0
    choices = []
    for j in range(n_experiments):
        ### New Code
        reward_j, choices_j = learn_bandit(bandits, total_pulls=pulls, eps_fn=0, bias=bias)
        ###
        reward += reward_j
        choices = np.concatenate((choices, choices_j))
    avg_reward[i] = reward/n_experiments
    all_choices.append(choices)
results["Optimistic, Bias = ${}$".format(bias)] = {"reward":avg_reward, "choices":all_choices}
plt.figure(figsize=(12, 4))
plt.subplot(121)
plot_avg_rewards(total_pulls, results, 0, 1000)
plt.subplot(122)
plot_avg_rewards(total_pulls, results, 0, np.max(total_pulls))
So far, we've only looked at the case where each bandit behaves the same way each time. But in real life, the bandits can change their average payout over time. All of the strategies we've implemented so far assume a fixed environment, so they're not well suited to handle this. However, I did build something into learn_bandit to help: the flexibility to choose the learning rate $\alpha$ in the update equation

$\mu_n = \mu_{n-1} + \alpha(x_n - \mu_{n-1})$

It doesn't simply have to be $1/n$; in fact, we can leave it fixed at a constant $\alpha \in [0, 1]$. If we do this and expand the recurrence, we get

$\mu_n = (1-\alpha)\mu_{n-1} + \alpha x_n = (1-\alpha)^2 \mu_{n-2} + \alpha(1-\alpha) x_{n-1} + \alpha x_n$

If we continue in this fashion, we see that the weight in front of $x_{n-k}$ is

$\alpha (1-\alpha)^k$

so the further back in time a particular reward $x_{n-k}$ occurred ($k$ pulls ago), the smaller its weight. This estimator of value therefore favors more recent rewards, unlike the ordinary average, which weights all pulls equally.
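As a quick check of this claim (again just a side computation with a made-up reward sequence), the snippet below applies the constant-$\alpha$ update repeatedly and compares it to the explicit exponentially weighted sum, where $x_{n-k}$ gets weight $\alpha(1-\alpha)^k$ and the initial estimate keeps the leftover weight:

np.random.seed(0)
alpha = 0.1
x = np.random.rand(50)  # An arbitrary sequence of rewards
mu = 0.0                # Initial value estimate
for xn in x:
    mu = mu + alpha*(xn - mu)  # Constant-alpha update, applied repeatedly
# Explicit weights: the most recent reward gets alpha, the one before it alpha*(1-alpha), etc.
# (the initial estimate of 0 keeps a weight of (1-alpha)**len(x), which contributes nothing here)
weights = alpha*(1 - alpha)**np.arange(len(x))
print(mu, np.sum(weights*x[::-1]))  # The two agree up to floating point error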
class DynamicBandit:
    def __init__(self, seed, drift_std=0.25):
        """
        The constructor for bandit objects that drift over time

        Parameters
        ----------
        seed: int
            A seed for the pseudorandom number generator.  Different seeds
            will give different values of the mean reward over time
        drift_std: float
            The standard deviation of the change in the mean reward of the
            bandit over time
        """
        self.seed = seed
        np.random.seed(seed)
        self.mu = 10*np.random.rand()
        self.sigma = 2*np.random.rand()
        # Compute small drifts that this bandit will go through in its reward over time
        self.drifts = drift_std*np.random.randn(10000)
        self.i = 0

    def pull(self):
        """
        Returns
        -------
        One random pull of this bandit, which returns a number with mean mu
        and standard deviation sigma
        """
        ret = np.random.randn()*self.sigma + self.mu
        return ret

    def drift(self):
        # Apply one step of drift to the mean reward
        self.mu += self.drifts[self.i%len(self.drifts)]
        self.i += 1

    def __repr__(self):
        return "Bandit seed {}, Current Mu {:.3f}, Std {:.3f}".format(self.seed, self.mu, self.sigma)
# Set up a list of 10 dynamic bandits and print them out
dynamic_bandits = [DynamicBandit(i) for i in range(10)]
print("Initial state of dynamic bandits")
for b in dynamic_bandits:
    print(b)
for i in range(100):
    for b in dynamic_bandits:
        b.drift()
print("\n\nAfter drifting for 100 timesteps")
for b in dynamic_bandits:
    print(b)
Initial state of dynamic bandits
Bandit seed 0, Current Mu 5.488, Std 1.430
Bandit seed 1, Current Mu 4.170, Std 1.441
Bandit seed 2, Current Mu 4.360, Std 0.052
Bandit seed 3, Current Mu 5.508, Std 1.416
Bandit seed 4, Current Mu 9.670, Std 1.094
Bandit seed 5, Current Mu 2.220, Std 1.741
Bandit seed 6, Current Mu 8.929, Std 0.664
Bandit seed 7, Current Mu 0.763, Std 1.560
Bandit seed 8, Current Mu 8.734, Std 1.937
Bandit seed 9, Current Mu 0.104, Std 1.004

After drifting for 100 timesteps
Bandit seed 0, Current Mu 6.576, Std 1.430
Bandit seed 1, Current Mu 5.626, Std 1.441
Bandit seed 2, Current Mu 2.272, Std 0.052
Bandit seed 3, Current Mu 2.131, Std 1.416
Bandit seed 4, Current Mu 10.762, Std 1.094
Bandit seed 5, Current Mu 4.722, Std 1.741
Bandit seed 6, Current Mu 11.360, Std 0.664
Bandit seed 7, Current Mu 1.065, Std 1.560
Bandit seed 8, Current Mu 10.275, Std 1.937
Bandit seed 9, Current Mu 0.055, Std 1.004
## Step 1: Do an experiment with an ordinary epsilon-greedy technique
# Start out with an array of zeros to keep track of the average payout at each number of pulls
avg_reward = np.zeros(len(total_pulls))
# We'll also save the bandits that are chosen at each step
all_choices = []
eps = 0.1
for i, pulls in enumerate(total_pulls):
    ipd.clear_output()
    print(eps, total_pulls[i])
    # For each number of pulls, run some number of experiments
    reward = 0
    choices = []
    for j in range(n_experiments):
        reward_j, choices_j = learn_bandit(dynamic_bandits, total_pulls=pulls, eps_fn=eps)
        ## New code: drift each bandit over time
        for b in dynamic_bandits:
            b.drift()
        ##
        reward += reward_j
        choices = np.concatenate((choices, choices_j))
    avg_reward[i] = reward/n_experiments
    all_choices.append(choices)
results_drift = {}
results_drift["Greedy $\\epsilon = {}$".format(eps)] = {"reward":avg_reward, "choices":all_choices}
## New code: Define alpha, which controls how much each new pull impacts our value estimate
for alpha in [0.01, 0.1, 0.5]:
    # Start out with an array of zeros to keep track of the average payout at each number of pulls
    avg_reward = np.zeros(len(total_pulls))
    # We'll also save the bandits that are chosen at each step
    all_choices = []
    for i, pulls in enumerate(total_pulls):
        ipd.clear_output()
        print(eps, total_pulls[i])
        # For each number of pulls, run some number of experiments
        reward = 0
        choices = []
        for j in range(n_experiments):
            reward_j, choices_j = learn_bandit(dynamic_bandits, total_pulls=pulls, eps_fn=eps, alpha_fn=alpha)
            ## New code: drift each bandit over time
            for b in dynamic_bandits:
                b.drift()
            ##
            reward += reward_j
            choices = np.concatenate((choices, choices_j))
        avg_reward[i] = reward/n_experiments
        all_choices.append(choices)
    results_drift["Greedy $\\epsilon = {}$, $\\alpha={}$".format(eps, alpha)] = {"reward":avg_reward, "choices":all_choices}
plt.figure(figsize=(12, 4))
plt.subplot(121)
plot_avg_rewards(total_pulls, results_drift, 0, 1000)
plt.subplot(122)
plot_avg_rewards(total_pulls, results_drift, 0, np.max(total_pulls))
As we see, the original running-average strategy doesn't hold a candle to the new adaptive strategies. In this particular example, the most aggressive step size we tried, $\alpha = 0.5$, does the best.
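To see why a constant step size helps in a drifting environment, here's a small standalone sketch (separate from the experiments above, with made-up drift and noise levels): we track a single drifting mean with both the running average ($\alpha = 1/n$) and a constant $\alpha$, and the constant-$\alpha$ estimate typically stays much closer to the current mean:

np.random.seed(0)
mu_true = 5.0            # The machine's true mean payout, which will drift over time
avg, n = 0.0, 0          # Running average estimate (alpha = 1/n)
ewma, alpha = 0.0, 0.5   # Constant-alpha estimate
for t in range(2000):
    mu_true += 0.25*np.random.randn()  # The mean takes a small random drift step
    x = mu_true + np.random.randn()    # One noisy pull at the current mean
    n += 1
    avg = avg + (1/n)*(x - avg)
    ewma = ewma + alpha*(x - ewma)
print(round(mu_true, 2), round(avg, 2), round(ewma, 2))
# The constant-alpha estimate tracks the drifted mean closely; the running average lags,
# since it still gives ancient pulls the same weight as recent ones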