# Ten-Armed Bandits Environment

In this chapter, we'll use the `MultiArmBanditsEnv` to study two main concepts in reinforcement learning: exploration and exploitation. Let's take a look at the environment first.
```julia
using ReinforcementLearning
```
# MultiArmBanditsEnv
## Traits
| Trait Type | Value |
|:----------------- | ------------------------------------------------:|
| NumAgentStyle | ReinforcementLearningBase.SingleAgent() |
| DynamicStyle | ReinforcementLearningBase.Sequential() |
| InformationStyle | ReinforcementLearningBase.ImperfectInformation() |
| ChanceStyle | ReinforcementLearningBase.Stochastic() |
| RewardStyle | ReinforcementLearningBase.TerminalReward() |
| UtilityStyle | ReinforcementLearningBase.GeneralSum() |
| ActionStyle | ReinforcementLearningBase.MinimalActionSet() |
| StateStyle | ReinforcementLearningBase.Observation{Int64}() |
| DefaultStateStyle | ReinforcementLearningBase.Observation{Int64}() |
## Is Environment Terminated?
No
## State Space
`Base.OneTo(1)`
## Action Space
`Base.OneTo(10)`
## Current State
```
1
```
```julia
env = MultiArmBanditsEnv()
```
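Before training anything we can interact with the environment directly. The following is a minimal sketch (not part of the original notebook) that uses the same calls that appear later: reset the environment, apply an action, and read the reward.

```julia
# Pull a random arm once and inspect the reward (hypothetical quick check).
reset!(env)
a = rand(action_space(env))   # actions are the integers in Base.OneTo(10)
env(a)                        # applying an action runs one step of the bandit
reward(env)                   # reward obtained from that pull
```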
```julia
using Plots
```

```julia
using StatsPlots
```

```julia
# Sample each arm 100 times and draw the reward distribution of every action.
violin(
    [
        [
            begin
                reset!(env)
                env(a)
                reward(env)
            end
            for _ in 1:100
        ]
        for a in action_space(env)
    ],
    leg=false
)
```
The figure above shows the reward distribution of each action (Figure 2.1).
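If the plot itself is not available, the same information can be summarized numerically. Here is a hypothetical sketch (not from the original notebook) that compares the empirical mean reward of each arm against its true value stored in `env.true_values` (the same field is used further below to find the best action):

```julia
using Statistics  # for `mean`

# Empirical mean reward of each arm from 100 pulls, next to the true value.
begin
    empirical = [
        mean([begin reset!(env); env(a); reward(env) end for _ in 1:100])
        for a in action_space(env)
    ]
    hcat(env.true_values, empirical)
end
```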
Now we create a testbed to calculate the average reward and the percentage of optimal actions.
```julia
"""
A customized hook to record whether the action to take is the best action or not.
"""
Base.@kwdef struct CollectBestActions <: AbstractHook
    best_action::Int
    isbest::Vector{Bool} = []
end
```
```julia
# Before each action is taken, record whether it equals the known best action.
function (h::CollectBestActions)(::PreActStage, agent, env, action)
    push!(h.isbest, h.best_action == action)
end
```
Writing a customized hook is easy:

1. Define your `struct` and make it inherit from `AbstractHook` (optional).
2. Write your customized runtime logic by overwriting some of the following functions. By default, they do nothing if your hook inherits from `AbstractHook`:
   - `(h::YourHook)(::PreActStage, agent, env, action)`
   - `(h::YourHook)(::PostActStage, agent, env)`
   - `(h::YourHook)(::PreEpisodeStage, agent, env)`
   - `(h::YourHook)(::PostEpisodeStage, agent, env)`
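As a minimal illustration, here is a hypothetical hook (not part of the original notebook) that stores the reward observed after every step; it follows the same pattern as `CollectBestActions` above:

```julia
# A hypothetical hook that records the reward after each step.
Base.@kwdef struct CollectRewards <: AbstractHook
    rewards::Vector{Float64} = []
end

# `PostActStage` fires right after the environment has received the action.
function (h::CollectRewards)(::PostActStage, agent, env)
    push!(h.rewards, reward(env))
end
```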
```julia
using Flux
```

```julia
using Statistics
```
```julia
# Run a tabular SARSA agent on the bandit environment for 1000 steps and return,
# for every pull, whether the chosen action was optimal and the reward received.
function bandit_testbed(
    ;explorer=EpsilonGreedyExplorer(0.1),
    true_reward=0.0,
    init=0.,
    opt=InvDecay(1.0)
)
    env = MultiArmBanditsEnv(;true_reward=true_reward)
    agent = Agent(
        policy=QBasedPolicy(
            learner=TDLearner(
                approximator=TabularQApproximator(
                    n_state=length(state_space(env)),
                    n_action=length(action_space(env)),
                    init=init,
                    opt=opt
                ),
                γ=1.0,
                method=:SARSA,
                n=0,
            ),
            explorer=explorer
        ),
        trajectory=VectorSARTTrajectory()
    )
    h1 = CollectBestActions(;best_action=findmax(env.true_values)[2])
    h2 = TotalRewardPerEpisode()
    run(agent, env, StopAfterStep(1000), ComposedHook(h1, h2))
    h1.isbest, h2.rewards
end
```
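As a quick sanity check (hypothetical, not in the original notebook), a single run returns the two vectors recorded by the hooks:

```julia
# `isbest[t]` is true when the t-th pull chose the optimal arm;
# `rewards[t]` is the reward obtained at step t.
isbest, rewards = bandit_testbed()
(mean(isbest), mean(rewards))
```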
```julia
# Compare ε-greedy exploration for several values of ε:
# average reward (top) and fraction of optimal actions (bottom).
begin
    p = plot(layout=(2, 1))
    for ϵ in [0.1, 0.01, 0.0]
        stats = [
            bandit_testbed(;explorer=EpsilonGreedyExplorer(ϵ))
            for _ in 1:2000
        ]
        plot!(p, mean(x[2] for x in stats);
            subplot=1, legend=:bottomright, label="epsilon=$ϵ")
        plot!(p, mean(x[1] for x in stats);
            subplot=2, legend=:bottomright, label="epsilon=$ϵ")
    end
    p
end
```
```julia
# Compare the effect of an optimistic initial value (Q₁ = 5, ε = 0)
# against the realistic ε-greedy setting (Q₁ = 0, ε = 0.1).
begin
    p_2_3 = plot(legend=:bottomright)
    v1 = mean(
        bandit_testbed(
            ;explorer=EpsilonGreedyExplorer(0.),
            init=5.,
            opt=Descent(0.1)
        )[1]
        for _ in 1:2000
    )
    plot!(p_2_3, v1, label="Q_1=5, epsilon=0.")
    v2 = mean(
        bandit_testbed(
            ;explorer=EpsilonGreedyExplorer(0.1),
            init=0.,
            opt=Descent(0.1)
        )[1]
        for _ in 1:2000
    )
    plot!(p_2_3, v2, label="Q_1=0, epsilon=0.1")
    p_2_3
end
```
```julia
# Compare the UCB explorer with ε-greedy (average reward per step).
begin
    p_2_4 = plot(legend=:bottomright)
    plot!(p_2_4, mean(bandit_testbed(;explorer=UCBExplorer(10), opt=Descent(0.1))[2] for _ in 1:5000), label="UpperConfidenceBound, c=2")
    plot!(p_2_4, mean(bandit_testbed(;explorer=EpsilonGreedyExplorer(0.1), opt=Descent(0.1))[2] for _ in 1:5000), label="epsilon-greedy, epsilon=0.1")
    p_2_4
end
```
Similar to the `bandit_testbed` function, we'll create a new function to test the performance of the `GradientBanditLearner`.
```julia
# Same testbed as above, but with a GradientBanditLearner instead of a TDLearner.
function gb_bandit_testbed(
    ;baseline=0.,
    explorer=WeightedExplorer(is_normalized=true),
    true_reward=0.0,
    init=0.,
    opt=InvDecay(1.0)
)
    env = MultiArmBanditsEnv(;true_reward=true_reward)
    agent = Agent(
        policy=QBasedPolicy(
            learner=GradientBanditLearner(
                approximator=TabularQApproximator(
                    n_state=length(state_space(env)),
                    n_action=length(action_space(env)),
                    init=init,
                    opt=opt
                ),
                baseline=baseline
            ),
            explorer=explorer
        ),
        trajectory=VectorSARTTrajectory()
    )
    h1 = CollectBestActions(;best_action=findmax(env.true_values)[2])
    h2 = TotalRewardPerEpisode()
    run(agent, env, StopAfterStep(1000), ComposedHook(h1, h2))
    h1.isbest, h2.rewards
end
```
Note that there's a keyword argument named `baseline` in the `GradientBanditLearner`. It can be either a number or a callable function (`reward -> value`). One such function mentioned in the book computes the average of the rewards seen so far.
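For example (a hypothetical illustration, not part of the original notebook), both of the following calls are valid, since the testbed simply forwards `baseline` to the learner:

```julia
# A constant baseline:
gb_bandit_testbed(baseline=1.0)

# Any callable mapping the latest reward to a baseline value:
gb_bandit_testbed(baseline=r -> 0.5 * r)
```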
```julia
# A mutable helper that keeps a running average of the values it has seen.
Base.@kwdef mutable struct SampleAvg
    t::Int = 0
    avg::Float64 = 0.0
end
```
```julia
# Each call incorporates a new value into the running average and returns it.
function (s::SampleAvg)(x)
    s.t += 1
    s.avg += (x - s.avg) / s.t
    s.avg
end
```
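This is the standard incremental form of the sample average, $\bar{x}_t = \bar{x}_{t-1} + \frac{1}{t}(x_t - \bar{x}_{t-1})$, so the running mean is maintained in constant time without storing past rewards.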
```julia
begin
    baseline = SampleAvg()
    # Running averages of 1:10: 1.0, 1.5, 2.0, …, 5.5.
    [baseline(x) for x in 1:10]
end
```
```julia
# Gradient bandit with α = 0.1 / 0.4, with and without a sample-average baseline,
# on a testbed shifted to a true reward of 4.0.
begin
    true_reward = 4.0
    p_2_5 = plot(legend=:bottomright)
    plot!(p_2_5, mean(gb_bandit_testbed(;opt=Descent(0.1), baseline=SampleAvg(), true_reward=true_reward)[1] for _ in 1:2000), label="alpha = 0.1, with baseline")
    plot!(p_2_5, mean(gb_bandit_testbed(;opt=Descent(0.4), baseline=SampleAvg(), true_reward=true_reward)[1] for _ in 1:2000), label="alpha = 0.4, with baseline")
    plot!(p_2_5, mean(gb_bandit_testbed(;opt=Descent(0.1), true_reward=true_reward)[1] for _ in 1:2000), label="alpha = 0.1, without baseline")
    plot!(p_2_5, mean(gb_bandit_testbed(;opt=Descent(0.4), true_reward=true_reward)[1] for _ in 1:2000), label="alpha = 0.4, without baseline")
    p_2_5
end
```
```julia
# Parameter study: average reward of each method over a range of its key parameter
# (ε, step size α, UCB's c, or the optimistic initial value), on a log₂ scale.
begin
    p_2_6 = plot(legend=:topleft)
    plot!(p_2_6, -7:-2, [mean(mean(bandit_testbed(;explorer=EpsilonGreedyExplorer(2.0^i))[2] for _ in 1:2000)) for i in -7:-2], label="epsilon greedy")
    plot!(p_2_6, -5:1, [mean(mean(gb_bandit_testbed(;explorer=WeightedExplorer(is_normalized=true), opt=Descent(2.0^i))[2] for _ in 1:2000)) for i in -5:1], label="gradient")
    plot!(p_2_6, -4:2, [mean(mean(bandit_testbed(;explorer=UCBExplorer(10; c=2.0^i))[2] for _ in 1:2000)) for i in -4:2], label="UCB")
    plot!(p_2_6, -2:2, [mean(mean(bandit_testbed(;explorer=EpsilonGreedyExplorer(0.), init=(2.0^i))[2] for _ in 1:2000)) for i in -2:2], label="greedy with initialization")
    p_2_6
end
```