# Chapter 6.2 Random Walk
```julia
begin
    using ReinforcementLearning
    using Flux
    using Statistics
    using Plots
end
```
In this section, we'll use the `RandomWalk1D` environment provided in `ReinforcementLearning.jl`.
First, create the environment:

```julia
env = RandomWalk1D(; rewards=0.0=>1.0)
```

The environment describes itself as follows:

# RandomWalk1D

## Traits

| Trait Type        | Value                                           |
|:----------------- | -----------------------------------------------:|
| NumAgentStyle     | ReinforcementLearningBase.SingleAgent()         |
| DynamicStyle      | ReinforcementLearningBase.Sequential()          |
| InformationStyle  | ReinforcementLearningBase.PerfectInformation()  |
| ChanceStyle       | ReinforcementLearningBase.Deterministic()       |
| RewardStyle       | ReinforcementLearningBase.TerminalReward()      |
| UtilityStyle      | ReinforcementLearningBase.GeneralSum()          |
| ActionStyle       | ReinforcementLearningBase.MinimalActionSet()    |
| StateStyle        | ReinforcementLearningBase.Observation{Int64}()  |
| DefaultStateStyle | ReinforcementLearningBase.Observation{Int64}()  |

## Is Environment Terminated?

No

## State Space

`Base.OneTo(7)`

## Action Space

`Base.OneTo(2)`

## Current State

```
4
```

Next, extract the number of states and actions:

```julia
NS, NA = length(state_space(env)), length(action_space(env))  # (7, 2)
```
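If you'd like to poke at the environment by hand before training anything, a minimal interaction sketch looks like the following. This assumes the `RLBase`-style interface used by the version of `ReinforcementLearning.jl` in this notebook, where an environment is stepped by calling it with an action; action `1` moves left and `2` moves right.

```julia
# Walk through one random episode manually.
reset!(env)
@show state(env)                  # the walk starts from the middle state
while !is_terminated(env)
    env(rand(action_space(env)))  # take a random step
end
@show state(env) reward(env)      # terminal state and terminal reward (0.0 or 1.0)
```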
As explained in the book, the true values of states A through E are 1/6, 2/6, 3/6, 4/6 and 5/6 (≈ 0.1667, 0.3333, 0.5, 0.6667, 0.8333):

```julia
true_values = [i/6 for i in 1:5]
```
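As a quick sanity check, these values can also be obtained by solving the Bellman equations of the uniformly random policy directly: V(s) = ½·V(s−1) + ½·V(s+1) for each non-terminal state, with the left terminal worth 0 and the right terminal worth 1 (the only non-zero reward, matching `rewards=0.0=>1.0` above). Here is a small sketch that sets up and solves that linear system, indexing A..E as 1..5:

```julia
using LinearAlgebra

# Solve V(s) = 0.5*V(s-1) + 0.5*V(s+1) for the five non-terminal states as M * V = b.
M = Matrix{Float64}(I, 5, 5)   # start from the identity
b = zeros(5)
for s in 1:5
    for (s_next, r) in ((s - 1, 0.0), (s + 1, s + 1 == 6 ? 1.0 : 0.0))
        if 1 <= s_next <= 5
            M[s, s_next] -= 0.5   # neighbour is a non-terminal state
        else
            b[s] += 0.5 * r       # neighbour is terminal: only its reward contributes
        end
    end
end
M \ b   # ≈ [1/6, 2/6, 3/6, 4/6, 5/6] == true_values
```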
To estimate the state values, we'll use a `VBasedPolicy` with a random action generator:
```julia
create_TD_agent(α) = Agent(
    policy=VBasedPolicy(
        learner=TDLearner(
            approximator=TabularApproximator(fill(0.5, NS), Descent(α)),
            method=:SRS,
            γ=1.0,
            n=0,
        ),
        mapping = (env, V) -> rand(1:NA)
    ),
    trajectory=VectorSARTTrajectory()
)
```
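For reference, the `TDLearner` configured above (`method=:SRS`, `n=0`) performs the one-step TD(0) state-value update V(s) ← V(s) + α·(r + γ·V(s′) − V(s)). A minimal stand-alone sketch of that rule on a plain value table (a hypothetical helper for illustration, not the package's internal code) could look like this:

```julia
# One-step TD(0) update on a plain value table.
# s / s_next : current and next state indices
# r          : reward observed on the transition
# done       : whether s_next is terminal (its value is treated as 0)
function td0_update!(V::AbstractVector, s, r, s_next, done; α=0.1, γ=1.0)
    target = r + (done ? 0.0 : γ * V[s_next])
    V[s] += α * (target - V[s])
    return V
end
```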
```julia
begin
    p_6_2_left = plot(;legend=:bottomright)
    for i in [1, 10, 100]
        agent = create_TD_agent(0.1)
        run(agent, env, StopAfterEpisode(i))
        plot!(
            p_6_2_left,
            agent.policy.learner.approximator.table[2:end-1],
            label="episode = $i"
        )
    end
    plot!(p_6_2_left, true_values, label="true value")
    p_6_2_left
end
```
To calculate the RMS error, we first need to define a hook that records it:
```julia
Base.@kwdef struct RecordRMS <: AbstractHook
    rms::Vector{Float64} = []
end
```
```julia
(f::RecordRMS)(::PostEpisodeStage, agent, env) = push!(
    f.rms,
    sqrt(mean((agent.policy.learner.approximator.table[2:end-1] - true_values).^2))
)
```
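Used on its own, the hook collects one RMS value per finished episode. A single run could be inspected like this (the `single_*` names are just for illustration):

```julia
# Train one TD agent for 100 episodes and record the RMS error after each episode.
single_agent = create_TD_agent(0.1)
single_hook = RecordRMS()
run(single_agent, env, StopAfterEpisode(100), single_hook)
single_hook.rms   # 100-element Vector{Float64} of per-episode RMS errors
```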
Now let's take a look at the performance of the `TDLearner` under different values of α.
```julia
begin
    p_6_2_right = plot()

    for α in [0.05, 0.1, 0.15]
        rms = []
        for _ in 1:100
            agent = create_TD_agent(α)
            hook = RecordRMS()
            run(agent, env, StopAfterEpisode(100), hook)
            push!(rms, hook.rms)
        end
        plot!(p_6_2_right, mean(rms), label="TD alpha=$α", linestyle=:dashdot)
    end
    p_6_2_right
end
```
Then we can compare the `TDLearner` with the `MonteCarloLearner`.
```julia
create_MC_agent(α) = Agent(
    policy=VBasedPolicy(
        learner=MonteCarloLearner(
            approximator=TabularApproximator(fill(0.5, NS), Descent(α)),
            kind=EVERY_VISIT
        ),
        mapping = (env, V) -> rand(1:NA)
    ),
    trajectory=VectorSARTTrajectory()
)
```
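For comparison, the constant-α every-visit Monte Carlo rule applied at the end of each episode is V(sₜ) ← V(sₜ) + α·(Gₜ − V(sₜ)), where Gₜ is the return following time t. A stand-alone sketch (again a hypothetical helper, not the package's internal code) might be:

```julia
# Constant-α every-visit Monte Carlo update over one finished episode.
# states[t] / rewards[t] are the state visited and the reward received at step t.
function mc_update!(V::AbstractVector, states, rewards; α=0.1, γ=1.0)
    G = 0.0
    for t in length(states):-1:1   # iterate backwards through the episode
        G = γ * G + rewards[t]     # return following time t
        V[states[t]] += α * (G - V[states[t]])
    end
    return V
end
```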
```julia
for α in [0.01, 0.02, 0.03, 0.04]
    rms = []
    for _ in 1:100
        agent = create_MC_agent(α)
        hook = RecordRMS()
        run(agent, env, StopAfterEpisode(100), hook)
        push!(rms, hook.rms)
    end
    plot!(p_6_2_right, mean(rms), label="MC alpha=$α")
end
```
Displaying `p_6_2_right` again now shows the TD and MC curves together:

```julia
p_6_2_right
```
Finally, let's compare TD(0) and MC head to head with α = 0.1, as in Figure 6.2 of the book:

```julia
begin
    fig_6_2 = plot()

    rms = []
    for _ in 1:100
        agent = create_TD_agent(0.1)
        hook = RecordRMS()
        run(agent, env, StopAfterEpisode(100), hook)
        push!(rms, hook.rms)
    end
    plot!(fig_6_2, mean(rms), label="TD alpha=0.1", linestyle=:dashdot)

    rms = []
    for _ in 1:100
        agent = create_MC_agent(0.1)
        hook = RecordRMS()
        run(agent, env, StopAfterEpisode(100), hook)
        push!(rms, hook.rms)
    end
    plot!(fig_6_2, mean(rms), label="MC alpha=0.1")

    fig_6_2
end
```
**Warning**

Some of you might have noticed that the figure above is not the same as Figure 6.2 in the book. That's because we are not doing **batch training** here: the `trajectory` is emptied at the end of each episode. We leave it as an exercise for readers to practice developing new customized algorithms with `ReinforcementLearning.jl`. 😉