# Chapter 9: On-policy Prediction with Approximation
In this notebook, we'll focus on linear approximation methods.
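As a brief recap of the setting (a summary added here, following Sutton & Barto Chapter 9, not one of the original code cells): every method below approximates the state value as a linear function of a feature vector $\mathbf{x}(s)$ produced by a preprocessor, and nudges the weights toward a target $U_t$ with a (semi-)gradient step:

```math
\hat{v}(s, \mathbf{w}) = \mathbf{w}^\top \mathbf{x}(s),
\qquad
\mathbf{w} \leftarrow \mathbf{w} + \alpha \,\bigl[ U_t - \hat{v}(S_t, \mathbf{w}) \bigr]\, \mathbf{x}(S_t)
```

For the Monte Carlo learners $U_t$ is the episode return; for the TD learners it is a bootstrapped ($n$-step) target.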
```julia
begin
    using ReinforcementLearning
    using Flux
    using Statistics
    using Plots
    using SparseArrays
end
```
## Figure 9.1
We've discussed the `RandomWalk1D` environment before. In the previous example the state space was relatively small (`1:7`). Here we expand it to `1:1000` (plus the two terminal states) and see how the `LinearVApproximator` works.
```julia
ACTIONS = collect(Iterators.flatten((-100:-1, 1:100)))
```
```julia
NA = length(ACTIONS)  # 200
```
```julia
NS = 1002  # 1000 non-terminal states plus the 2 terminal states
```
First, let's run a long experiment to estimate the true value of each state:
```julia
TRUE_STATE_VALUES = begin
    env = RandomWalk1D(N=NS, actions=ACTIONS)
    agent = Agent(
        policy=VBasedPolicy(
            learner=TDLearner(
                approximator=TabularVApproximator(;n_state=NS, opt=Descent(0.01)),
                method=:SRS,
            ),
            mapping=(env, V) -> rand(action_space(env))
        ),
        trajectory=VectorSARTTrajectory()
    )
    run(agent, env, StopAfterEpisode(10^5))
    agent.policy.learner.approximator.table
end
```
```julia
plot(TRUE_STATE_VALUES[2:end-1])
```
Next, we define a preprocessor to map adjacent states into groups.
```julia
N_GROUPS = 10
```
```julia
Base.@kwdef struct GroupMapping
    n::Int
    n_groups::Int = N_GROUPS
    n_per_group::Int = div(n, N_GROUPS)
end
```
```julia
function (p::GroupMapping)(x::Int)
    if x == 1                # left terminal state
        res = 1
    elseif x == p.n          # right terminal state
        res = p.n_groups + 2
    else                     # aggregate the non-terminal states into groups
        res = div(x - 2, p.n_per_group) + 2
    end
    res
end
```
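A quick sanity check (these example calls are not in the original notebook): with the default settings each of the 10 groups covers 100 adjacent states, and the two terminal states get their own indices.

```julia
m = GroupMapping(n=NS)
m(1), m(2), m(101), m(102), m(1001), m(1002)  # => (1, 2, 2, 3, 11, 12)
```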
```julia
plot([GroupMapping(;n=NS)(i) for i in 1:NS], legend=nothing)
```
To count how often each state is visited, we need to write a hook.
```julia
struct CountStates <: AbstractHook
    counts::Vector{Int}
    CountStates(n) = new(zeros(Int, n))
end
```
```julia
(f::CountStates)(::PreActStage, agent, env, action) = f.counts[state(env.env)] += 1
```
Now let's kick off our experiment:
```julia
agent_1 = Agent(
    policy=VBasedPolicy(
        learner=MonteCarloLearner(
            approximator=TabularVApproximator(n_state=N_GROUPS+2, opt=Descent(2e-5)),
            kind=EVERY_VISIT, # this is very important! (with state aggregation the same group recurs many times within an episode)
        ),
        mapping=(env, V) -> rand(action_space(env))
    ),
    trajectory=VectorSARTTrajectory()
)
```
```julia
env_1 = StateOverriddenEnv(
    RandomWalk1D(N=NS, actions=ACTIONS),
    GroupMapping(n=NS)
)
```
```julia
hook = CountStates(NS)
```
```julia
run(agent_1, env_1, StopAfterEpisode(10^5), hook)
```
```julia
begin
    fig_9_1 = plot(legend=:topleft)
    fig_9_1_right = twinx(fig_9_1)
    plot!(fig_9_1, hook.counts ./ sum(hook.counts), color=:gray, label="state distribution")
    plot!(fig_9_1_right, agent_1.policy.learner.approximator.(env_1.f(s) for s in 2:NS-1), label="MC Learner", legend=:bottomright)
    plot!(fig_9_1_right, TRUE_STATE_VALUES[2:end-1], label="true values", legend=:bottomright)
end
```
## Figure 9.2
```julia
agent_2 = Agent(
    policy=VBasedPolicy(
        learner=TDLearner(
            approximator=TabularVApproximator(n_state=N_GROUPS+2, opt=Descent(2e-4)),
            method=:SRS,
        ),
        mapping=(env, V) -> rand(action_space(env))
    ),
    trajectory=VectorSARTTrajectory()
)
```
```julia
run(agent_2, env_1, StopAfterEpisode(10^5))
```
```julia
begin
    fig_9_2_left = plot(legend=:bottomright)
    plot!(fig_9_2_left, agent_2.policy.learner.approximator.(env_1.f(s) for s in 2:NS-1), label="TD Learner", legend=:bottomright)
    plot!(fig_9_2_left, TRUE_STATE_VALUES[2:end-1], label="true values", legend=:bottomright)
    fig_9_2_left
end
```
## Figure 9.2 (right)
```julia
struct RecordRMS <: AbstractHook
    rms::Vector{Float64}
    RecordRMS() = new([])
end
```
```julia
function (f::RecordRMS)(::PostEpisodeStage, agent, env)
    push!(
        f.rms,
        sqrt(mean((agent.policy.learner.approximator.(env.f.(2:(NS-1))) - TRUE_STATE_VALUES[2:end-1]).^2))
    )
end
```
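Concretely, after every episode this hook records the (unweighted) root-mean-squared error between the approximated and the "true" values over the non-terminal states:

```math
\mathrm{RMS} = \sqrt{\frac{1}{|\mathcal{S}|} \sum_{s \in \mathcal{S}} \bigl( \hat{v}(s, \mathbf{w}) - v_\pi(s) \bigr)^2 }
```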
```julia
n_groups = 20
```
```julia
function run_once(n, α)
    env = StateOverriddenEnv(
        RandomWalk1D(N=NS, actions=ACTIONS),
        # use `n_groups` (20) groups here, overriding the default `N_GROUPS`
        GroupMapping(n=NS, n_groups=n_groups, n_per_group=div(NS - 2, n_groups))
    )
    agent = Agent(
        policy=VBasedPolicy(
            learner=TDLearner(
                approximator=TabularVApproximator(;
                    n_state=n_groups+2,
                    opt=Descent(α)
                ),
                method=:SRS,
                n=n
            ),
            mapping=(env, V) -> rand(action_space(env))
        ),
        trajectory=VectorSARTTrajectory()
    )

    hook = RecordRMS()
    run(agent, env, StopAfterEpisode(10), hook)
    mean(hook.rms)
end
```
```julia
begin
    A = [0., 0.03, 0.06, 0.1:0.1:1...]
    fig_9_2_right = plot(legend=:bottomright, ylim=[0.25, 0.55])
    for n in [2^i for i in 0:9]
        plot!(
            fig_9_2_right,
            A,
            mean(
                [run_once(n, α) for α in A]
                for _ in 1:100
            ),
            label="n = $n"
        )
    end
    fig_9_2_right
end
```
## Figure 9.5
```julia
struct FourierPreprocessor
    order::Int
end
```
```julia
(fp::FourierPreprocessor)(s::Number) = [cos(i * π * s) for i = 0:fp.order]
```
```julia
struct PolynomialPreprocessor
    order::Int
end
```
```julia
(pp::PolynomialPreprocessor)(s::Number) = [s^i for i = 0:pp.order]
```
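Both preprocessors expect a state that has already been scaled into $[0, 1]$ (below we pass `x/NS`). Two quick examples, not in the original notebook:

```julia
PolynomialPreprocessor(3)(0.5)  # => [1.0, 0.5, 0.25, 0.125]
FourierPreprocessor(3)(0.5)     # => [cos(0), cos(π/2), cos(π), cos(3π/2)] ≈ [1.0, 0.0, -1.0, 0.0]
```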
```julia
function run_once_MC(preprocessor, order, α)
    env = StateOverriddenEnv(
        RandomWalk1D(N=NS, actions=ACTIONS),
        preprocessor
    )
    agent = Agent(
        policy=VBasedPolicy(
            learner=MonteCarloLearner(
                approximator=RLZoo.LinearVApproximator(;n=order+1, opt=Descent(α)),
                kind=EVERY_VISIT,
            ),
            mapping=(env, V) -> rand(1:NA)
        ),
        trajectory=VectorSARTTrajectory(;state=Vector{Float64})
    )

    hook = RecordRMS()
    run(agent, env, StopAfterEpisode(5000; is_show_progress=false), hook)
    hook.rms
end
```
```julia
begin
    fig_9_5 = plot(legend=:topright)

    for order in [5, 10, 20]
        plot!(
            fig_9_5,
            mean(
                run_once_MC(
                    x -> FourierPreprocessor(order)(x/NS),
                    order,
                    0.00005
                )
                for _ in 1:5
            ),
            label="Fourier $order",
            linestyle=:dash
        )

        plot!(
            fig_9_5,
            mean(
                run_once_MC(
                    x -> PolynomialPreprocessor(order)(x/NS),
                    order,
                    0.0001
                )
                for _ in 1:5
            ),
            label="Polynomial $order",
            linestyle=:solid
        )
    end

    fig_9_5
end
```
## Figure 9.10
Implementing tile coding in Julia is quite easy! 😀
```julia
begin
    struct Tiling{N,Tr<:AbstractRange}
        ranges::NTuple{N,Tr}
        inds::LinearIndices{N,NTuple{N,Base.OneTo{Int}}}
    end

    Tiling(ranges...) = Tiling(
        ranges,
        LinearIndices(Tuple(length(r) - 1 for r in ranges))
    )
end
```
```julia
Base.length(t::Tiling) = reduce(*, (length(r) - 1 for r in t.ranges))
```
```julia
encode(range::AbstractRange, x) = floor(Int, div(x - range[1], step(range)) + 1)
```
```julia
encode(t::Tiling, xs) = t.inds[CartesianIndex(Tuple(map(encode, t.ranges, xs)))]
```
```julia
t = Tiling(range(1, step=200, length=7))
```
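A quick check of how this single tiling behaves (these calls are not in the original notebook): the 7 boundary points define 6 tiles of width 200, and `encode` returns the index of the tile a state falls into.

```julia
length(t)                                                       # => 6 tiles
encode(t, 1), encode(t, 201), encode(t, 1000), encode(t, 1002)  # => (1, 2, 5, 6)
```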
```julia
tt = [Tiling(range(1 - 4*(i-1), step=200, length=7)) for i in 1:50]
```
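These 50 tilings share the same tile width (200 states) but are offset from each other by 4 states, so every state activates exactly one tile per tiling. A small illustration (not part of the original notebook):

```julia
[encode(t, 500) for t in tt[1:5]]    # => [3, 3, 3, 3, 3]   state 500 is in tile 3 of the first few tilings
[encode(t, 500) for t in tt[26:30]]  # => [3, 4, 4, 4, 4]   the shifted boundaries eventually move past state 500
```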
```julia
function run_once_MC_tiling(preprocessor, α, n)
    env = StateOverriddenEnv(
        RandomWalk1D(N=NS, actions=ACTIONS),
        preprocessor
    )
    agent = Agent(
        policy=VBasedPolicy(
            learner=MonteCarloLearner(
                approximator=RLZoo.LinearVApproximator(;n=n, opt=Descent(α)),
                kind=EVERY_VISIT,
            ),
            mapping=(env, V) -> rand(1:NA)
        ),
        trajectory=VectorSARTTrajectory(;state=Vector{Float64})
    )

    hook = RecordRMS()
    run(agent, env, StopAfterEpisode(10000; is_show_progress=true), hook)
    hook.rms
end
```
```julia
begin
    fig_9_10 = plot()

    plot!(
        fig_9_10,
        run_once_MC_tiling(
            x -> sparse([encode(t, x) for t in tt], 1:50, ones(50), 7, 50) |> vec,
            1e-4/50,
            7*50
        ),
        label="50 tilings"
    )

    plot!(
        fig_9_10,
        run_once_MC_tiling(
            x -> Flux.onehot(encode(t, x), 1:7),
            1e-4,
            7
        ),
        label="one tiling"
    )

    fig_9_10
end
```
Feel free to make a PR if you can improve the speed of generating this figure. ❤