Chapter 10 Mountain Car
```julia
begin
    using ReinforcementLearning
    using Flux
    using Statistics
    using Plots
    using SparseArrays
end
```
The `MountainCarEnv` is already provided in ReinforcementLearning.jl, so we can use it directly here. Note that by default this environment terminates after a maximum of 200 steps, while the corresponding example in the book has no such restriction.
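Below is a quick interaction sketch, only to illustrate the basic environment API; the `max_steps` keyword (also used later in this chapter) is assumed here to lift the 200-step default, and the random-action loop is not part of the book's experiments.

```julia
# Sketch: drive the environment with random actions just to see the API.
demo_env = MountainCarEnv(; max_steps = 10_000)  # assumption: lift the 200-step limit
reset!(demo_env)
while !is_terminated(demo_env)
    demo_env(rand(action_space(demo_env)))  # apply a random action
end
state(demo_env), reward(demo_env)
```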
```julia
env = MountainCarEnv()
```

Inspecting `env` shows the following summary:

# MountainCarEnv

## Traits

| Trait Type | Value |
|:----------------- | ------------------------------------------------:|
| NumAgentStyle | ReinforcementLearningBase.SingleAgent() |
| DynamicStyle | ReinforcementLearningBase.Sequential() |
| InformationStyle | ReinforcementLearningBase.ImperfectInformation() |
| ChanceStyle | ReinforcementLearningBase.Stochastic() |
| RewardStyle | ReinforcementLearningBase.StepReward() |
| UtilityStyle | ReinforcementLearningBase.GeneralSum() |
| ActionStyle | ReinforcementLearningBase.MinimalActionSet() |
| StateStyle | ReinforcementLearningBase.Observation{Any}() |
| DefaultStateStyle | ReinforcementLearningBase.Observation{Any}() |

## Is Environment Terminated?

No

## State Space

`ReinforcementLearningBase.Space{Array{IntervalSets.Interval{:closed,:closed,Float64},1}}(IntervalSets.Interval{:closed,:closed,Float64}[-1.2..0.6, -0.07..0.07])`

## Action Space

`Base.OneTo(3)`

## Current State

```
[-0.5834095103051787, 0.0]
```

```julia
S = state_space(env)
```

The state space is the product of two closed intervals: position in `-1.2..0.6` and velocity in `-0.07..0.07`.
First, let's define a `Tiling` structure to encode the state.
```julia
begin
    # A tiling is a grid over the state space: `ranges` holds the tile boundaries
    # along each dimension, `inds` maps a grid cell to a flat (linear) index.
    struct Tiling{N,Tr<:AbstractRange}
        ranges::NTuple{N,Tr}
        inds::LinearIndices{N,NTuple{N,Base.OneTo{Int}}}
    end

    Tiling(ranges::AbstractRange...) = Tiling(
        ranges,
        LinearIndices(Tuple(length(r) - 1 for r in ranges))
    )

    # total number of tiles
    Base.length(t::Tiling) = reduce(*, (length(r) - 1 for r in t.ranges))

    # shift a tiling by an offset along each dimension
    function Base.:-(t::Tiling, xs)
        Tiling((r .- x for (r, x) in zip(t.ranges, xs))...)
    end

    # index of the tile containing `x` along a single dimension
    encode(range::AbstractRange, x) = floor(Int, div(x - range[1], step(range)) + 1)

    # linear index of the tile containing the point `xs`
    encode(t::Tiling, xs) = t.inds[CartesianIndex(Tuple(map(encode, t.ranges, xs)))]
end
```
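A quick usage sketch (the ranges and the point below are made up for illustration): with each axis split into 4 tiles, `encode` returns the linear index of the tile containing the point.

```julia
# Hypothetical 2-D tiling: 4 × 4 = 16 tiles.
demo_tiling = Tiling(0.0:0.5:2.0, 0.0:0.25:1.0)
encode(demo_tiling, (0.7, 0.3))  # the point falls in tile (2, 2), i.e. linear index 6
```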
```julia
begin
    ntilings = 8
    ntiles = 8
    tiling = Tiling(
        (
            range(r.left, step=(r.right-r.left)/ntiles, length=ntiles+2)
            for r in S
        )...
    )
    offset = map(x -> x.right - x.left, S) ./ (ntiles * ntilings)
    tilings = [tiling - offset .* (i-1) for i in 1:ntilings]
end
```

Each of the 8 tilings covers the 2-D state space with a 9 × 9 grid of tiles (81 linear indices); successive tilings are shifted by `offset`, i.e. one eighth of a tile width, along each dimension.
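As a sanity check (and a preview of the state transformation used below), we can look at which tile of each tiling the current raw state falls into, and assemble the sparse feature vector that the linear approximator will consume:

```julia
# One active tile per tiling for the current raw state (position, velocity).
s = state(env)
active_tiles = [encode(t, s) for t in tilings]

# Stack the 8 one-hot columns (81 tiles each) into a single 648-element sparse vector.
features = sparse(active_tiles, 1:8, ones(8), 81, 8) |> vec
```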
The remaining parts are simple: we initialize the `agent` and the `env`, then roll out the experiments:
```julia
function create_env_agent(α=2e-4, n=0)
    # Wrap the environment so that the raw (position, velocity) state is replaced
    # by the sparse tile-coded feature vector of length 81 * 8.
    env = StateOverriddenEnv(
        MountainCarEnv(;max_steps=10000),
        s -> sparse(map(t -> encode(t, s), tilings), 1:8, ones(8), 81, 8) |> vec
    )

    # n-step semi-gradient SARSA with a linear action-value approximator.
    agent = Agent(
        policy=QBasedPolicy(
            learner=TDLearner(
                approximator=LinearQApproximator(
                    n_state=81*8,
                    n_action=3,
                    opt = Descent(α)
                ),
                method=:SARSA,
                n=n
            ),
            explorer=GreedyExplorer()
        ),
        trajectory=VectorSARTTrajectory(;state=Vector{Int})
    )

    env, agent
end
```
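As a quick check (a sketch with the default step size, not one of the book's figures), we can train for a handful of episodes and look at the number of steps per episode:

```julia
# Run a short experiment with the default α and n, recording steps per episode.
demo_env2, demo_agent = create_env_agent()
demo_hook = StepsPerEpisode()
run(demo_agent, demo_env2, StopAfterEpisode(10; is_show_progress=false), demo_hook)
demo_hook.steps
```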
To visualize the learned value function, we build a grid over the state space:

```julia
# 40 × 40 grid over position (X) and velocity (Y) for visualization.
X = range(S[1].left, stop=S[1].right, length=40)
Y = range(S[2].left, stop=S[2].right, length=40)
```
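The helper below trains a fresh agent for `n` episodes and evaluates the learned action values on this grid; negating the maximum over actions gives the cost-to-go surface plotted afterwards.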
```julia
function show_approximation(n)
    env, agent = create_env_agent()
    run(agent, env, StopAfterEpisode(n))
    [
        agent.policy.learner.approximator(env.f([p, v])) |> maximum
        for p in X, v in Y
    ]
end
```
```julia
n = 10
```

```julia
plot(X, Y, -show_approximation(n), linetype=:wireframe)
```
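The next cell corresponds to the book's Figure 10.2: steps per episode (log scale) of semi-gradient SARSA on Mountain Car, averaged over `n_runs` runs, for step sizes α = 0.1/8, 0.2/8, and 0.5/8.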
```julia
begin
    fig_10_2 = plot(legend=:topright)
    n_runs = 5  # quite slow here, needs revisiting
    for α in [0.1/8, 0.2/8, 0.5/8]
        avg_steps_per_episode = zeros(500)
        for _ in 1:n_runs
            env, agent = create_env_agent(α)
            hook = StepsPerEpisode()
            run(agent, env, StopAfterEpisode(500; is_show_progress=false), hook)
            avg_steps_per_episode .+= hook.steps
        end
        plot!(fig_10_2, avg_steps_per_episode ./ n_runs, yscale=:log10, label="α=$α")
    end
    fig_10_2
end
```
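The following cell corresponds to Figure 10.3, comparing one-step SARSA (n = 1, α = 0.5/8) against 8-step SARSA (n = 8, α = 0.3/8), each averaged over 10 runs of 500 episodes.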
```julia
begin
    function run_once(α, n; is_reduce=true, n_episode=50)
        env, agent = create_env_agent(α, n)
        hook = StepsPerEpisode()
        run(agent, env, StopAfterEpisode(n_episode; is_show_progress=false), hook)
        is_reduce ? mean(hook.steps) : hook.steps
    end
    fig_10_3 = plot()
    plot!(fig_10_3, mean(run_once(0.5/8, 1; is_reduce=false, n_episode=500) for _ in 1:10), yscale=:log10)
    plot!(fig_10_3, mean(run_once(0.3/8, 8; is_reduce=false, n_episode=500) for _ in 1:10), yscale=:log10)
    fig_10_3
end
```
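Finally, the cell below corresponds to Figure 10.4: average steps per episode over the first 50 episodes (5 runs each) as a function of the step size, for n = 1, 2, 4, 8 and 16.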
```julia
begin
    fig_10_4 = plot(legend=:topright)
    for (A, n) in [(0.4:0.1:1.7, 1), (0.3:0.1:1.6, 2), (0.2:0.1:1.4, 4), (0.2:0.1:0.9, 8), (0.2:0.1:0.7, 16)]
        plot!(fig_10_4, A, [mean(run_once(α/8, n) for _ in 1:5) for α in A], label="n = $n")
    end
    fig_10_4
end
```