Example 6.5: Windy Gridworld
First, let's define this environment by implementing the interfaces defined in `RLBase`.
```
begin
    using ReinforcementLearning
    using Flux

    const NX = 7
    const NY = 10
    # Wind strength per column; a negative row offset pushes the agent upward.
    const Wind = [CartesianIndex(w, 0) for w in [0, 0, 0, -1, -1, -1, -2, -2, -1, 0]]
    const StartPosition = CartesianIndex(4, 1)
    const Goal = CartesianIndex(4, 8)
    const ACTION = [
        CartesianIndex(0, -1),  # left
        CartesianIndex(0, 1),   # right
        CartesianIndex(-1, 0),  # up
        CartesianIndex(1, 0),   # down
    ]

    const LinearInds = LinearIndices((NX, NY))

    Base.@kwdef mutable struct WindyGridWorldEnv <: AbstractEnv
        position::CartesianIndex{2} = StartPosition
    end

    RLBase.state_space(env::WindyGridWorldEnv) = Base.OneTo(length(LinearInds))
    RLBase.action_space(env::WindyGridWorldEnv) = Base.OneTo(length(ACTION))

    # Apply the chosen action plus the wind of the current column, then clamp to the grid.
    function (env::WindyGridWorldEnv)(a::Int)
        p = env.position + Wind[env.position[2]] + ACTION[a]
        p = CartesianIndex(min(max(p[1], 1), NX), min(max(p[2], 1), NY))
        env.position = p
        nothing
    end

    RLBase.state(env::WindyGridWorldEnv) = LinearInds[env.position]
    RLBase.is_terminated(env::WindyGridWorldEnv) = env.position == Goal
    RLBase.reward(env::WindyGridWorldEnv) = env.position == Goal ? 0.0 : -1.0

    RLBase.reset!(env::WindyGridWorldEnv) = env.position = StartPosition
end
```
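To sanity-check the environment before training, we can step it by hand. This is only a quick sketch; the action index refers to the `ACTION` vector defined above, and `state`, `reward`, and `is_terminated` are the `RLBase` methods we just implemented.

```
let env = WindyGridWorldEnv()
    RLBase.reset!(env)
    env(2)                       # action 2 = move right; the wind in column 1 is zero
    @show state(env)             # linear index of the new position
    @show reward(env)            # -1.0 on every step until the goal is reached
    @show is_terminated(env)
end
```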
```
world = WindyGridWorldEnv()
```

# WindyGridWorldEnv

## Traits

| Trait Type        | Value                                            |
|:----------------- | ------------------------------------------------:|
| NumAgentStyle     | ReinforcementLearningBase.SingleAgent()          |
| DynamicStyle      | ReinforcementLearningBase.Sequential()           |
| InformationStyle  | ReinforcementLearningBase.ImperfectInformation() |
| ChanceStyle       | ReinforcementLearningBase.Stochastic()           |
| RewardStyle       | ReinforcementLearningBase.StepReward()           |
| UtilityStyle      | ReinforcementLearningBase.GeneralSum()           |
| ActionStyle       | ReinforcementLearningBase.MinimalActionSet()     |
| StateStyle        | ReinforcementLearningBase.Observation{Any}()     |
| DefaultStateStyle | ReinforcementLearningBase.Observation{Any}()     |

## Is Environment Terminated?

No

## State Space

`Base.OneTo(70)`

## Action Space

`Base.OneTo(4)`

## Current State

```
4
```
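The state is simply the linear index of the agent's position, so the current state `4` is the start cell. A quick sketch of the mapping, using the `NX`/`NY` constants defined above:

```
begin
    CartesianIndices((NX, NY))[4]   # CartesianIndex(4, 1) == StartPosition
    LinearInds[StartPosition]       # 4
end
```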
```
agent = Agent(
    policy=QBasedPolicy(
        learner=TDLearner(
            approximator=TabularQApproximator(
                ;n_state=length(state_space(world)),
                n_action=length(action_space(world)),
                opt=Descent(0.5)
            ),
            method=:SARSA
        ),
        explorer=EpsilonGreedyExplorer(0.1)
    ),
    trajectory=VectorSARTTrajectory()
)
```

```
Agent
├─ policy => QBasedPolicy
│  ├─ learner => TDLearner
│  │  ├─ approximator => TabularApproximator
│  │  │  ├─ table => 4×70 Array{Float64,2}
│  │  │  └─ optimizer => Descent
│  │  │     └─ eta => 0.5
│  │  ├─ γ => 1.0
│  │  ├─ method => SARSA
│  │  └─ n => 0
│  └─ explorer => EpsilonGreedyExplorer
│     ├─ ϵ_stable => 0.1
│     ├─ ϵ_init => 1.0
│     ├─ warmup_steps => 0
│     ├─ decay_steps => 0
│     ├─ step => 1
│     ├─ rng => Random._GLOBAL_RNG
│     └─ is_training => true
└─ trajectory => Trajectory
   └─ traces => NamedTuple
      ├─ state => 0-element Array{Int64,1}
      ├─ action => 0-element Array{Int64,1}
      ├─ reward => 0-element Array{Float32,1}
      └─ terminal => 0-element Array{Bool,1}
```
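Before training, the learner's Q-table is all zeros. A quick sketch of reading it directly; the field path follows the printed agent above (rows are actions, columns are states):

```
begin
    q = agent.policy.learner.approximator.table
    size(q), q[:, state(world)]   # (4, 70) and the four action values of the current state
end
```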
```
hook = StepsPerEpisode()
```

```
run(agent, world, StopAfterStep(8000), hook)
```

The recorded steps per episode shrink quickly as the agent learns:

```
531, 383, 353, 235, 116, 60, 67, 109, 57, 26, 99, 65, 79, 52, 56, 24, 99, 52, 164, 39, 22, 17, 17, 16, 16, 18, 16, 20, 19, 26, 7
```
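As a rough check of what was learned, we can roll out one episode greedily from the Q-table. This is only a sketch: the field path follows the printed agent above, and the cap guards against a greedy policy that cycles. In Sutton and Barto's Example 6.5 the optimal episode takes 15 steps, so the greedy count should be close to that.

```
let
    q = agent.policy.learner.approximator.table
    RLBase.reset!(world)
    n = 0
    while !is_terminated(world) && n < 1_000   # safety cap in case the greedy policy loops
        world(argmax(q[:, state(world)]))      # always take the highest-valued action
        n += 1
    end
    n                                          # greedy episode length
end
```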
```
using Plots
```
```
# Repeat each episode's index once per step it took; plotting this vector shows
# the number of completed episodes against elapsed time steps, as in the figure
# accompanying Example 6.5.
plot([i for (i, x) in enumerate(hook.steps) for _ in 1:x])
```
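The same plot with axis labels (standard Plots.jl keyword arguments), which makes the episodes-versus-time-steps reading explicit:

```
plot(
    [i for (i, x) in enumerate(hook.steps) for _ in 1:x];
    xlabel = "Time steps",
    ylabel = "Episodes",
    legend = false,
)
```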