@doc raw"""
TabularMDP(T, R, discount)
Specify a discrete MDP in tabular form.
- The transition matrix is 3 dimensional of size ``|\mathcal{S}|\times |\mathcal{A}| \times |\mathcal{S}|``, then `T[sj, a, si]` corresponds to the probability of ending in `sj` while taking action `a` in `si`.
- The reward matrix is 2 dimensional of size ``|\mathcal{S}|\times |\mathcal{A}|``, where `R[s, a]` is the reward obtained when taking action `a` in state `s`.
"""
mutable struct TabularMDP <: MDP{Int64, Int64}
    T::Array{Float64, 3} # SPxAxS
    R::Matrix{Float64}   # SxA
    discount::Float64
end
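
# Illustrative sketch (not part of the original package): constructing a small
# 2-state, 2-action TabularMDP. Note the transition tensor is indexed
# `T[sp, a, s]`; the helper name below is hypothetical.
function example_tabular_mdp()
    T = zeros(2, 2, 2)
    T[:, 1, 1] = [0.9, 0.1]   # action 1 in state 1: stay with probability 0.9
    T[:, 1, 2] = [0.1, 0.9]   # action 1 in state 2: stay with probability 0.9
    T[:, 2, 1] = [0.0, 1.0]   # action 2 deterministically switches the state
    T[:, 2, 2] = [1.0, 0.0]
    R = [0.0 -1.0;            # R[s, a]: reward for taking action a in state s
         1.0 -1.0]
    return TabularMDP(T, R, 0.95)
end
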
@doc raw"""
TabularPOMDP(T, R, O, discount)
Specify a discrete POMDP in tabular form.
- The transition matrix is 3 dimensional of size ``|\mathcal{S}|\times |\mathcal{A}| \times |\mathcal{S}|``, then `T[sj, a, si]` corresponds to the probability of ending in `sj` while taking action `a` in `si`.
- The observation matrix is also 3 dimensional of size ``|\mathcal{O}| \times |\mathcal{A}| \times |\mathcal{S}|``, `O[o, a, sp]` represents the probability of observing `o` in in state `sp` and action `a`.
- The reward matrix is 2 dimensional of size ``|\mathcal{S}|\times |\mathcal{A}|``, where `R[s, a]` is the reward obtained when taking action `a` in state `s`.
"""
mutable struct TabularPOMDP <: POMDP{Int64, Int64, Int64}
    T::Array{Float64, 3} # SPxAxS
    R::Matrix{Float64}   # SxA
    O::Array{Float64, 3} # OxAxSP
    discount::Float64
end
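
# Illustrative sketch: turning the tabular MDP above into a TabularPOMDP by
# adding a noisy observation tensor indexed `O[o, a, sp]`. Relies on the
# hypothetical `example_tabular_mdp` helper defined above.
function example_tabular_pomdp()
    mdp = example_tabular_mdp()
    O = zeros(2, 2, 2)
    for a in 1:2
        O[:, a, 1] = [0.8, 0.2]   # in state 1, observe 1 with probability 0.8
        O[:, a, 2] = [0.2, 0.8]   # in state 2, observe 2 with probability 0.8
    end
    return TabularPOMDP(mdp.T, mdp.R, O, mdp.discount)
end
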
const TabularProblem = Union{TabularMDP, TabularPOMDP}
# Distribution Type and methods
# XXX: this should be replaced with Categorical when https://github.com/JuliaStats/Distributions.jl/issues/743 is fixed
struct DiscreteDistribution{P<:AbstractVector{Float64}}
    p::P
end
support(d::DiscreteDistribution) = 1:length(d.p)
pdf(d::DiscreteDistribution, sp::Int64) = d.p[sp] # T(s', a, s)
rand(rng::AbstractRNG, d::DiscreteDistribution) = sample(rng, Weights(d.p))
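
# Quick sketch of how the distribution type behaves (assumes `Random` and
# `StatsBase` are loaded, as the methods above already require): `pdf` reads a
# probability straight out of the stored vector and `rand` draws a weighted index.
function example_discrete_distribution(rng::AbstractRNG)
    d = DiscreteDistribution([0.2, 0.8])
    return support(d), pdf(d, 2), rand(rng, d)   # (1:2, 0.8, 1 or 2)
end
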
# MDP and POMDP common methods
states(p::TabularProblem) = 1:size(p.T, 1)
actions(p::TabularProblem) = 1:size(p.T, 2)
stateindex(::TabularProblem, s::Int64) = s
actionindex(::TabularProblem, a::Int64) = a
discount(p::TabularProblem) = p.discount
transition(p::TabularProblem, s::Int64, a::Int64) = DiscreteDistribution(view(p.T, :, a, s))
reward(prob::TabularProblem, s::Int64, a::Int64) = prob.R[s, a]
initialstate(p::TabularProblem) = DiscreteDistribution(ones(length(states(p)))./length(states(p)))
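
# Illustrative sketch: querying the common interface on the hypothetical example
# MDP above. The transition distribution for (s, a) is just a view of the column
# T[:, a, s], so no copying happens.
function example_mdp_queries()
    mdp = example_tabular_mdp()
    d = transition(mdp, 1, 2)   # next-state distribution for action 2 in state 1
    return pdf(d, 2), reward(mdp, 1, 2), length(states(mdp))   # (1.0, -1.0, 2)
end
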
# POMDP only methods
n_observations(p::TabularPOMDP) = size(p.O, 1) # only TabularPOMDP has an O field
observations(p::TabularPOMDP) = 1:n_observations(p)
observation(p::TabularPOMDP, a::Int64, sp::Int64) = DiscreteDistribution(view(p.O, :, a, sp))
obsindex(p::TabularPOMDP, o::Int64) = o
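
# Illustrative sketch: the observation distribution after reaching sp under
# action a is the column O[:, a, sp] of the hypothetical example POMDP above.
function example_pomdp_observation()
    pomdp = example_tabular_pomdp()
    d = observation(pomdp, 1, 2)   # took action 1 and landed in state 2
    return pdf(d, 2)               # probability of observing 2, here 0.8
end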