diff --git a/Project.toml b/Project.toml index 6a8f8c6..f14a2b1 100644 --- a/Project.toml +++ b/Project.toml @@ -34,7 +34,6 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" YAML = "ddb6d928-2868-570f-bddf-ab3f9cf99eb6" [compat] -PackageCompiler = "2.1.16" ArgParse = "1.1.4" Arrow = "2.5.2" CSV = "0.10" @@ -55,7 +54,8 @@ MLJModels = "0.16" MLJXGBoostInterface = "0.3.4" MultipleTesting = "0.6.0" Optim = "1.7" -TMLE = "0.15.0" +PackageCompiler = "2.1.16" +TMLE = "0.16" Tables = "1.10.1" YAML = "0.4.9" julia = "1.7, 1" diff --git a/src/utils.jl b/src/utils.jl index e8ff7dc..071d8e2 100644 --- a/src/utils.jl +++ b/src/utils.jl @@ -152,27 +152,14 @@ function make_float!(dataset, colnames) end end -function coerce_types!(dataset, Ψ::ComposedEstimand) - for arg in Ψ.args - coerce_types!(dataset, arg) - end +function coerce_types!(dataset, colnames) + infered_types = autotype(dataset[!, colnames]) + coerce!(dataset, infered_types) end -function coerce_types!(dataset, Ψ) - # Make Treatments categorical but preserve order - categorical_variables = Set(keys(Ψ.treatment_values)) - make_categorical!(dataset, categorical_variables, infer_ordered=true) - # Make Confounders and extra covariates continuous - continuous_variables = Set(Iterators.flatten(values(Ψ.treatment_confounders))) - union!(continuous_variables, Ψ.outcome_extra_covariates) - make_float!(dataset, continuous_variables) - # Make outcome categorical if binary but do not infer order - if TMLE.is_binary(dataset, Ψ.outcome) - make_categorical!(dataset, Ψ.outcome, infer_ordered=false) - else - make_float!(dataset, Ψ.outcome) - end -end +coerce_types!(dataset, Ψ::TMLE.Estimand) = + coerce_types!(dataset, collect(variables(Ψ))) + variables(Ψ::TMLE.ComposedEstimand) = union((variables(arg) for arg in Ψ.args)...) diff --git a/test/Project.toml b/test/Project.toml index 440b866..6f45e7f 100644 --- a/test/Project.toml +++ b/test/Project.toml @@ -11,6 +11,7 @@ LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" MLJ = "add582a8-e3ab-11e8-2d5e-e98b27df1bc7" MLJBase = "a7f614a8-145f-11e9-1d2a-a57a1082229d" MLJLinearModels = "6ee0df7b-362f-4a72-a706-9e79364fb692" +MLJModels = "d491faf4-2d78-11e9-2867-c94bc002c0b7" MLJXGBoostInterface = "54119dfa-1dab-4055-a167-80440f4f7a91" Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c" Serialization = "9e88b42a-f829-5b0c-bbe9-9e923198166b" diff --git a/test/runner.jl b/test/runner.jl index daece69..52b6056 100644 --- a/test/runner.jl +++ b/test/runner.jl @@ -9,6 +9,7 @@ using Serialization using YAML using JSON using MLJBase +using MLJModels PKGDIR = pkgdir(TargetedEstimation) TESTDIR = joinpath(PKGDIR, "test") @@ -29,7 +30,8 @@ include(joinpath(TESTDIR, "testutils.jl")) @test estimators.OSE isa TMLE.OSE @test estimators.TMLE.weighted === true @test estimators.TMLE.models.G_default === estimators.OSE.models.G_default - @test estimators.TMLE.models.G_default isa MLJBase.ProbabilisticStack + @test estimators.TMLE.models.G_default.continuous_encoder isa MLJModels.ContinuousEncoder + @test estimators.TMLE.models.G_default.probabilistic_stack isa MLJBase.ProbabilisticStack # From already constructed estimators estimators_new = TargetedEstimation.instantiate_estimators(estimators) @test estimators_new === estimators diff --git a/test/testutils.jl b/test/testutils.jl index 99adfd6..db9f450 100644 --- a/test/testutils.jl +++ b/test/testutils.jl @@ -114,17 +114,17 @@ function build_dataset(;n=1000, format="csv") dataset = DataFrame( SAMPLE_ID = 1:n, - T1 = categorical(T₁), - T2 = categorical(T₂), + T1 = T₁, + T2 = T₂, W1 = W₁, W2 = W₂, C1 = C₁, ) # Comma in name - dataset[!, "CONTINUOUS, OUTCOME"] = categorical(y₁) + dataset[!, "CONTINUOUS, OUTCOME"] = y₁ # Slash in name - dataset[!, "BINARY/OUTCOME"] = categorical(y₂) - dataset[!, "EXTREME_BINARY"] = categorical(vcat(0, ones(n-1))) + dataset[!, "BINARY/OUTCOME"] = y₂ + dataset[!, "EXTREME_BINARY"] = vcat(0, ones(n-1)) return dataset end diff --git a/test/utils.jl b/test/utils.jl index 077c2c7..9a31afb 100644 --- a/test/utils.jl +++ b/test/utils.jl @@ -7,6 +7,7 @@ using DataFrames using CSV using MLJLinearModels using CategoricalArrays +using MLJBase check_type(treatment_value, ::Type{T}) where T = @test treatment_value isa T @@ -74,21 +75,21 @@ end ) dataset = DataFrame( - Ycont = [1.1, 2.2, missing], - Ycat = [1., 0., missing], - T₁ = [1, 0, missing], - T₂ = [missing, "AC", "CC"], - W₁ = [1., 0., 0.], - W₂ = [missing, 0., 0.], - C = [1, 2, 3] + Ycont = [1.1, 2.2, missing, 3.5, 6.6, 0., 4.], + Ycat = [1., 0., missing, 1., 0, 0, 0], + T₁ = [1, 0, missing, 0, 0, 0, missing], + T₂ = [missing, "AC", "CC", "CC", missing, "AA", "AA"], + W₁ = [1., 0., 0., 1., 0., 1, 1], + W₂ = [missing, 0., 0., 0., 0., 0., 0.], + C = [1, 2, 3, 4, 5, 6, 6] ) TargetedEstimation.coerce_types!(dataset, Ψ) - @test dataset.T₁ isa CategoricalArray - @test dataset.T₂ isa CategoricalArray - for var in [:W₁, :W₂, :Ycont] - @test eltype(dataset[!, var]) <: Union{Missing, Float64} - end + @test scitype(dataset.T₁) == AbstractVector{Union{Missing, OrderedFactor{2}}} + @test scitype(dataset.T₂) == AbstractVector{Union{Missing, Multiclass{3}}} + @test scitype(dataset.Ycont) == AbstractVector{Union{Missing, MLJBase.Continuous}} + @test scitype(dataset.W₁) == AbstractVector{OrderedFactor{2}} + @test scitype(dataset.W₂) == AbstractVector{Union{Missing, OrderedFactor{1}}} Ψ = IATE( outcome=:Ycat, @@ -98,8 +99,8 @@ end ) TargetedEstimation.coerce_types!(dataset, Ψ) - @test dataset.Ycat isa CategoricalArray - @test eltype(dataset.C) <: Union{Missing, Float64} + @test scitype(dataset.Ycat) == AbstractVector{Union{Missing, OrderedFactor{2}}} + @test scitype(dataset.C) == AbstractVector{Count} end @testset "Test misc" begin