Skip to content

Commit

Permalink
Merge pull request #2 from MariaHei/deep_learning
Browse files Browse the repository at this point in the history
Add support for DDL models
  • Loading branch information
MariaHei authored Apr 5, 2024
2 parents 1f3b94c + 65d6a77 commit d112512
Show file tree
Hide file tree
Showing 6 changed files with 152 additions and 14 deletions.
2 changes: 2 additions & 0 deletions .github/workflows/CI.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,8 @@ jobs:
version:
- '1.7'
- '1.8'
- '1.9'
- '1.10'
os:
- ubuntu-latest
- macOS-latest
Expand Down
1 change: 0 additions & 1 deletion Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ version = "0.1.0"
Conda = "8f4d0f93-b110-5947-807f-2305c1781a2d"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Distances = "b4f34e82-e78d-54a5-968a-f98e89d6e8f7"
Formatting = "59287772-0a20-5a39-b81b-1366585eb4c0"
JudiLing = "b43a184b-0e9d-488b-813a-80fd5dbc9fd8"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
Plots = "91a5bcdd-55d7-5caf-9e0b-520d859cae80"
Expand Down
2 changes: 0 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,6 @@ This is code for JudiLingMeasures. Most measures are based on R implementations

You can find the documentation [here](https://mariahei.github.io/JudiLingMeasures.jl/dev/index.html).

PLEASE NOTE THAT THIS PACKAGE IS WORK IN PROGRESS. MAJOR CHANGES TO THE CODE ARE POSSIBLE AT ANY POINT AND NEW MEASURES ARE STILL BEING ADDED.

## Installation

```
Expand Down
135 changes: 126 additions & 9 deletions src/helpers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ julia> sem_density_mean(cor_s, 2)
function sem_density_mean(s_cor::Union{JudiLing.SparseMatrixCSC, Matrix},
n::Int)
if n > size(s_cor,2)
throw(MethodError("n larger than the dimension of the semantic vectors"))
throw(ArgumentError("n larger than the dimension of the semantic vectors"))
end
sems = Vector{Union{Missing, Float32}}(missing, size(s_cor,1))
for i in 1:size(s_cor)[1]
Expand Down Expand Up @@ -477,8 +477,8 @@ function compute_all_measures_train(data_train::DataFrame,
Chat_train::Union{JudiLing.SparseMatrixCSC, Matrix},
S_train::Union{JudiLing.SparseMatrixCSC, Matrix},
Shat_train::Union{JudiLing.SparseMatrixCSC, Matrix},
F_train::Union{JudiLing.SparseMatrixCSC, Matrix},
G_train::Union{JudiLing.SparseMatrixCSC, Matrix};
F_train::Union{JudiLing.SparseMatrixCSC, Matrix, Missing},
G_train::Union{JudiLing.SparseMatrixCSC, Matrix, Missing};
res_learn_train::Union{Array{Array{JudiLing.Result_Path_Info_Struct,1},1}, Missing}=missing,
gpi_learn_train::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing,
rpi_learn_train::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing,
Expand Down Expand Up @@ -514,7 +514,7 @@ function compute_all_measures_train(data_train::DataFrame,
results[!,"EDNN"] = EDNN(Shat_train, S_train)
results[!,"NNC"] = NNC(cor_s)

if !low_cost_measures_only
if !low_cost_measures_only && !ismissing(F_train)
results[!,"DistanceTravelledF"] = total_distance(cue_obj_train, F_train, :F)
end

Expand All @@ -531,7 +531,7 @@ function compute_all_measures_train(data_train::DataFrame,
if calculate_production_uncertainty && !low_cost_measures_only
results[!,"ProductionUncertainty"] = vec(uncertainty(cue_obj_train.C, Chat_train, method="cosine"))
end
if !low_cost_measures_only
if !low_cost_measures_only && !ismissing(G_train)
results[!,"DistanceTravelledG"] = total_distance(cue_obj_train, G_train, :G)
end

Expand Down Expand Up @@ -567,6 +567,61 @@ function compute_all_measures_train(data_train::DataFrame,
results
end

"""
function compute_all_measures_train(data_train::DataFrame,
cue_obj_train::JudiLing.Cue_Matrix_Struct,
Chat_train::Union{JudiLing.SparseMatrixCSC, Matrix},
S_train::Union{JudiLing.SparseMatrixCSC, Matrix},
Shat_train::Union{JudiLing.SparseMatrixCSC, Matrix};
res_learn_train::Union{Array{Array{JudiLing.Result_Path_Info_Struct,1},1}, Missing}=missing,
gpi_learn_train::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing,
rpi_learn_train::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing,
sem_density_n::Int64=8,
calculate_production_uncertainty::Bool=false,
low_cost_measures_only::Bool=false)
Compute all measures currently available in JudiLingMeasures for the training data if F and G are not available (usually for DDL models).
# Arguments
- `data_train::DataFrame`: The data for which measures should be calculated (the training data).
- `cue_obj_train::JudiLing.Cue_Matrix_Struct`: The cue object of the training data.
- `Chat_train::Union{JudiLing.SparseMatrixCSC, Matrix}`: The Chat matrix of the training data.
- `S_train::Union{JudiLing.SparseMatrixCSC, Matrix}`: The S matrix of the training data.
- `Shat_train::Union{JudiLing.SparseMatrixCSC, Matrix}`: The Shat matrix of the training data.
- `res_learn_train::Union{Array{Array{JudiLing.Result_Path_Info_Struct,1},1}, Missing}=missing`: The first output of JudiLing.learn_paths_rpi (with `check_gold_path=true`)
- `gpi_learn_train::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing`: The second output of JudiLing.learn_paths_rpi (with `check_gold_path=true`)
- `rpi_learn_train::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing`: The third output of JudiLing.learn_paths_rpi (with `check_gold_path=true`)
- `sem_density_n::Int64=8`: Number of neighbours to take into account in Semantic Density measure.
- `calculate_production_uncertainty`: "Production Uncertainty" is computationally very heavy for large C matrices, therefore its computation is turned off by default.
- `low_cost_measures_only::Bool=false`: Only compute measures which are not computationally heavy. Recommended for very large datasets.
# Returns
- `results::DataFrame`: A dataframe with all information in `data_train` plus all the computed measures.
"""
function compute_all_measures_train(data_train::DataFrame,
                                    cue_obj_train::JudiLing.Cue_Matrix_Struct,
                                    Chat_train::Union{JudiLing.SparseMatrixCSC, Matrix},
                                    S_train::Union{JudiLing.SparseMatrixCSC, Matrix},
                                    Shat_train::Union{JudiLing.SparseMatrixCSC, Matrix};
                                    res_learn_train::Union{Array{Array{JudiLing.Result_Path_Info_Struct,1},1}, Missing}=missing,
                                    gpi_learn_train::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing,
                                    rpi_learn_train::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing,
                                    sem_density_n::Int64=8,
                                    calculate_production_uncertainty::Bool=false,
                                    low_cost_measures_only::Bool=false)
    # This method is called without F and G mapping matrices, so `missing` stands in
    # for both when delegating to the method that takes them as positional arguments.
    F_train = missing
    G_train = missing

    # Every keyword option is handed through unchanged.
    options = (res_learn_train=res_learn_train,
               gpi_learn_train=gpi_learn_train,
               rpi_learn_train=rpi_learn_train,
               sem_density_n=sem_density_n,
               calculate_production_uncertainty=calculate_production_uncertainty,
               low_cost_measures_only=low_cost_measures_only)

    return compute_all_measures_train(data_train, cue_obj_train, Chat_train, S_train,
                                      Shat_train, F_train, G_train; options...)
end

"""
function compute_all_measures_val(data_val::DataFrame,
cue_obj_train::JudiLing.Cue_Matrix_Struct,
Expand Down Expand Up @@ -608,8 +663,8 @@ function compute_all_measures_val(data_val::DataFrame,
S_train::Union{JudiLing.SparseMatrixCSC, Matrix},
S_val::Union{JudiLing.SparseMatrixCSC, Matrix},
Shat_val::Union{JudiLing.SparseMatrixCSC, Matrix},
F_train::Union{JudiLing.SparseMatrixCSC, Matrix},
G_train::Union{JudiLing.SparseMatrixCSC, Matrix};
F_train::Union{JudiLing.SparseMatrixCSC, Matrix, Missing},
G_train::Union{JudiLing.SparseMatrixCSC, Matrix, Missing};
res_learn_val::Union{Array{Array{JudiLing.Result_Path_Info_Struct,1},1}, Missing}=missing,
gpi_learn_val::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing,
rpi_learn_val::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing,
Expand Down Expand Up @@ -644,7 +699,7 @@ function compute_all_measures_val(data_val::DataFrame,
results[!,"ALC"] = JudiLingMeasures.ALC(cor_s)
results[!,"EDNN"] = EDNN(Shat_val, S_val, S_train)
results[!,"NNC"] = JudiLingMeasures.NNC(cor_s)
if !low_cost_measures_only
if !low_cost_measures_only && !ismissing(F_train)
results[!,"DistanceTravelledF"] = total_distance(cue_obj_val, F_train, :F)
end

Expand All @@ -660,7 +715,7 @@ function compute_all_measures_val(data_val::DataFrame,
if calculate_production_uncertainty && !low_cost_measures_only
results[!,"ProductionUncertainty"] = vec(JudiLingMeasures.uncertainty(cue_obj_val.C, Chat_val, cue_obj_train.C, method="cosine"))
end
if !low_cost_measures_only
if !low_cost_measures_only && !ismissing(G_train)
results[!,"DistanceTravelledG"] = JudiLingMeasures.total_distance(cue_obj_val, G_train, :G)
end

Expand Down Expand Up @@ -696,6 +751,68 @@ function compute_all_measures_val(data_val::DataFrame,
results
end

"""
function compute_all_measures_val(data_val::DataFrame,
cue_obj_train::JudiLing.Cue_Matrix_Struct,
cue_obj_val::JudiLing.Cue_Matrix_Struct,
Chat_val::Union{JudiLing.SparseMatrixCSC, Matrix},
S_train::Union{JudiLing.SparseMatrixCSC, Matrix},
S_val::Union{JudiLing.SparseMatrixCSC, Matrix},
Shat_val::Union{JudiLing.SparseMatrixCSC, Matrix};
res_learn_val::Union{Array{Array{JudiLing.Result_Path_Info_Struct,1},1}, Missing}=missing,
gpi_learn_val::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing,
rpi_learn_val::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing,
sem_density_n::Int64=8,
calculate_production_uncertainty::Bool=false,
low_cost_measures_only::Bool=false)
Compute all measures currently available in JudiLingMeasures for the validation data if F and G are not available (usually for DDL models).
# Arguments
- `data_val::DataFrame`: The data for which measures should be calculated (the validation data).
- `cue_obj_train::JudiLing.Cue_Matrix_Struct`: The cue object of the training data.
- `cue_obj_val::JudiLing.Cue_Matrix_Struct`: The cue object of the validation data.
- `Chat_val::Union{JudiLing.SparseMatrixCSC, Matrix}`: The Chat matrix of the validation data.
- `S_train::Union{JudiLing.SparseMatrixCSC, Matrix}`: The S matrix of the training data.
- `S_val::Union{JudiLing.SparseMatrixCSC, Matrix}`: The S matrix of the validation data.
- `Shat_val::Union{JudiLing.SparseMatrixCSC, Matrix}`: The Shat matrix of the validation data.
- `res_learn_val::Union{Array{Array{JudiLing.Result_Path_Info_Struct,1},1}, Missing}=missing`: The first output of JudiLing.learn_paths_rpi (with `check_gold_path=true`)
- `gpi_learn_val::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing`: The second output of JudiLing.learn_paths_rpi (with `check_gold_path=true`)
- `rpi_learn_val::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing`: The third output of JudiLing.learn_paths_rpi (with `check_gold_path=true`)
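- `sem_density_n::Int64=8`: Number of neighbours to take into account in Semantic Density measure.
- `calculate_production_uncertainty::Bool=false`: "Production Uncertainty" is computationally very heavy for large C matrices, therefore its computation is turned off by default.
- `low_cost_measures_only::Bool=false`: Only compute measures which are not computationally heavy. Recommended for very large datasets.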
# Returns
- `results::DataFrame`: A dataframe with all information in `data_val` plus all the computed measures.
"""
function compute_all_measures_val(data_val::DataFrame,
                                  cue_obj_train::JudiLing.Cue_Matrix_Struct,
                                  cue_obj_val::JudiLing.Cue_Matrix_Struct,
                                  Chat_val::Union{JudiLing.SparseMatrixCSC, Matrix},
                                  S_train::Union{JudiLing.SparseMatrixCSC, Matrix},
                                  S_val::Union{JudiLing.SparseMatrixCSC, Matrix},
                                  Shat_val::Union{JudiLing.SparseMatrixCSC, Matrix};
                                  res_learn_val::Union{Array{Array{JudiLing.Result_Path_Info_Struct,1},1}, Missing}=missing,
                                  gpi_learn_val::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing,
                                  rpi_learn_val::Union{Array{JudiLing.Gold_Path_Info_Struct,1}, Missing}=missing,
                                  sem_density_n::Int64=8,
                                  calculate_production_uncertainty::Bool=false,
                                  low_cost_measures_only::Bool=false)
    # This method is called without F and G mapping matrices, so `missing` stands in
    # for both when delegating to the method that takes them as positional arguments.
    F_train = missing
    G_train = missing

    # Every keyword option is handed through unchanged.
    options = (res_learn_val=res_learn_val,
               gpi_learn_val=gpi_learn_val,
               rpi_learn_val=rpi_learn_val,
               sem_density_n=sem_density_n,
               calculate_production_uncertainty=calculate_production_uncertainty,
               low_cost_measures_only=low_cost_measures_only)

    return compute_all_measures_val(data_val, cue_obj_train, cue_obj_val, Chat_val,
                                    S_train, S_val, Shat_val, F_train, G_train;
                                    options...)
end


function safe_divide(x, y)
if (y != 0) & (!ismissing(y)) & (!ismissing(x))
x/y
Expand Down
2 changes: 1 addition & 1 deletion test/test_helpers.jl
Original file line number Diff line number Diff line change
Expand Up @@ -45,7 +45,7 @@ end
cs = JudiLingMeasures.correlation_rowwise([1. 2. 3.], [5. 1. 19.])
@test isapprox(vec(JudiLingMeasures.sem_density_mean(cs,1)),
vec([0.7406128966515281]))
@test_throws MethodError JudiLingMeasures.sem_density_mean(cs,5)
@test_throws ArgumentError JudiLingMeasures.sem_density_mean(cs,5)
cs = JudiLingMeasures.correlation_rowwise(ma2, ma3)
@test isapprox(vec(JudiLingMeasures.sem_density_mean(cs, 3)),
vec([0.550947 0.28884766666666667 0.19702316666666667 0.15713063333333335]), rtol=1e-4)
Expand Down
24 changes: 23 additions & 1 deletion test/test_measures.jl
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ cor_s2 = JudiLingMeasures.correlation_rowwise(ma2, ma5)
@test JudiLingMeasures.density(zeros((1,1)), n=1) == [0]
@test JudiLingMeasures.density(ones((1,1)), n=1) == [1]
@test isequal(JudiLingMeasures.density([[1 2 missing]; [-1 -2 -3]; [1 2 3]], n=2), [missing; -1.5; 2.5])
@test_throws MethodError JudiLingMeasures.density(zeros((1,1))) == [0]
@test_throws ArgumentError JudiLingMeasures.density(zeros((1,1))) == [0]
end
@testset "Validation data" begin
@test isapprox(JudiLingMeasures.density(cor_s2, n=2), vec([0.7393784999999999 0.6420815 0.44968675 0.2811505]), rtol=1e-4)
Expand Down Expand Up @@ -644,6 +644,16 @@ end
@test "DistanceTravelledF" in names(all_measures)
@test "DistanceTravelledG" in names(all_measures)
@test !("WithinPathEntropies" in names(all_measures))

all_measures = JudiLingMeasures.compute_all_measures_train(dat, # the data of interest
cue_obj, # the cue_obj of the training data
Chat, # the Chat of the data of interest
S, # the S matrix of the data of interest
Shat, # the Shat matrix of the data of interest
sem_density_n=2)
@test all_measures != 1
@test !("DistanceTravelledF" in names(all_measures))
@test !("DistanceTravelledG" in names(all_measures))
end
@testset "Validation data" begin
# just make sure that this function runs without error
Expand Down Expand Up @@ -710,5 +720,17 @@ end
@test "DistanceTravelledF" in names(all_measures)
@test "DistanceTravelledG" in names(all_measures)
@test !("WithinPathEntropies" in names(all_measures))

all_measures = JudiLingMeasures.compute_all_measures_val(val_dat, # the data of interest
cue_obj, # the cue_obj of the training data
cue_obj_val,
Chat_val, # the Chat of the data of interest
S, # the S matrix of the data of interest
S_val,
Shat_val,
sem_density_n=2)
@test all_measures != 1
@test !("DistanceTravelledF" in names(all_measures))
@test !("DistanceTravelledG" in names(all_measures))
end
end

0 comments on commit d112512

Please sign in to comment.