From 3b5ec57375a87d486292a25ea2835ed5b6eaa543 Mon Sep 17 00:00:00 2001 From: MariaHei Date: Fri, 10 May 2024 14:55:34 +0100 Subject: [PATCH 01/12] Allow manual trigger of ci --- .github/workflows/ci.yml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d688c43..8bfebb2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,6 +2,7 @@ name: CI on: - push - pull_request + - workflow_dispatch jobs: test: name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} From 9a4e87aa03ac01f931baa58176a1912e8df47dd4 Mon Sep 17 00:00:00 2001 From: MariaHei Date: Mon, 13 May 2024 09:29:47 +0100 Subject: [PATCH 02/12] Don't run pyndl tests on windows --- test/runtests.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/runtests.jl b/test/runtests.jl index 5f4a753..23ef8a6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,9 @@ using SafeTestsets -@safetestset "pyndl tests" begin - include("pyndl_tests.jl") +if !Sys.iswindows() + @safetestset "pyndl tests" begin + include("pyndl_tests.jl") + end end @safetestset "input tests" begin From 121930e77e88155c164fa5debf8c492717a80cf6 Mon Sep 17 00:00:00 2001 From: MariaHei Date: Fri, 17 May 2024 12:06:53 +0100 Subject: [PATCH 03/12] Add make_combined_pS_matrix Fixes #111 Add more tests for make_pS_matrix and refactor variables in there --- docs/src/man/make_semantic_matrix.md | 10 ++- src/make_semantic_matrix.jl | 99 +++++++++++++++++++++++----- test/make_semantic_matrix_tests.jl | 33 ++++++++++ 3 files changed, 124 insertions(+), 18 deletions(-) diff --git a/docs/src/man/make_semantic_matrix.md b/docs/src/man/make_semantic_matrix.md index a058b6b..7a89b94 100644 --- a/docs/src/man/make_semantic_matrix.md +++ b/docs/src/man/make_semantic_matrix.md @@ -9,8 +9,14 @@ CurrentModule = JudiLing ```@docs PS_Matrix_Struct make_pS_matrix - make_pS_matrix(utterances) - make_pS_matrix(utterances, utterances_train) + make_pS_matrix(data) + make_pS_matrix(data_val, pS_obj) + make_combined_pS_matrix( + data_train, + data_val; + features_col = :CommunicativeIntention, + sep_token = "_", + ) ``` ## Simulate semantic vectors diff --git a/src/make_semantic_matrix.jl b/src/make_semantic_matrix.jl index 948437d..7c7f027 100644 --- a/src/make_semantic_matrix.jl +++ b/src/make_semantic_matrix.jl @@ -49,7 +49,7 @@ Make combined simulated Lexome matrix, where combined features from both trainin function make_combined_L_matrix end """ - make_pS_matrix(utterances) + make_pS_matrix(data) Create a discrete semantic matrix given a dataframe. 
@@ -69,27 +69,27 @@ s_obj_train = JudiLing.make_pS_matrix( ``` """ function make_pS_matrix( - utterances; + data; features_col = :CommunicativeIntention, sep_token = "_", ) # find out all possible features in this dataset - features = unique(vcat(split.(utterances[:, features_col], sep_token)...)) + features = unique(vcat(split.(data[:, features_col], sep_token)...)) # using dict to store feature names f2i = Dict(v => i for (i, v) in enumerate(features)) i2f = Dict(i => v for (i, v) in enumerate(features)) # find out features for each utterance - vs = unique.(split.(utterances[:, features_col], sep_token)) + vs = unique.(split.(data[:, features_col], sep_token)) # total number of feature in the entire dataset # to initialize a sparse matrix n_f = sum([length(v) for v in vs]) # initialize sparse matrix components - m = size(utterances, 1) + m = size(data, 1) n = length(i2f) I = zeros(Int64, n_f) J = zeros(Int64, n_f) @@ -112,14 +112,14 @@ function make_pS_matrix( end """ - make_pS_matrix(utterances, utterances_train) + make_pS_matrix(data_val, pS_obj) Construct discrete semantic matrix for the validation datasets given by the exemplar in the dataframe, and given the S matrix for the training datasets. # Obligatory Arguments -- `utterances::DataFrame`: the dataset -- `utterances_train::PS_Matrix_Struct`: training PS object +- `data_val::DataFrame`: the dataset +- `pS_obj::PS_Matrix_Struct`: training PS object # Optional Arguments - `features_col::Symbol=:CommunicativeIntention`: the column name for target @@ -128,35 +128,35 @@ exemplar in the dataframe, and given the S matrix for the training datasets. # Examples ```julia s_obj_val = JudiLing.make_pS_matrix( - utterance_val, + data_val, s_obj_train, features_col=:CommunicativeIntention, sep_token="_") ``` """ function make_pS_matrix( - utterances, - utterances_train; + data_val, + pS_obj; features_col = :CommunicativeIntention, sep_token = "_", ) # find out all possible features in this dataset - features = unique(vcat(split.(utterances[:, features_col], sep_token)...)) + features = unique(vcat(split.(data_val[:, features_col], sep_token)...)) # using dict to store feature names - f2i = utterances_train.f2i - i2f = utterances_train.i2f + f2i = pS_obj.f2i + i2f = pS_obj.i2f # find out features for each utterance - vs = unique.(split.(utterances[:, features_col], sep_token)) + vs = unique.(split.(data_val[:, features_col], sep_token)) # total number of feature in the entire dataset # to initialize a sparse matrix n_f = sum([length(v) for v in vs]) # initialize sparse matrix components - m = size(utterances, 1) + m = size(data_val, 1) n = length(i2f) I = zeros(Int64, n_f) J = zeros(Int64, n_f) @@ -178,6 +178,73 @@ function make_pS_matrix( PS_Matrix_Struct(pS, f2i, i2f) end + +""" + make_combined_pS_matrix( + data_train, + data_val; + features_col = :CommunicativeIntention, + sep_token = "_", + ) + +Create discrete semantic matrices for a train and validation dataframe. 
+ +# Obligatory Arguments +- `data_train::DataFrame`: the training dataset +- `data_val::DataFrame`: the validation dataset + +# Optional Arguments +- `features_col::Symbol=:CommunicativeIntention`: the column name for target +- `sep_token::String="_"`: separator + +# Examples +```julia +s_obj_train, s_obj_val = JudiLing.make_combined_pS_matrix( + data_train, + data_val, + features_col=:CommunicativeIntention, + sep_token="_") +``` +""" +function make_combined_pS_matrix( + data_train, + data_val; + features_col = :CommunicativeIntention, + sep_token = "_", +) + + data_combined = copy(data_train) + data_val = copy(data_val) + for col in names(data_combined) + data_combined[!, col] = inlinestring2string.(data_combined[!,col]) + data_val[!, col] = inlinestring2string.(data_val[!,col]) + end + append!(data_combined, data_val, promote=true) + + pS_obj_combined = make_pS_matrix( + data_combined; + features_col = features_col, + sep_token = sep_token, + ) + + pS_obj_train = make_pS_matrix( + data_train, + pS_obj_combined; + features_col = features_col, + sep_token = sep_token, + ) + + pS_obj_val = make_pS_matrix( + data_val, + pS_obj_combined; + features_col = features_col, + sep_token = sep_token, + ) + + return pS_obj_train, pS_obj_val +end + + """ make_S_matrix(data::DataFrame, base::Vector, inflections::Vector) diff --git a/test/make_semantic_matrix_tests.jl b/test/make_semantic_matrix_tests.jl index 3a2902f..e40537f 100644 --- a/test/make_semantic_matrix_tests.jl +++ b/test/make_semantic_matrix_tests.jl @@ -4,15 +4,48 @@ using Test using DataFrames @testset "make prelinguistic semantic matrix for utterance" begin + + function test_cues(features, idx, s_obj) + some_cues = split(features[idx], "_") + tgt_vec = zeros(length(keys(s_obj.f2i))) + for cue in some_cues + i = s_obj.f2i[cue] + tgt_vec[i] = 1 + end + @test tgt_vec == s_obj.pS[idx,:] + end + utterance = DataFrame(CSV.File(joinpath( @__DIR__, "data", "utterance_mini.csv", ))) + + cues_split = [split(d, "_") for d in utterance.CommunicativeIntention] + unique_cues = Set(vcat(cues_split...)) + s_obj_train = JudiLing.make_pS_matrix(utterance) + @test length(unique_cues) == size(s_obj_train.pS, 2) + test_cues(utterance.CommunicativeIntention, 5, s_obj_train) + test_cues(utterance.CommunicativeIntention, 15, s_obj_train) + test_cues(utterance.CommunicativeIntention, 23, s_obj_train) + utterance_val = utterance[101:end, :] s_obj_val = JudiLing.make_pS_matrix(utterance_val, s_obj_train) + test_cues(utterance_val.CommunicativeIntention, 5, s_obj_val) + test_cues(utterance_val.CommunicativeIntention, 6, s_obj_val) + + @test length(unique_cues) == size(s_obj_val.pS, 2) + + s_obj_train, s_obj_val = JudiLing.make_combined_pS_matrix(utterance, utterance_val) + + @test length(unique_cues) == size(s_obj_train.pS, 2) + test_cues(utterance.CommunicativeIntention, 5, s_obj_train) + test_cues(utterance.CommunicativeIntention, 15, s_obj_train) + test_cues(utterance.CommunicativeIntention, 23, s_obj_train) + test_cues(utterance_val.CommunicativeIntention, 5, s_obj_val) + test_cues(utterance_val.CommunicativeIntention, 6, s_obj_val) end @testset "make semantic matrix" begin From f074823116bc9ef3bc41088452acb41ffb4208bb Mon Sep 17 00:00:00 2001 From: MariaHei Date: Fri, 17 May 2024 14:49:09 +0100 Subject: [PATCH 04/12] Add documentation and tests to loading_data functions Fixes #114 --- docs/src/man/input.md | 23 +++++ src/input.jl | 217 ++++++++++++++++++++++++++++++++++++++++++ src/test_combo.jl | 68 ------------- test/input_tests.jl | 77 ++++++++++++++- 4 
files changed, 315 insertions(+), 70 deletions(-)

diff --git a/docs/src/man/input.md b/docs/src/man/input.md
index f7e9fa6..b657669 100644
--- a/docs/src/man/input.md
+++ b/docs/src/man/input.md
@@ -8,4 +8,27 @@ CurrentModule = JudiLing
     load_dataset(filepath::String;
                 delim::String=",",
                 kargs...)
+loading_data_randomly_split(
+    data_path::String,
+    output_dir_path::String,
+    data_prefix::String;
+    val_sample_size::Int = 0,
+    val_ratio::Float64 = 0.0,
+    random_seed::Int = 314)
+loading_data_careful_split(
+    data_path::String,
+    data_prefix::String,
+    output_dir_path::String,
+    n_features_columns::Union{Vector{Symbol},Vector{String}};
+    train_sample_size::Int = 0,
+    val_sample_size::Int = 0,
+    val_ratio::Float64 = 0.0,
+    n_grams_target_col::Union{Symbol, String} = :Word,
+    n_grams_tokenized::Bool = false,
+    n_grams_sep_token::Union{Nothing, String} = nothing,
+    grams::Int = 3,
+    n_grams_keep_sep::Bool = false,
+    start_end_token::String = "#",
+    random_seed::Int = 314,
+    verbose::Bool = false)
 ```
diff --git a/src/input.jl b/src/input.jl
index 9606a20..acdd278 100644
--- a/src/input.jl
+++ b/src/input.jl
@@ -24,3 +24,220 @@ function load_dataset(filepath::String;
     kargs...)
     return(DataFrame(CSV.File(filepath, stringtype=String, delim=delim; kargs...)))
 end
+
+
+"""
+    load_data_random_split(args...; kargs...)
+
+Alias for loading_data_randomly_split
+"""
+function load_data_random_split(
+    args...; kargs...)
+    JudiLing.loading_data_randomly_split(args...; kargs...)
+end
+
+"""
+    loading_data_random_split(args...; kargs...)
+
+Alias for loading_data_randomly_split
+"""
+function loading_data_random_split(
+    args...; kargs...)
+    JudiLing.loading_data_randomly_split(args...; kargs...)
+end
+
+"""
+    load_data_randomly_split(args...; kargs...)
+
+Alias for loading_data_randomly_split
+"""
+function load_data_randomly_split(
+    args...; kargs...)
+    JudiLing.loading_data_randomly_split(args...; kargs...)
+end
+
+"""
+    loading_data_randomly_split(
+        data_path::String,
+        output_dir_path::String,
+        data_prefix::String;
+        val_sample_size::Int = 0,
+        val_ratio::Float64 = 0.0,
+        random_seed::Int = 314)
+
+Read in a dataset and split it randomly into a training and validation dataset. The two splits are also written to `output_dir_path` at the same time.
+
+!!! note
+    The order of `data_prefix` and `output_dir_path` is exactly reversed compared to `loading_data_careful_split`.
+
+
+# Obligatory arguments
+- `data_path::String`: Path to where the dataset is stored.
+- `output_dir_path::String`: Path to where the new dataframes should be stored.
+- `data_prefix::String`: Prefix of the two new files, will be called `data_prefix_train.csv` and `data_prefix_val.csv`.
+
+# Optional arguments
+- `val_sample_size::Int = 0`: Size of the validation dataset (only `val_sample_size` or `val_ratio` may be used).
+- `val_ratio::Float64 = 0.0`: Fraction of the data that should be in the validation dataset (only `val_sample_size` or `val_ratio` may be used).
+- `random_seed::Int = 314`: Random seed for controlling the random split.
+
+# Example
+```
+data_train, data_val = JudiLing.loading_data_randomly_split(
+    "latin.csv",
+    "random",
+    "latin",
+    val_ratio = 0.1
+)
+```
+"""
+function loading_data_randomly_split(
+    data_path::String,
+    output_dir_path::String,
+    data_prefix::String;
+    train_sample_size::Int = 0,
+    val_sample_size::Int = 0,
+    val_ratio::Float64 = 0.0,
+    random_seed::Int = 314,
+    verbose::Bool = false)
+    verbose && println("Splitting data...")
+
+    train_val_random_split(
+        data_path,
+        output_dir_path,
+        data_prefix,
+        train_sample_size = train_sample_size,
+        val_sample_size = val_sample_size,
+        val_ratio = val_ratio,
+        random_seed = random_seed,
+        verbose = verbose,
+    )
+
+    # load data
+    verbose && println("Loading CSV...")
+    loading_data_pre_split(output_dir_path, data_prefix)
+end
+
+"""
+    load_data_careful_split(args...; kargs...)
+
+Alias for loading_data_careful_split
+"""
+function load_data_careful_split(
+    args...; kargs...)
+    JudiLing.loading_data_careful_split(args...; kargs...)
+end
+
+"""
+    load_data_carefully_split(args...; kargs...)
+
+Alias for loading_data_careful_split
+"""
+function load_data_carefully_split(
+    args...; kargs...)
+    JudiLing.loading_data_careful_split(args...; kargs...)
+end
+
+"""
+    loading_data_carefully_split(args...; kargs...)
+
+Alias for loading_data_careful_split
+"""
+function loading_data_carefully_split(
+    args...; kargs...)
+    JudiLing.loading_data_careful_split(args...; kargs...)
+end
+
+"""
+    loading_data_careful_split(
+        data_path::String,
+        data_prefix::String,
+        output_dir_path::String,
+        n_features_columns::Union{Vector{Symbol},Vector{String}};
+        train_sample_size::Int = 0,
+        val_sample_size::Int = 0,
+        val_ratio::Float64 = 0.0,
+        n_grams_target_col::Union{Symbol, String} = :Word,
+        n_grams_tokenized::Bool = false,
+        n_grams_sep_token::Union{Nothing, String} = nothing,
+        grams::Int = 3,
+        n_grams_keep_sep::Bool = false,
+        start_end_token::String = "#",
+        random_seed::Int = 314,
+        verbose::Bool = false)
+
+Read in a dataset and split it into a training and validation dataset. The split is done such that all features in the columns specified
+in `n_features_columns` occur both in the training and validation data. It is also ensured that the unique grams resulting from splitting the strings in column
+`n_grams_target_col` into `grams`-grams occur in both datasets.
+The two splits are also written to `output_dir_path` at the same time.
+
+!!! note
+    The order of `data_prefix` and `output_dir_path` is exactly reversed compared to `loading_data_randomly_split`.
+
+# Obligatory arguments
+- `data_path::String`: Path to where the dataset is stored.
+- `data_prefix::String`: Prefix of the two new files, will be called `data_prefix_train.csv` and `data_prefix_val.csv`.
+- `output_dir_path::String`: Path to where the new dataframes should be stored.
+- `n_features_columns::Vector{Union{Symbol, String}}`: Vector with columns whose features have to occur in both the training and validation data.
+
+# Optional arguments
+- `val_sample_size::Int = 0`: Size of the validation dataset (only `val_sample_size` or `val_ratio` may be used).
+- `val_ratio::Float64 = 0.0`: Fraction of the data that should be in the validation dataset (only `val_sample_size` or `val_ratio` may be used).
+- `n_grams_target_col::Union{Symbol, String} = :Word`: Column with target words.
+- `n_grams_tokenized::Bool = false`: Whether the words in `n_grams_target_col` are already tokenized.
+- `n_grams_sep_token::Union{Nothing, String} = nothing`: String with which tokens in `n_grams_target_col` are separated (only used if `n_grams_tokenized=true`). +- `grams::Int = 3`: Granularity of the n-grams. +- `n_grams_keep_sep::Bool = false`: Whether the token separators should be kept in the ngrams (this is useful e.g. when working with syllables). +- `start_end_token::String = "#"`: Token with which the start and end of words should be marked. +- `random_seed::Int = 314`: Random seed for controlling random split. + +# Example +``` +data_train, data_val = JudiLing.loading_data_careful_split( + "latin.csv", + "latin", + "careful", + ["Lexeme","Person","Number","Tense","Voice","Mood"] +) +``` +""" +function loading_data_careful_split( + data_path::String, + data_prefix::String, + output_dir_path::String, + n_features_columns::Union{Vector{Symbol},Vector{String}}; + train_sample_size::Int = 0, + val_sample_size::Int = 0, + val_ratio::Float64 = 0.0, + n_grams_target_col::Union{Symbol, String} = :Word, + n_grams_tokenized::Bool = false, + n_grams_sep_token::Union{Nothing, String} = nothing, + grams::Int = 3, + n_grams_keep_sep::Bool = false, + start_end_token::String = "#", + random_seed::Int = 314, + verbose::Bool = false) + + verbose && println("Splitting data...") + train_val_careful_split( + data_path, + output_dir_path, + data_prefix, + n_features_columns, + train_sample_size = train_sample_size, + val_sample_size = val_sample_size, + val_ratio = val_ratio, + n_grams_target_col = n_grams_target_col, + n_grams_tokenized = n_grams_tokenized, + n_grams_sep_token = n_grams_sep_token, + grams = grams, + n_grams_keep_sep = n_grams_keep_sep, + start_end_token = start_end_token, + random_seed = random_seed, + verbose = verbose, + ) + + # load data + verbose && println("Loading CSV...") + loading_data_pre_split(output_dir_path, data_prefix) +end diff --git a/src/test_combo.jl b/src/test_combo.jl index 794daf8..a1aa63d 100644 --- a/src/test_combo.jl +++ b/src/test_combo.jl @@ -704,74 +704,6 @@ function loading_data_pre_split( data_train, data_val end -function loading_data_randomly_split( - data_path, - output_dir_path, - data_prefix; - train_sample_size = 0, - val_sample_size = 0, - val_ratio = 0.0, - random_seed = 314, - verbose = false) - verbose && println("Spliting data...") - - train_val_random_split( - data_path, - output_dir_path, - data_prefix, - train_sample_size = train_sample_size, - val_sample_size = val_sample_size, - val_ratio = val_ratio, - random_seed = random_seed, - verbose = verbose, - ) - - # load data - verbose && println("Loading CSV...") - loading_data_pre_split(output_dir_path, data_prefix) -end - -function loading_data_careful_split( - data_path, - data_prefix, - output_dir_path, - n_features_columns; - train_sample_size = 0, - val_sample_size = 0, - val_ratio = 0.0, - n_grams_target_col = :Word, - n_grams_tokenized = false, - n_grams_sep_token = nothing, - grams = 3, - n_grams_keep_sep = false, - start_end_token = "#", - random_seed = 314, - verbose = false) - - verbose && println("Spliting data...") - train_val_careful_split( - data_path, - output_dir_path, - data_prefix, - n_features_columns, - train_sample_size = train_sample_size, - val_sample_size = val_sample_size, - val_ratio = val_ratio, - n_grams_target_col = n_grams_target_col, - n_grams_tokenized = n_grams_tokenized, - n_grams_sep_token = n_grams_sep_token, - grams = grams, - n_grams_keep_sep = n_grams_keep_sep, - start_end_token = start_end_token, - random_seed = random_seed, - verbose = verbose, - ) - - 
# load data - verbose && println("Loading CSV...") - loading_data_pre_split(output_dir_path, data_prefix) -end - function make_cue_train_only(data, grams, target_col, tokenized, sep_token, keep_sep, start_end_token, verbose) diff --git a/test/input_tests.jl b/test/input_tests.jl index 7acc38e..11c8542 100644 --- a/test/input_tests.jl +++ b/test/input_tests.jl @@ -2,9 +2,9 @@ using JudiLing using Test @testset "load dataset" begin - data = JudiLing.load_dataset("data/latin_train.csv") + data = JudiLing.load_dataset("data/latin_mini.csv") - @test size(data,1) == 3 + @test size(data,1) == 200 @test typeof(data[!, "Word"]) == Vector{String} @@ -12,3 +12,76 @@ using Test @test size(data,1) == 2 end + +@testset "random_split" begin + # testing aliases + data_train1, data_val1 = JudiLing.loading_data_randomly_split("data/latin_mini.csv", "data/random", "latin", + val_ratio=0.1) + data_train2, data_val2 = JudiLing.loading_data_random_split("data/latin_mini.csv", "data/random", "latin", + val_ratio=0.1) + data_train3, data_val3 = JudiLing.load_data_randomly_split("data/latin_mini.csv", "data/random", "latin", + val_ratio=0.1) + data_train4, data_val4 = JudiLing.load_data_random_split("data/latin_mini.csv", "data/random", "latin", + val_ratio=0.1) + + @test data_train1 == data_train2 + @test data_train1 == data_train3 + @test data_train1 == data_train4 + @test data_val1 == data_val2 + @test data_val1 == data_val3 + @test data_val1 == data_val4 + + # testing sizes + data = JudiLing.load_dataset("data/latin_mini.csv") + target_val_size = round(Int64, size(data,1) * 0.1) + @test size(data_val1, 1) == target_val_size + @test size(data_train1, 1) == size(data,1) - target_val_size + + # testing non-overlap (this only works because the words in latin_mini are unique) + @test length(intersect(Set(data_train1.Word), Set(data_val1.Word))) == 0 + + # clean up + rm("data/random", recursive=true) +end + + +@testset "careful_split" begin + # testing aliases + data_train1, data_val1 = JudiLing.loading_data_careful_split("data/latin_mini.csv", "latin", "data/careful", + ["Lexeme","Person","Number","Tense","Voice","Mood"], + val_ratio=0.1, n_grams_target_col = "Word") + data_train2, data_val2 = JudiLing.loading_data_careful_split("data/latin_mini.csv", "latin", "data/careful", + ["Lexeme","Person","Number","Tense","Voice","Mood"], + val_ratio=0.1, n_grams_target_col = "Word") + data_train3, data_val3 = JudiLing.load_data_carefully_split("data/latin_mini.csv", "latin", "data/careful", + ["Lexeme","Person","Number","Tense","Voice","Mood"], + val_ratio=0.1, n_grams_target_col = "Word") + data_train4, data_val4 = JudiLing.load_data_carefully_split("data/latin_mini.csv", "latin", "data/careful", + ["Lexeme","Person","Number","Tense","Voice","Mood"], + val_ratio=0.1, n_grams_target_col = "Word") + + @test data_train1 == data_train2 + @test data_train1 == data_train3 + @test data_train1 == data_train4 + @test data_val1 == data_val2 + @test data_val1 == data_val3 + @test data_val1 == data_val4 + + # testing sizes + data = JudiLing.load_dataset("data/latin_mini.csv") + target_val_size = round(Int64, size(data, 1) * 0.1) + @test size(data_val1, 1) == target_val_size + @test size(data_train1, 1) == size(data,1) - target_val_size + + # testing non-overlap (this only works because the words in latin_mini are unique) + @test length(intersect(Set(data_train1.Word), Set(data_val1.Word))) == 0 + + # testing that all the unique features do indeed occur in both training and validation data + for col in 
["Lexeme","Person","Number","Tense","Voice","Mood"] + @test length(setdiff(Set(data[:, col]), Set(data_train1[:, col]))) == 0 + @test length(setdiff(Set(data[:, col]), Set(data_val1[:, col]))) == 0 + end + + # clean up + rm("data/careful", recursive=true) +end From a48e1a279faf525265214bcc0a253a0241353e59 Mon Sep 17 00:00:00 2001 From: MariaHei Date: Fri, 17 May 2024 15:34:29 +0100 Subject: [PATCH 05/12] Bump version to 0.8.4 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 291d6da..6bf5796 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "JudiLing" uuid = "b43a184b-0e9d-488b-813a-80fd5dbc9fd8" authors = ["Xuefeng Luo", "Maria Heitmeier"] -version = "0.8.3" +version = "0.8.4" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" From ec9153c2cf6d0d0632c6cf5be96527fb0c8c3f70 Mon Sep 17 00:00:00 2001 From: MariaHei Date: Mon, 20 May 2024 09:26:53 +0100 Subject: [PATCH 06/12] Ensure no needed files are deleted during testing --- test/input_tests.jl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/test/input_tests.jl b/test/input_tests.jl index 11c8542..cc2c374 100644 --- a/test/input_tests.jl +++ b/test/input_tests.jl @@ -15,13 +15,13 @@ end @testset "random_split" begin # testing aliases - data_train1, data_val1 = JudiLing.loading_data_randomly_split("data/latin_mini.csv", "data/random", "latin", + data_train1, data_val1 = JudiLing.loading_data_randomly_split("data/latin_mini.csv", "data/random_test", "latin", val_ratio=0.1) - data_train2, data_val2 = JudiLing.loading_data_random_split("data/latin_mini.csv", "data/random", "latin", + data_train2, data_val2 = JudiLing.loading_data_random_split("data/latin_mini.csv", "data/random_test", "latin", val_ratio=0.1) - data_train3, data_val3 = JudiLing.load_data_randomly_split("data/latin_mini.csv", "data/random", "latin", + data_train3, data_val3 = JudiLing.load_data_randomly_split("data/latin_mini.csv", "data/random_test", "latin", val_ratio=0.1) - data_train4, data_val4 = JudiLing.load_data_random_split("data/latin_mini.csv", "data/random", "latin", + data_train4, data_val4 = JudiLing.load_data_random_split("data/latin_mini.csv", "data/random_test", "latin", val_ratio=0.1) @test data_train1 == data_train2 @@ -41,22 +41,22 @@ end @test length(intersect(Set(data_train1.Word), Set(data_val1.Word))) == 0 # clean up - rm("data/random", recursive=true) + rm("data/random_test", recursive=true) end @testset "careful_split" begin # testing aliases - data_train1, data_val1 = JudiLing.loading_data_careful_split("data/latin_mini.csv", "latin", "data/careful", + data_train1, data_val1 = JudiLing.loading_data_careful_split("data/latin_mini.csv", "latin", "data/careful_test", ["Lexeme","Person","Number","Tense","Voice","Mood"], val_ratio=0.1, n_grams_target_col = "Word") - data_train2, data_val2 = JudiLing.loading_data_careful_split("data/latin_mini.csv", "latin", "data/careful", + data_train2, data_val2 = JudiLing.loading_data_careful_split("data/latin_mini.csv", "latin", "data/careful_test", ["Lexeme","Person","Number","Tense","Voice","Mood"], val_ratio=0.1, n_grams_target_col = "Word") - data_train3, data_val3 = JudiLing.load_data_carefully_split("data/latin_mini.csv", "latin", "data/careful", + data_train3, data_val3 = JudiLing.load_data_carefully_split("data/latin_mini.csv", "latin", "data/careful_test", ["Lexeme","Person","Number","Tense","Voice","Mood"], val_ratio=0.1, n_grams_target_col = "Word") - data_train4, data_val4 = 
JudiLing.load_data_carefully_split("data/latin_mini.csv", "latin", "data/careful", + data_train4, data_val4 = JudiLing.load_data_carefully_split("data/latin_mini.csv", "latin", "data/careful_test", ["Lexeme","Person","Number","Tense","Voice","Mood"], val_ratio=0.1, n_grams_target_col = "Word") @@ -83,5 +83,5 @@ end end # clean up - rm("data/careful", recursive=true) + rm("data/careful_test", recursive=true) end From d813c444608a2d36465972149dfd904ca89bab42 Mon Sep 17 00:00:00 2001 From: MariaHei Date: Mon, 20 May 2024 10:23:49 +0100 Subject: [PATCH 07/12] Add documentation for pyndl functions --- docs/src/man/make_cue_matrix.md | 3 +- docs/src/man/make_semantic_matrix.md | 1 - docs/src/man/pyndl.md | 54 +++++++++- src/pyndl.jl | 143 +++++++++++++++++++++++---- test/pyndl_tests.jl | 3 +- 5 files changed, 180 insertions(+), 24 deletions(-) diff --git a/docs/src/man/make_cue_matrix.md b/docs/src/man/make_cue_matrix.md index 668a989..b6d24ba 100644 --- a/docs/src/man/make_cue_matrix.md +++ b/docs/src/man/make_cue_matrix.md @@ -12,7 +12,6 @@ CurrentModule = JudiLing make_cue_matrix(data::DataFrame) make_cue_matrix(data::DataFrame, cue_obj::Cue_Matrix_Struct) make_cue_matrix(data_train::DataFrame, data_val::DataFrame) - make_cue_matrix(data::DataFrame, pyndl_weights::Pyndl_Weight_Struct) make_combined_cue_matrix(data_train, data_val) make_ngrams(tokens, grams, keep_sep, sep_token, start_end_token) -``` \ No newline at end of file +``` diff --git a/docs/src/man/make_semantic_matrix.md b/docs/src/man/make_semantic_matrix.md index 7a89b94..9941bfd 100644 --- a/docs/src/man/make_semantic_matrix.md +++ b/docs/src/man/make_semantic_matrix.md @@ -31,7 +31,6 @@ CurrentModule = JudiLing make_S_matrix(data_train::DataFrame, data_val::DataFrame, base::Vector, inflections::Vector) make_S_matrix(data::DataFrame, base::Vector) make_S_matrix(data_train::DataFrame, data_val::DataFrame, base::Vector) - make_S_matrix(data_train::DataFrame, data_val::DataFrame, pyndl_weights::Pyndl_Weight_Struct, n_features_columns::Vector) make_S_matrix(data_train::DataFrame, base::Vector, inflections::Vector, L::L_Matrix_Struct) make_S_matrix(data_train::DataFrame, data_val::Union{DataFrame, Nothing}, base::Vector, L::L_Matrix_Struct) make_S_matrix(data::DataFrame, base::Vector, L::L_Matrix_Struct) diff --git a/docs/src/man/pyndl.md b/docs/src/man/pyndl.md index f1bc76f..c9c8d92 100644 --- a/docs/src/man/pyndl.md +++ b/docs/src/man/pyndl.md @@ -2,9 +2,57 @@ CurrentModule = JudiLing ``` -# Preprocess +JudiLing is able to call the python package [pyndl](https://github.com/quantling/pyndl) internally to compute NDL models. pyndl uses event files to compute the mapping matrices, which have to be generated manually or by using pyndl in Python, see documentation [here](https://pyndl.readthedocs.io/en/latest/#creating-grapheme-clusters-from-corpus-data). +The advantage of calling pyndl from JudiLing is that the resulting weights, cue and semantic matrices can be directly translated into JudiLing format and further processing can be done in JudiLing. + +!!! 
note + For pyndl to be available in JudiLing, PyCall has to be imported before JudiLing: + ```julia + using PyCall + using JudiLing + ``` + +## Calling pyndl from JudiLing ```@docs Pyndl_Weight_Struct - pyndl(data_path) -``` \ No newline at end of file + pyndl( + data_path::String; + alpha::Float64 = 0.1, + betas::Tuple{Float64,Float64} = (0.1, 0.1), + method::String = "openmp" + ) +``` + +## Translating output of pyndl to cue and semantic matrices in JudiLing + +With the weights in hand, the cue and semantic matrices can be computed: + +```@docs + make_cue_matrix( + data::DataFrame, + pyndl_weights::Pyndl_Weight_Struct; + grams = 3, + target_col = "Words", + tokenized = false, + sep_token = nothing, + keep_sep = false, + start_end_token = "#", + verbose = false, + ) + make_S_matrix( + data::DataFrame, + pyndl_weights::Pyndl_Weight_Struct, + n_features_columns::Vector; + tokenized::Bool=false, + sep_token::String="_" + ) + make_S_matrix( + data_train::DataFrame, + data_val::DataFrame, + pyndl_weights::Pyndl_Weight_Struct, + n_features_columns::Vector; + tokenized::Bool=false, + sep_token::String="_" + ) +``` diff --git a/src/pyndl.jl b/src/pyndl.jl index 927cb9a..94d1664 100644 --- a/src/pyndl.jl +++ b/src/pyndl.jl @@ -1,5 +1,12 @@ """ -Pyndl object. + Pyndl_Weight_Struct + cues::Vector{String} + outcomes::Vector{String} + weight::Matrix{Float64} + +- `cues::Vector{String}`: Vector of cues, in the order that they appear in the weight matrix. +- `outcomes::Vector{String}`: Vector of outcomes, in the order that they appear in the weight matrix. +- `weight::Matrix{Float64}`: Weight matrix. """ struct Pyndl_Weight_Struct cues::Vector{String} @@ -8,15 +15,33 @@ struct Pyndl_Weight_Struct end """ - pyndl(data_path) + pyndl( + data_path::String; + alpha::Float64 = 0.1, + betas::Tuple{Float64,Float64} = (0.1, 0.1), + method::String = "openmp" + ) + +Compute weights using pyndl. See the documentation of pyndl for more information: https://pyndl.readthedocs.io/en/latest/ + +# Obligatory arguments +- `data_path::String`: Path to an events file as generated by pyndl's preprocess.create_event_file -Perform pyndl. +# Optional arguments +- `alpha::Float64 = 0.1`: α learning rate. +- `betas::Tuple{Float64,Float64} = (0.1, 0.1)`: β_1 and β_2 learning rates +- `method::String = "openmp"`: One of {"openmp", "threading"}. "openmp" only works on Linux. + +# Example +```julia +weights = JudiLing.pyndl("data/latin_train_events.tab.gz") +``` """ function pyndl( - data_path; - alpha = 0.1, - betas = (0.1, 0.1), - method = "openmp" + data_path::String; + alpha::Float64 = 0.1, + betas::Tuple{Float64,Float64} = (0.1, 0.1), + method::String = "openmp" ) ndl = pyimport("pyndl.ndl") @@ -44,9 +69,40 @@ end """ - make_cue_matrix(data::DataFrame, pyndl_weights::Pyndl_Weight_Struct) + make_cue_matrix( + data::DataFrame, + pyndl_weights::Pyndl_Weight_Struct; + grams = 3, + target_col = "Words", + tokenized = false, + sep_token = nothing, + keep_sep = false, + start_end_token = "#", + verbose = false, + ) -Make the cue matrix for pyndl mode. +Make the cue matrix based on a dataframe and weights computed with pyndl. Practically this means that the cues are extracted from the weights object and translated to the JudiLing format. + +# Obligatory arguments +- `data::DataFrame`: Dataset with all the word types on which the weights were trained. 
+- `pyndl_weights::Pyndl_Weight_Struct`: Weights trained with JudiLing.pyndl.
+
+# Optional arguments
+- `grams = 3`: N-gram size (has to match the n-gram granularity of the cues on which the weights were trained).
+- `target_col = "Words"`: Column with target words.
+- `tokenized = false`: Whether the target words are already tokenized.
+- `sep_token = nothing`: The string separating the tokens (only used if `tokenized=true`).
+- `keep_sep = false`: Whether the `sep_token` should be retained in the cues.
+- `start_end_token = "#"`: The string with which to mark word boundaries.
+- `verbose = false`: Verbose mode.
+
+# Example
+```julia
+weights = JudiLing.pyndl("data/latin_train_events.tab.gz")
+data = JudiLing.load_dataset("latin_train.csv")
+cue_obj = JudiLing.make_cue_matrix(data, weights,
+                                    grams = 3,
+                                    target_col = "Word")
+```
 """
 function make_cue_matrix(
     data::DataFrame,
@@ -137,18 +193,46 @@ function make_cue_matrix(
                            tokenized, sep_token, keep_sep, start_end_token)
 end
 
+
 """
-    make_S_matrix(data_train::DataFrame, data_val::DataFrame, pyndl_weights::Pyndl_Weight_Struct, n_features_columns::Vector)
+    make_S_matrix(
+        data_train::DataFrame,
+        data_val::DataFrame,
+        pyndl_weights::Pyndl_Weight_Struct,
+        n_features_columns::Vector;
+        tokenized::Bool=false,
+        sep_token::String="_"
+    )
 
-Create semantic matrix for pyndl mode
+Create semantic matrix based on a training and validation dataframe and weights computed with pyndl. Practically this means that the semantic features are extracted from the weights object and translated to the JudiLing format.
+
+# Obligatory arguments
+- `data_train::DataFrame`: The training dataset.
+- `data_val::DataFrame`: The validation dataset.
+- `pyndl_weights::Pyndl_Weight_Struct`: Weights trained with JudiLing.pyndl.
+- `n_features_columns::Vector`: Vector of columns with the features in the training and validation datasets.
+
+# Optional arguments
+- `tokenized=false`: Whether the features in `n_features_columns` columns are already tokenized (e.g. `"feature1_feature2_feature3"`).
+- `sep_token="_"`: The string with which the features are separated (only used if `tokenized=true`).
+
+# Example
+```julia
+weights = JudiLing.pyndl("data/latin_train_events.tab.gz")
+S_train, S_val = JudiLing.make_S_matrix(train,
+        val,
+        weights,
+        ["Lexeme", "Person", "Number", "Tense", "Voice", "Mood"],
+        tokenized=false)
+```
 """
 function make_S_matrix(
     data_train::DataFrame,
     data_val::DataFrame,
     pyndl_weights::Pyndl_Weight_Struct,
     n_features_columns::Vector;
-    tokenized=false,
-    sep_token="_"
+    tokenized::Bool=false,
+    sep_token::String="_"
 )
 
     f2i = Dict(v => i for (i, v) in enumerate(pyndl_weights.outcomes))
@@ -185,17 +269,42 @@ function make_S_matrix(
     St_train', St_val'
 end
 
+
 """
-    make_S_matrix(data::DataFrame, pyndl_weights::Pyndl_Weight_Struct, n_features_columns::Vector)
+    make_S_matrix(
+        data::DataFrame,
+        pyndl_weights::Pyndl_Weight_Struct,
+        n_features_columns::Vector;
+        tokenized::Bool=false,
+        sep_token::String="_"
+    )
 
-Create semantic matrix for pyndl mode
+Create semantic matrix based on a dataframe and weights computed with pyndl. Practically this means that the semantic features are extracted from the weights object and translated to the JudiLing format.
+
+# Obligatory arguments
+- `data::DataFrame`: The dataset with word types.
+- `pyndl_weights::Pyndl_Weight_Struct`: Weights trained with JudiLing.pyndl.
+- `n_features_columns::Vector`: Vector of columns with the features in the dataset.
+
+# Optional arguments
+- `tokenized=false`: Whether the features in `n_features_columns` columns are already tokenized (e.g. `"feature1_feature2_feature3"`).
+- `sep_token="_"`: The string with which the features are separated (only used if `tokenized=true`).
+
+# Example
+```julia
+weights = JudiLing.pyndl("data/latin_train_events.tab.gz")
+S = JudiLing.make_S_matrix(data,
+        weights,
+        ["Lexeme", "Person", "Number", "Tense", "Voice", "Mood"],
+        tokenized=false)
+```
 """
 function make_S_matrix(
     data::DataFrame,
     pyndl_weights::Pyndl_Weight_Struct,
     n_features_columns::Vector;
-    tokenized=false,
-    sep_token="_"
+    tokenized::Bool=false,
+    sep_token::String="_"
 )
 
     f2i = Dict(v => i for (i, v) in enumerate(pyndl_weights.outcomes))
diff --git a/test/pyndl_tests.jl b/test/pyndl_tests.jl
index 5909cf7..637f542 100644
--- a/test/pyndl_tests.jl
+++ b/test/pyndl_tests.jl
@@ -23,7 +23,8 @@ cue_obj_train, cue_obj_val = JudiLing.make_combined_cue_matrix(train, val, targe
         train,
         weights_latin;
         grams = 3,
-        target_col = "Word"
+        target_col = "Word",
+        verbose=true
     );
 
     cue_obj_val = JudiLing.make_cue_matrix(
From 21dfe4420d81fd6dc61ee22439dc08668dcbf0be Mon Sep 17 00:00:00 2001
From: MariaHei
Date: Mon, 20 May 2024 16:37:54 +0100
Subject: [PATCH 08/12] Add documentation of make_learn_seq

Fixes #116
---
 docs/src/man/wh.md | 17 ++++++++++++++---
 src/wh.jl          | 29 ++++++++++++++++++++++++++---
 2 files changed, 40 insertions(+), 6 deletions(-)

diff --git a/docs/src/man/wh.md b/docs/src/man/wh.md
index f35ce98..92200e5 100644
--- a/docs/src/man/wh.md
+++ b/docs/src/man/wh.md
@@ -5,6 +5,17 @@ CurrentModule = JudiLing
 # Utils
 
 ```@docs
-    wh_learn(X, Y)
-    make_learn_seq(freq)
-```
\ No newline at end of file
+    wh_learn(
+        X,
+        Y;
+        eta = 0.01,
+        n_epochs = 1,
+        weights = nothing,
+        learn_seq = nothing,
+        save_history = false,
+        history_cols = nothing,
+        history_rows = nothing,
+        verbose = false,
+    )
+    make_learn_seq(freq; random_seed = 314)
+```
diff --git a/src/wh.jl b/src/wh.jl
index 93d0a23..2a5887b 100644
--- a/src/wh.jl
+++ b/src/wh.jl
@@ -1,5 +1,16 @@
 """
-    wh_learn(X, Y)
+    wh_learn(
+        X,
+        Y;
+        eta = 0.01,
+        n_epochs = 1,
+        weights = nothing,
+        learn_seq = nothing,
+        save_history = false,
+        history_cols = nothing,
+        history_rows = nothing,
+        verbose = false,
+    )
 
 Widrow-Hoff Learning.
 
@@ -90,9 +101,21 @@ function wh_learn(
 end
 
 """
-    make_learn_seq(freq)
+    make_learn_seq(freq; random_seed = 314)
 
-Make Widrow-Hoff learning sequence.
+Make Widrow-Hoff learning sequence from frequencies.
+Creates a randomly ordered sequence of indices where each index appears according to its frequency.
+
+# Obligatory arguments
+- `freq`: Vector with frequencies.
+
+# Optional arguments
+- `random_seed = 314`: Random seed to control randomness.
+
+# Example
+```julia
+learn_seq = JudiLing.make_learn_seq(data.frequency)
+```
 """
 function make_learn_seq(freq; random_seed = 314)
     if isnothing(freq)
From bc90b87cb8b3fb1b3072edc115b53d661b947f2b Mon Sep 17 00:00:00 2001
From: MariaHei
Date: Mon, 20 May 2024 17:01:41 +0100
Subject: [PATCH 09/12] Remove warning regarding dataframes

---
 docs/src/index.md | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/docs/src/index.md b/docs/src/index.md
index 5a0deb7..da311d5 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -1,7 +1,9 @@
 # JudiLing
 
-!!! note
-    If you encounter an error like "ERROR: UndefVarError: DataFrame! not defined", this is because our dependency CSV.jl changed their APIs in v0.8. 
Please use "data = DataFrame(CSV.File(path_to_csv_file))" to read a CSV file and include DataFrames package by "using DataFrames". +JudiLing: An implementation for Linear Discriminative Learning in Julia + +Maintainer: Maria Heitmeier [@MariaHei](https://github.com/MariaHei) +Original codebase: Xuefeng Luo [@MegamindHenry](https://github.com/MegamindHenry) ## Installation From 78e1b984bdb623128a8263fc1d9da4210bb86a6c Mon Sep 17 00:00:00 2001 From: MariaHei Date: Fri, 24 May 2024 09:38:15 +0100 Subject: [PATCH 10/12] Update links to docs in Readme --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 59f29cf..1757e1d 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # JudiLing -[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://MegamindHenry.github.io/JudiLing.jl/stable) -[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://MegamindHenry.github.io/JudiLing.jl/dev) -[![Build Status](https://github.com/MegamindHenry/JudiLing.jl/workflows/CI/badge.svg)](https://github.com/MegamindHenry/JudiLing.jl/actions) +[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://quantling.github.io/JudiLing.jl/stable) +[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://quantling.github.io/JudiLing.jl/dev) +[![Build Status](https://github.com/quantling/JudiLing.jl/workflows/CI/badge.svg)](https://github.com/quantling/JudiLing.jl/actions) [![codecov](https://codecov.io/gh/MegamindHenry/JudiLing.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/MegamindHenry/JudiLing.jl) JudiLing: An implementation for Linear Discriminative Learning in Julia From 03b7e990b74a50893c572b98ca0ffc61397512d6 Mon Sep 17 00:00:00 2001 From: MariaHei Date: Fri, 24 May 2024 10:15:09 +0100 Subject: [PATCH 11/12] Update links in docs --- docs/src/index.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/index.md b/docs/src/index.md index da311d5..2174c99 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -14,11 +14,11 @@ Pkg.add("JudiLing") ``` For brave adventurers, install test version of JudiLing by: ``` -julia> Pkg.add(url="https://github.com/MegamindHenry/JudiLing.jl.git") +julia> Pkg.add(url="https://github.com/quantling/JudiLing.jl.git") ``` Or from the Julia REPL, type `]` to enter the Pkg REPL mode and run ``` -pkg> add https://github.com/MegamindHenry/JudiLing.jl.git +pkg> add https://github.com/quantling/JudiLing.jl.git ``` ## Running Julia with multiple threads From 98e5d6b167e6f4435c8957d8d3e35ed431f3b07c Mon Sep 17 00:00:00 2001 From: MariaHei Date: Thu, 30 May 2024 10:41:38 +0200 Subject: [PATCH 12/12] Bump version to 0.9.0 --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 6bf5796..653989c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "JudiLing" uuid = "b43a184b-0e9d-488b-813a-80fd5dbc9fd8" authors = ["Xuefeng Luo", "Maria Heitmeier"] -version = "0.8.4" +version = "0.9.0" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0"
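The split helpers and combined matrix constructors documented in the patches above compose into a short end-to-end workflow. The sketch below is illustrative only, assuming a `latin.csv` with the feature columns used in the examples and an `utterance_mini.csv` with a `CommunicativeIntention` column, as in the test data; the file paths and the `careful` output directory are placeholders.

```julia
using JudiLing

# Careful split: every feature value in the listed columns and every n-gram
# of the words in "Word" occurs in both subsets; the splits are also written
# to careful/latin_train.csv and careful/latin_val.csv.
data_train, data_val = JudiLing.loading_data_careful_split(
    "latin.csv",
    "latin",
    "careful",
    ["Lexeme", "Person", "Number", "Tense", "Voice", "Mood"],
    val_ratio = 0.1,
    n_grams_target_col = "Word",
)

# Discrete semantic matrices that share one feature index across both splits,
# mirroring the setup in test/make_semantic_matrix_tests.jl.
utterance = JudiLing.load_dataset("utterance_mini.csv")
utterance_val = utterance[101:end, :]
s_obj_train, s_obj_val = JudiLing.make_combined_pS_matrix(
    utterance,
    utterance_val,
    features_col = :CommunicativeIntention,
)
```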