diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d688c43..8bfebb2 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -2,6 +2,7 @@ name: CI on: - push - pull_request + - workflow_dispatch jobs: test: name: Julia ${{ matrix.version }} - ${{ matrix.os }} - ${{ matrix.arch }} - ${{ github.event_name }} diff --git a/Project.toml b/Project.toml index 291d6da..653989c 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "JudiLing" uuid = "b43a184b-0e9d-488b-813a-80fd5dbc9fd8" authors = ["Xuefeng Luo", "Maria Heitmeier"] -version = "0.8.3" +version = "0.9.0" [deps] BSON = "fbb218c0-5317-5bc6-957e-2ee96dd4b1f0" diff --git a/README.md b/README.md index 59f29cf..1757e1d 100644 --- a/README.md +++ b/README.md @@ -1,8 +1,8 @@ # JudiLing -[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://MegamindHenry.github.io/JudiLing.jl/stable) -[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://MegamindHenry.github.io/JudiLing.jl/dev) -[![Build Status](https://github.com/MegamindHenry/JudiLing.jl/workflows/CI/badge.svg)](https://github.com/MegamindHenry/JudiLing.jl/actions) +[![Stable](https://img.shields.io/badge/docs-stable-blue.svg)](https://quantling.github.io/JudiLing.jl/stable) +[![Dev](https://img.shields.io/badge/docs-dev-blue.svg)](https://quantling.github.io/JudiLing.jl/dev) +[![Build Status](https://github.com/quantling/JudiLing.jl/workflows/CI/badge.svg)](https://github.com/quantling/JudiLing.jl/actions) [![codecov](https://codecov.io/gh/MegamindHenry/JudiLing.jl/branch/master/graph/badge.svg)](https://codecov.io/gh/MegamindHenry/JudiLing.jl) JudiLing: An implementation for Linear Discriminative Learning in Julia diff --git a/docs/src/index.md b/docs/src/index.md index 5a0deb7..2174c99 100644 --- a/docs/src/index.md +++ b/docs/src/index.md @@ -1,7 +1,9 @@ # JudiLing -!!! note - If you encounter an error like "ERROR: UndefVarError: DataFrame! not defined", this is because our dependency CSV.jl changed their APIs in v0.8. Please use "data = DataFrame(CSV.File(path_to_csv_file))" to read a CSV file and include DataFrames package by "using DataFrames". +JudiLing: An implementation for Linear Discriminative Learning in Julia + +Maintainer: Maria Heitmeier [@MariaHei](https://github.com/MariaHei) +Original codebase: Xuefeng Luo [@MegamindHenry](https://github.com/MegamindHenry) ## Installation @@ -12,11 +14,11 @@ Pkg.add("JudiLing") ``` For brave adventurers, install test version of JudiLing by: ``` -julia> Pkg.add(url="https://github.com/MegamindHenry/JudiLing.jl.git") +julia> Pkg.add(url="https://github.com/quantling/JudiLing.jl.git") ``` Or from the Julia REPL, type `]` to enter the Pkg REPL mode and run ``` -pkg> add https://github.com/MegamindHenry/JudiLing.jl.git +pkg> add https://github.com/quantling/JudiLing.jl.git ``` ## Running Julia with multiple threads diff --git a/docs/src/man/input.md b/docs/src/man/input.md index f7e9fa6..b657669 100644 --- a/docs/src/man/input.md +++ b/docs/src/man/input.md @@ -8,4 +8,27 @@ CurrentModule = JudiLing load_dataset(filepath::String; delim::String=",", kargs...) 
+loading_data_randomly_split(
+    data_path::String,
+    output_dir_path::String,
+    data_prefix::String;
+    val_sample_size::Int = 0,
+    val_ratio::Float64 = 0.0,
+    random_seed::Int = 314)
+loading_data_careful_split(
+    data_path::String,
+    data_prefix::String,
+    output_dir_path::String,
+    n_features_columns::Union{Vector{Symbol},Vector{String}};
+    train_sample_size::Int = 0,
+    val_sample_size::Int = 0,
+    val_ratio::Float64 = 0.0,
+    n_grams_target_col::Union{Symbol, String} = :Word,
+    n_grams_tokenized::Bool = false,
+    n_grams_sep_token::Union{Nothing, String} = nothing,
+    grams::Int = 3,
+    n_grams_keep_sep::Bool = false,
+    start_end_token::String = "#",
+    random_seed::Int = 314,
+    verbose::Bool = false)
 ```
diff --git a/docs/src/man/make_cue_matrix.md b/docs/src/man/make_cue_matrix.md
index 779ede5..e43df5f 100644
--- a/docs/src/man/make_cue_matrix.md
+++ b/docs/src/man/make_cue_matrix.md
@@ -12,7 +12,6 @@ CurrentModule = JudiLing
     make_cue_matrix(data::DataFrame)
     make_cue_matrix(data::DataFrame, cue_obj::Cue_Matrix_Struct)
     make_cue_matrix(data_train::DataFrame, data_val::DataFrame)
-    make_cue_matrix(data::DataFrame, pyndl_weights::Pyndl_Weight_Struct)
     make_combined_cue_matrix(data_train, data_val)
     make_cue_matrix_from_CFBS(features::Vector{Vector{T}};
                               pad_val::T = 0.,
diff --git a/docs/src/man/make_semantic_matrix.md b/docs/src/man/make_semantic_matrix.md
index a058b6b..9941bfd 100644
--- a/docs/src/man/make_semantic_matrix.md
+++ b/docs/src/man/make_semantic_matrix.md
@@ -9,8 +9,14 @@ CurrentModule = JudiLing
 ```@docs
     PS_Matrix_Struct
     make_pS_matrix
-    make_pS_matrix(utterances)
-    make_pS_matrix(utterances, utterances_train)
+    make_pS_matrix(data)
+    make_pS_matrix(data_val, pS_obj)
+    make_combined_pS_matrix(
+        data_train,
+        data_val;
+        features_col = :CommunicativeIntention,
+        sep_token = "_",
+    )
 ```
 
 ## Simulate semantic vectors
@@ -25,7 +31,6 @@ CurrentModule = JudiLing
     make_S_matrix(data_train::DataFrame, data_val::DataFrame, base::Vector, inflections::Vector)
     make_S_matrix(data::DataFrame, base::Vector)
     make_S_matrix(data_train::DataFrame, data_val::DataFrame, base::Vector)
-    make_S_matrix(data_train::DataFrame, data_val::DataFrame, pyndl_weights::Pyndl_Weight_Struct, n_features_columns::Vector)
     make_S_matrix(data_train::DataFrame, base::Vector, inflections::Vector, L::L_Matrix_Struct)
     make_S_matrix(data_train::DataFrame, data_val::Union{DataFrame, Nothing}, base::Vector, L::L_Matrix_Struct)
     make_S_matrix(data::DataFrame, base::Vector, L::L_Matrix_Struct)
diff --git a/docs/src/man/pyndl.md b/docs/src/man/pyndl.md
index f1bc76f..c9c8d92 100644
--- a/docs/src/man/pyndl.md
+++ b/docs/src/man/pyndl.md
@@ -2,9 +2,57 @@
 CurrentModule = JudiLing
 ```
 
-# Preprocess
+JudiLing is able to call the Python package [pyndl](https://github.com/quantling/pyndl) internally to compute NDL models. pyndl computes the mapping matrices from event files, which have to be generated manually or by using pyndl in Python; see the documentation [here](https://pyndl.readthedocs.io/en/latest/#creating-grapheme-clusters-from-corpus-data).
+The advantage of calling pyndl from JudiLing is that the resulting weights, cue and semantic matrices can be directly translated into JudiLing format and further processing can be done in JudiLing.
+
+!!! 
note + For pyndl to be available in JudiLing, PyCall has to be imported before JudiLing: + ```julia + using PyCall + using JudiLing + ``` + +## Calling pyndl from JudiLing ```@docs Pyndl_Weight_Struct - pyndl(data_path) -``` \ No newline at end of file + pyndl( + data_path::String; + alpha::Float64 = 0.1, + betas::Tuple{Float64,Float64} = (0.1, 0.1), + method::String = "openmp" + ) +``` + +## Translating output of pyndl to cue and semantic matrices in JudiLing + +With the weights in hand, the cue and semantic matrices can be computed: + +```@docs + make_cue_matrix( + data::DataFrame, + pyndl_weights::Pyndl_Weight_Struct; + grams = 3, + target_col = "Words", + tokenized = false, + sep_token = nothing, + keep_sep = false, + start_end_token = "#", + verbose = false, + ) + make_S_matrix( + data::DataFrame, + pyndl_weights::Pyndl_Weight_Struct, + n_features_columns::Vector; + tokenized::Bool=false, + sep_token::String="_" + ) + make_S_matrix( + data_train::DataFrame, + data_val::DataFrame, + pyndl_weights::Pyndl_Weight_Struct, + n_features_columns::Vector; + tokenized::Bool=false, + sep_token::String="_" + ) +``` diff --git a/docs/src/man/wh.md b/docs/src/man/wh.md index f35ce98..92200e5 100644 --- a/docs/src/man/wh.md +++ b/docs/src/man/wh.md @@ -5,6 +5,17 @@ CurrentModule = JudiLing # Utils ```@docs - wh_learn(X, Y) - make_learn_seq(freq) -``` \ No newline at end of file + wh_learn( + X, + Y; + eta = 0.01, + n_epochs = 1, + weights = nothing, + learn_seq = nothing, + save_history = false, + history_cols = nothing, + history_rows = nothing, + verbose = false, + ) + make_learn_seq(freq; random_seed = 314) +``` diff --git a/src/input.jl b/src/input.jl index 9606a20..acdd278 100644 --- a/src/input.jl +++ b/src/input.jl @@ -24,3 +24,220 @@ function load_dataset(filepath::String; kargs...) return(DataFrame(CSV.File(filepath, stringtype=String, delim=delim; kargs...))) end + + +""" + load_data_random_split(args...; kargs...) + +Alias for loading_data_randomly_split +""" +function load_data_random_split( + args...; kargs...) + JudiLing.loading_data_randomly_split(args...; kargs...) +end + +""" + loading_data_random_split(args...; kargs...) + +Alias for loading_data_randomly_split +""" +function loading_data_random_split( + args...; kargs...) + JudiLing.loading_data_randomly_split(args...; kargs...) +end + +""" + load_data_randomly_split(args...; kargs...) + +Alias for loading_data_randomly_split +""" +function load_data_randomly_split( + args...; kargs...) + JudiLing.loading_data_randomly_split(args...; kargs...) +end + +""" + loading_data_randomly_split( + data_path::String, + output_dir_path::String, + data_prefix::String; + val_sample_size::Int = 0, + val_ratio::Float64 = 0.0, + random_seed::Int = 314) + +Read in a dataframe, splitting the dataframe into a training and validation dataset. The two are also written to `output_dir_path` at the same time. + +!!! note + The order of `data_prefix` and `output_dir_path` is exactly reversed compared to `loading_data_careful_split`. + + +# Obligatory arguments +- `data_path::String`: Path to where the dataset is stored. +- `output_dir_path::String`: Path to where the new dataframes should be stored. +- `data_prefix::String`: Prefix of the two new files, will be called `data_prefix_train.csv` and `data_prefix_val.csv`. + +# Optional arguments +- `val_sample_size::Int = 0`: Size of the validation dataset (only `val_sample_size` or `val_ratio` may be used). 
+- `val_ratio::Float64 = 0.0`: Fraction of the data that should be in the validation dataset (only `val_sample_size` or `val_ratio` may be used).
+- `random_seed::Int = 314`: Random seed for controlling random split.
+
+# Example
+```
+data_train, data_val = JudiLing.loading_data_randomly_split(
+    "latin.csv",
+    "random",
+    "latin",
+    val_ratio = 0.1
+)
+```
+"""
+function loading_data_randomly_split(
+    data_path::String,
+    output_dir_path::String,
+    data_prefix::String;
+    train_sample_size::Int = 0,
+    val_sample_size::Int = 0,
+    val_ratio::Float64 = 0.0,
+    random_seed::Int = 314,
+    verbose::Bool = false)
+    verbose && println("Splitting data...")
+
+    train_val_random_split(
+        data_path,
+        output_dir_path,
+        data_prefix,
+        train_sample_size = train_sample_size,
+        val_sample_size = val_sample_size,
+        val_ratio = val_ratio,
+        random_seed = random_seed,
+        verbose = verbose,
+    )
+
+    # load data
+    verbose && println("Loading CSV...")
+    loading_data_pre_split(output_dir_path, data_prefix)
+end
+
+"""
+    load_data_careful_split(args...; kargs...)
+
+Alias for loading_data_careful_split
+"""
+function load_data_careful_split(
+    args...; kargs...)
+    JudiLing.loading_data_careful_split(args...; kargs...)
+end
+
+"""
+    load_data_carefully_split(args...; kargs...)
+
+Alias for loading_data_careful_split
+"""
+function load_data_carefully_split(
+    args...; kargs...)
+    JudiLing.loading_data_careful_split(args...; kargs...)
+end
+
+"""
+    loading_data_carefully_split(args...; kargs...)
+
+Alias for loading_data_careful_split
+"""
+function loading_data_carefully_split(
+    args...; kargs...)
+    JudiLing.loading_data_careful_split(args...; kargs...)
+end
+
+"""
+    loading_data_careful_split(
+        data_path::String,
+        data_prefix::String,
+        output_dir_path::String,
+        n_features_columns::Union{Vector{Symbol},Vector{String}};
+        train_sample_size::Int = 0,
+        val_sample_size::Int = 0,
+        val_ratio::Float64 = 0.0,
+        n_grams_target_col::Union{Symbol, String} = :Word,
+        n_grams_tokenized::Bool = false,
+        n_grams_sep_token::Union{Nothing, String} = nothing,
+        grams::Int = 3,
+        n_grams_keep_sep::Bool = false,
+        start_end_token::String = "#",
+        random_seed::Int = 314,
+        verbose::Bool = false)
+
+Read in a dataframe, splitting the dataframe into a training and validation dataset. The split is done such that all features in the columns specified
+in `n_features_columns` occur both in the training and validation data. It is also ensured that the unique grams resulting from splitting the strings in column
+`n_grams_target_col` into `grams`-grams occur in both datasets.
+The two are also written to `output_dir_path` at the same time.
+
+!!! note
+    The order of `data_prefix` and `output_dir_path` is exactly reversed compared to `loading_data_randomly_split`.
+
+# Obligatory arguments
+- `data_path::String`: Path to where the dataset is stored.
+- `data_prefix::String`: Prefix of the two new files, will be called `data_prefix_train.csv` and `data_prefix_val.csv`.
+- `output_dir_path::String`: Path to where the new dataframes should be stored.
+- `n_features_columns::Vector{Union{Symbol, String}}`: Vector with columns whose features have to occur in both the training and validation data.
+
+# Optional arguments
+- `val_sample_size::Int = 0`: Size of the validation dataset (only `val_sample_size` or `val_ratio` may be used).
+- `val_ratio::Float64 = 0.0`: Fraction of the data that should be in the validation dataset (only `val_sample_size` or `val_ratio` may be used).
+- `n_grams_target_col::Union{Symbol, String} = :Word`: Column with target words. +- `n_grams_tokenized::Bool = false`: Whether the words in `n_grams_target_col` are already tokenized. +- `n_grams_sep_token::Union{Nothing, String} = nothing`: String with which tokens in `n_grams_target_col` are separated (only used if `n_grams_tokenized=true`). +- `grams::Int = 3`: Granularity of the n-grams. +- `n_grams_keep_sep::Bool = false`: Whether the token separators should be kept in the ngrams (this is useful e.g. when working with syllables). +- `start_end_token::String = "#"`: Token with which the start and end of words should be marked. +- `random_seed::Int = 314`: Random seed for controlling random split. + +# Example +``` +data_train, data_val = JudiLing.loading_data_careful_split( + "latin.csv", + "latin", + "careful", + ["Lexeme","Person","Number","Tense","Voice","Mood"] +) +``` +""" +function loading_data_careful_split( + data_path::String, + data_prefix::String, + output_dir_path::String, + n_features_columns::Union{Vector{Symbol},Vector{String}}; + train_sample_size::Int = 0, + val_sample_size::Int = 0, + val_ratio::Float64 = 0.0, + n_grams_target_col::Union{Symbol, String} = :Word, + n_grams_tokenized::Bool = false, + n_grams_sep_token::Union{Nothing, String} = nothing, + grams::Int = 3, + n_grams_keep_sep::Bool = false, + start_end_token::String = "#", + random_seed::Int = 314, + verbose::Bool = false) + + verbose && println("Splitting data...") + train_val_careful_split( + data_path, + output_dir_path, + data_prefix, + n_features_columns, + train_sample_size = train_sample_size, + val_sample_size = val_sample_size, + val_ratio = val_ratio, + n_grams_target_col = n_grams_target_col, + n_grams_tokenized = n_grams_tokenized, + n_grams_sep_token = n_grams_sep_token, + grams = grams, + n_grams_keep_sep = n_grams_keep_sep, + start_end_token = start_end_token, + random_seed = random_seed, + verbose = verbose, + ) + + # load data + verbose && println("Loading CSV...") + loading_data_pre_split(output_dir_path, data_prefix) +end diff --git a/src/make_semantic_matrix.jl b/src/make_semantic_matrix.jl index 948437d..7c7f027 100644 --- a/src/make_semantic_matrix.jl +++ b/src/make_semantic_matrix.jl @@ -49,7 +49,7 @@ Make combined simulated Lexome matrix, where combined features from both trainin function make_combined_L_matrix end """ - make_pS_matrix(utterances) + make_pS_matrix(data) Create a discrete semantic matrix given a dataframe. 
@@ -69,27 +69,27 @@ s_obj_train = JudiLing.make_pS_matrix(
 ```
 """
 function make_pS_matrix(
-    utterances;
+    data;
     features_col = :CommunicativeIntention,
     sep_token = "_",
 )
 
     # find out all possible features in this dataset
-    features = unique(vcat(split.(utterances[:, features_col], sep_token)...))
+    features = unique(vcat(split.(data[:, features_col], sep_token)...))
 
     # using dict to store feature names
     f2i = Dict(v => i for (i, v) in enumerate(features))
     i2f = Dict(i => v for (i, v) in enumerate(features))
 
     # find out features for each utterance
-    vs = unique.(split.(utterances[:, features_col], sep_token))
+    vs = unique.(split.(data[:, features_col], sep_token))
 
     # total number of feature in the entire dataset
     # to initialize a sparse matrix
     n_f = sum([length(v) for v in vs])
 
     # initialize sparse matrix components
-    m = size(utterances, 1)
+    m = size(data, 1)
     n = length(i2f)
     I = zeros(Int64, n_f)
     J = zeros(Int64, n_f)
@@ -112,14 +112,14 @@ function make_pS_matrix(
 end
 
 """
-    make_pS_matrix(utterances, utterances_train)
+    make_pS_matrix(data_val, pS_obj)
 
 Construct discrete semantic matrix for the validation datasets given by the
 exemplar in the dataframe, and given the S matrix for the training datasets.
 
 # Obligatory Arguments
-- `utterances::DataFrame`: the dataset
-- `utterances_train::PS_Matrix_Struct`: training PS object
+- `data_val::DataFrame`: the validation dataset
+- `pS_obj::PS_Matrix_Struct`: the PS object computed on the training data
 
 # Optional Arguments
 - `features_col::Symbol=:CommunicativeIntention`: the column name for target
@@ -128,35 +128,35 @@ exemplar in the dataframe, and given the S matrix for the training datasets.
 
 # Examples
 ```julia
 s_obj_val = JudiLing.make_pS_matrix(
-    utterance_val,
+    data_val,
     s_obj_train,
     features_col=:CommunicativeIntention,
     sep_token="_")
 ```
 """
 function make_pS_matrix(
-    utterances,
-    utterances_train;
+    data_val,
+    pS_obj;
     features_col = :CommunicativeIntention,
     sep_token = "_",
 )
 
     # find out all possible features in this dataset
-    features = unique(vcat(split.(utterances[:, features_col], sep_token)...))
+    features = unique(vcat(split.(data_val[:, features_col], sep_token)...))
 
     # using dict to store feature names
-    f2i = utterances_train.f2i
-    i2f = utterances_train.i2f
+    f2i = pS_obj.f2i
+    i2f = pS_obj.i2f
 
     # find out features for each utterance
-    vs = unique.(split.(utterances[:, features_col], sep_token))
+    vs = unique.(split.(data_val[:, features_col], sep_token))
 
     # total number of feature in the entire dataset
     # to initialize a sparse matrix
     n_f = sum([length(v) for v in vs])
 
     # initialize sparse matrix components
-    m = size(utterances, 1)
+    m = size(data_val, 1)
     n = length(i2f)
     I = zeros(Int64, n_f)
     J = zeros(Int64, n_f)
@@ -178,6 +178,73 @@ function make_pS_matrix(
     PS_Matrix_Struct(pS, f2i, i2f)
 end
 
+
+"""
+    make_combined_pS_matrix(
+        data_train,
+        data_val;
+        features_col = :CommunicativeIntention,
+        sep_token = "_",
+    )
+
+Create discrete semantic matrices for a training and a validation dataframe.
+ +# Obligatory Arguments +- `data_train::DataFrame`: the training dataset +- `data_val::DataFrame`: the validation dataset + +# Optional Arguments +- `features_col::Symbol=:CommunicativeIntention`: the column name for target +- `sep_token::String="_"`: separator + +# Examples +```julia +s_obj_train, s_obj_val = JudiLing.make_combined_pS_matrix( + data_train, + data_val, + features_col=:CommunicativeIntention, + sep_token="_") +``` +""" +function make_combined_pS_matrix( + data_train, + data_val; + features_col = :CommunicativeIntention, + sep_token = "_", +) + + data_combined = copy(data_train) + data_val = copy(data_val) + for col in names(data_combined) + data_combined[!, col] = inlinestring2string.(data_combined[!,col]) + data_val[!, col] = inlinestring2string.(data_val[!,col]) + end + append!(data_combined, data_val, promote=true) + + pS_obj_combined = make_pS_matrix( + data_combined; + features_col = features_col, + sep_token = sep_token, + ) + + pS_obj_train = make_pS_matrix( + data_train, + pS_obj_combined; + features_col = features_col, + sep_token = sep_token, + ) + + pS_obj_val = make_pS_matrix( + data_val, + pS_obj_combined; + features_col = features_col, + sep_token = sep_token, + ) + + return pS_obj_train, pS_obj_val +end + + """ make_S_matrix(data::DataFrame, base::Vector, inflections::Vector) diff --git a/src/pyndl.jl b/src/pyndl.jl index 927cb9a..94d1664 100644 --- a/src/pyndl.jl +++ b/src/pyndl.jl @@ -1,5 +1,12 @@ """ -Pyndl object. + Pyndl_Weight_Struct + cues::Vector{String} + outcomes::Vector{String} + weight::Matrix{Float64} + +- `cues::Vector{String}`: Vector of cues, in the order that they appear in the weight matrix. +- `outcomes::Vector{String}`: Vector of outcomes, in the order that they appear in the weight matrix. +- `weight::Matrix{Float64}`: Weight matrix. """ struct Pyndl_Weight_Struct cues::Vector{String} @@ -8,15 +15,33 @@ struct Pyndl_Weight_Struct end """ - pyndl(data_path) + pyndl( + data_path::String; + alpha::Float64 = 0.1, + betas::Tuple{Float64,Float64} = (0.1, 0.1), + method::String = "openmp" + ) + +Compute weights using pyndl. See the documentation of pyndl for more information: https://pyndl.readthedocs.io/en/latest/ + +# Obligatory arguments +- `data_path::String`: Path to an events file as generated by pyndl's preprocess.create_event_file -Perform pyndl. +# Optional arguments +- `alpha::Float64 = 0.1`: α learning rate. +- `betas::Tuple{Float64,Float64} = (0.1, 0.1)`: β_1 and β_2 learning rates +- `method::String = "openmp"`: One of {"openmp", "threading"}. "openmp" only works on Linux. + +# Example +```julia +weights = JudiLing.pyndl("data/latin_train_events.tab.gz") +``` """ function pyndl( - data_path; - alpha = 0.1, - betas = (0.1, 0.1), - method = "openmp" + data_path::String; + alpha::Float64 = 0.1, + betas::Tuple{Float64,Float64} = (0.1, 0.1), + method::String = "openmp" ) ndl = pyimport("pyndl.ndl") @@ -44,9 +69,40 @@ end """ - make_cue_matrix(data::DataFrame, pyndl_weights::Pyndl_Weight_Struct) + make_cue_matrix( + data::DataFrame, + pyndl_weights::Pyndl_Weight_Struct; + grams = 3, + target_col = "Words", + tokenized = false, + sep_token = nothing, + keep_sep = false, + start_end_token = "#", + verbose = false, + ) -Make the cue matrix for pyndl mode. +Make the cue matrix based on a dataframe and weights computed with pyndl. Practically this means that the cues are extracted from the weights object and translated to the JudiLing format. 
+
+# Obligatory arguments
+- `data::DataFrame`: Dataset with all the word types on which the weights were trained.
+- `pyndl_weights::Pyndl_Weight_Struct`: Weights trained with JudiLing.pyndl.
+
+# Optional arguments
+- `grams = 3`: N-gram size (has to match the n-gram granularity of the cues on which the weights were trained).
+- `target_col = "Words"`: Column with target words.
+- `tokenized = false`: Whether the target words are already tokenized.
+- `sep_token = nothing`: The string separating the tokens (only used if `tokenized=true`).
+- `keep_sep = false`: Whether the `sep_token` should be retained in the cues.
+- `start_end_token = "#"`: The string with which to mark word boundaries.
+- `verbose = false`: Verbose mode.
+
+# Example
+```julia
+weights = JudiLing.pyndl("data/latin_train_events.tab.gz")
+data = JudiLing.load_dataset("latin_train.csv")
+cue_obj = JudiLing.make_cue_matrix(data, weights,
+                                   grams = 3, target_col = "Word")
+```
 """
 function make_cue_matrix(
     data::DataFrame,
@@ -137,18 +193,46 @@ function make_cue_matrix(
                     tokenized, sep_token, keep_sep, start_end_token)
 end
 
+
 """
-    make_S_matrix(data_train::DataFrame, data_val::DataFrame, pyndl_weights::Pyndl_Weight_Struct, n_features_columns::Vector)
+    make_S_matrix(
+        data_train::DataFrame,
+        data_val::DataFrame,
+        pyndl_weights::Pyndl_Weight_Struct,
+        n_features_columns::Vector;
+        tokenized::Bool=false,
+        sep_token::String="_"
+    )
 
-Create semantic matrix for pyndl mode
+Create a semantic matrix based on a training and a validation dataframe and weights computed with pyndl. Practically this means that the semantic features are extracted from the weights object and translated to the JudiLing format.
+
+# Obligatory arguments
+- `data_train::DataFrame`: The training dataset.
+- `data_val::DataFrame`: The validation dataset.
+- `pyndl_weights::Pyndl_Weight_Struct`: Weights trained with JudiLing.pyndl.
+- `n_features_columns::Vector`: Vector of columns with the features in the training and validation datasets.
+
+# Optional arguments
+- `tokenized=false`: Whether the features in the `n_features_columns` columns are already tokenized (e.g. `"feature1_feature2_feature3"`).
+- `sep_token="_"`: The string with which the features are separated (only used if `tokenized=false`).
+
+# Example
+```julia
+weights = JudiLing.pyndl("data/latin_train_events.tab.gz")
+S_train, S_val = JudiLing.make_S_matrix(train,
+    val,
+    weights,
+    ["Lexeme", "Person", "Number", "Tense", "Voice", "Mood"],
+    tokenized=false)
+```
 """
 function make_S_matrix(
     data_train::DataFrame,
     data_val::DataFrame,
     pyndl_weights::Pyndl_Weight_Struct,
     n_features_columns::Vector;
-    tokenized=false,
-    sep_token="_"
+    tokenized::Bool=false,
+    sep_token::String="_"
 )
 
     f2i = Dict(v => i for (i, v) in enumerate(pyndl_weights.outcomes))
@@ -185,17 +269,42 @@ function make_S_matrix(
     St_train', St_val'
 end
 
+
 """
-    make_S_matrix(data::DataFrame, pyndl_weights::Pyndl_Weight_Struct, n_features_columns::Vector)
+    make_S_matrix(
+        data::DataFrame,
+        pyndl_weights::Pyndl_Weight_Struct,
+        n_features_columns::Vector;
+        tokenized::Bool=false,
+        sep_token::String="_"
+    )
 
-Create semantic matrix for pyndl mode
+Create a semantic matrix based on a dataframe and weights computed with pyndl. Practically this means that the semantic features are extracted from the weights object and translated to the JudiLing format.
+
+# Obligatory arguments
+- `data::DataFrame`: The dataset with word types.
+- `pyndl_weights::Pyndl_Weight_Struct`: Weights trained with JudiLing.pyndl.
+- `n_features_columns::Vector`: Vector of columns with the features in the dataset.
+
+# Optional arguments
+- `tokenized=false`: Whether the features in the `n_features_columns` columns are already tokenized (e.g. `"feature1_feature2_feature3"`).
+- `sep_token="_"`: The string with which the features are separated (only used if `tokenized=false`).
+
+# Example
+```julia
+weights = JudiLing.pyndl("data/latin_train_events.tab.gz")
+S = JudiLing.make_S_matrix(data,
+    weights,
+    ["Lexeme", "Person", "Number", "Tense", "Voice", "Mood"],
+    tokenized=false)
+```
 """
 function make_S_matrix(
     data::DataFrame,
     pyndl_weights::Pyndl_Weight_Struct,
     n_features_columns::Vector;
-    tokenized=false,
-    sep_token="_"
+    tokenized::Bool=false,
+    sep_token::String="_"
 )
 
     f2i = Dict(v => i for (i, v) in enumerate(pyndl_weights.outcomes))
diff --git a/src/test_combo.jl b/src/test_combo.jl
index 794daf8..a1aa63d 100644
--- a/src/test_combo.jl
+++ b/src/test_combo.jl
@@ -704,74 +704,6 @@ function loading_data_pre_split(
     data_train, data_val
 end
 
-function loading_data_randomly_split(
-    data_path,
-    output_dir_path,
-    data_prefix;
-    train_sample_size = 0,
-    val_sample_size = 0,
-    val_ratio = 0.0,
-    random_seed = 314,
-    verbose = false)
-    verbose && println("Spliting data...")
-
-    train_val_random_split(
-        data_path,
-        output_dir_path,
-        data_prefix,
-        train_sample_size = train_sample_size,
-        val_sample_size = val_sample_size,
-        val_ratio = val_ratio,
-        random_seed = random_seed,
-        verbose = verbose,
-    )
-
-    # load data
-    verbose && println("Loading CSV...")
-    loading_data_pre_split(output_dir_path, data_prefix)
-end
-
-function loading_data_careful_split(
-    data_path,
-    data_prefix,
-    output_dir_path,
-    n_features_columns;
-    train_sample_size = 0,
-    val_sample_size = 0,
-    val_ratio = 0.0,
-    n_grams_target_col = :Word,
-    n_grams_tokenized = false,
-    n_grams_sep_token = nothing,
-    grams = 3,
-    n_grams_keep_sep = false,
-    start_end_token = "#",
-    random_seed = 314,
-    verbose = false)
-
-    verbose && println("Spliting data...")
-    train_val_careful_split(
-        data_path,
-        output_dir_path,
-        data_prefix,
-        n_features_columns,
-        train_sample_size = train_sample_size,
-        val_sample_size = val_sample_size,
-        val_ratio = val_ratio,
-        n_grams_target_col = n_grams_target_col,
-        n_grams_tokenized = n_grams_tokenized,
-        n_grams_sep_token = n_grams_sep_token,
-        grams = grams,
-        n_grams_keep_sep = n_grams_keep_sep,
-        start_end_token = start_end_token,
-        random_seed = random_seed,
-        verbose = verbose,
-    )
-
-    # load data
-    verbose && println("Loading CSV...")
-    loading_data_pre_split(output_dir_path, data_prefix)
-end
-
 function make_cue_train_only(data, grams, target_col, tokenized, sep_token,
                             keep_sep, start_end_token, verbose)
 
diff --git a/src/wh.jl b/src/wh.jl
index 93d0a23..2a5887b 100644
--- a/src/wh.jl
+++ b/src/wh.jl
@@ -1,5 +1,16 @@
 """
-    wh_learn(X, Y)
+    wh_learn(
+        X,
+        Y;
+        eta = 0.01,
+        n_epochs = 1,
+        weights = nothing,
+        learn_seq = nothing,
+        save_history = false,
+        history_cols = nothing,
+        history_rows = nothing,
+        verbose = false,
+    )
 
 Widrow-Hoff Learning.
 
@@ -90,9 +101,21 @@ function wh_learn(
     end
 
 """
-    make_learn_seq(freq)
+    make_learn_seq(freq; random_seed = 314)
 
-Make Widrow-Hoff learning sequence.
+Make Widrow-Hoff learning sequence from frequencies.
+Creates a randomly ordered sequence of indices in which each index appears according to its frequency.
+
+# Obligatory arguments
+- `freq`: Vector with frequencies.
+
+# Optional arguments
+- `random_seed = 314`: Random seed to control randomness.
+
+# Example
+```julia
+learn_seq = JudiLing.make_learn_seq(data.frequency)
+```
 """
 function make_learn_seq(freq; random_seed = 314)
     if isnothing(freq)
diff --git a/test/input_tests.jl b/test/input_tests.jl
index 7acc38e..cc2c374 100644
--- a/test/input_tests.jl
+++ b/test/input_tests.jl
@@ -2,9 +2,9 @@ using JudiLing
 using Test
 
 @testset "load dataset" begin
-    data = JudiLing.load_dataset("data/latin_train.csv")
+    data = JudiLing.load_dataset("data/latin_mini.csv")
 
-    @test size(data,1) == 3
+    @test size(data,1) == 200
 
     @test typeof(data[!, "Word"]) == Vector{String}
 
@@ -12,3 +12,76 @@ using Test
 
     @test size(data,1) == 2
 end
+
+@testset "random_split" begin
+    # testing aliases
+    data_train1, data_val1 = JudiLing.loading_data_randomly_split("data/latin_mini.csv", "data/random_test", "latin",
+                                                                  val_ratio=0.1)
+    data_train2, data_val2 = JudiLing.loading_data_random_split("data/latin_mini.csv", "data/random_test", "latin",
+                                                                val_ratio=0.1)
+    data_train3, data_val3 = JudiLing.load_data_randomly_split("data/latin_mini.csv", "data/random_test", "latin",
+                                                               val_ratio=0.1)
+    data_train4, data_val4 = JudiLing.load_data_random_split("data/latin_mini.csv", "data/random_test", "latin",
+                                                             val_ratio=0.1)
+
+    @test data_train1 == data_train2
+    @test data_train1 == data_train3
+    @test data_train1 == data_train4
+    @test data_val1 == data_val2
+    @test data_val1 == data_val3
+    @test data_val1 == data_val4
+
+    # testing sizes
+    data = JudiLing.load_dataset("data/latin_mini.csv")
+    target_val_size = round(Int64, size(data,1) * 0.1)
+    @test size(data_val1, 1) == target_val_size
+    @test size(data_train1, 1) == size(data,1) - target_val_size
+
+    # testing non-overlap (this only works because the words in latin_mini are unique)
+    @test length(intersect(Set(data_train1.Word), Set(data_val1.Word))) == 0
+
+    # clean up
+    rm("data/random_test", recursive=true)
+end
+
+
+@testset "careful_split" begin
+    # testing aliases
+    data_train1, data_val1 = JudiLing.loading_data_careful_split("data/latin_mini.csv", "latin", "data/careful_test",
+                                                                 ["Lexeme","Person","Number","Tense","Voice","Mood"],
+                                                                 val_ratio=0.1, n_grams_target_col = "Word")
+    data_train2, data_val2 = JudiLing.load_data_careful_split("data/latin_mini.csv", "latin", "data/careful_test",
+                                                              ["Lexeme","Person","Number","Tense","Voice","Mood"],
+                                                              val_ratio=0.1, n_grams_target_col = "Word")
+    data_train3, data_val3 = JudiLing.load_data_carefully_split("data/latin_mini.csv", "latin", "data/careful_test",
+                                                                ["Lexeme","Person","Number","Tense","Voice","Mood"],
+                                                                val_ratio=0.1, n_grams_target_col = "Word")
+    data_train4, data_val4 = JudiLing.loading_data_carefully_split("data/latin_mini.csv", "latin", "data/careful_test",
+                                                                   ["Lexeme","Person","Number","Tense","Voice","Mood"],
+                                                                   val_ratio=0.1, n_grams_target_col = "Word")
+
+    @test data_train1 == data_train2
+    @test data_train1 == data_train3
+    @test data_train1 == data_train4
+    @test data_val1 == data_val2
+    @test data_val1 == data_val3
+    @test data_val1 == data_val4
+
+    # testing sizes
+    data = JudiLing.load_dataset("data/latin_mini.csv")
+    target_val_size = round(Int64, size(data, 1) * 0.1)
+    @test size(data_val1, 1) == target_val_size
+    @test size(data_train1, 1) == size(data,1) - target_val_size
+
+    # testing non-overlap (this only works because the words in latin_mini are unique)
+    @test length(intersect(Set(data_train1.Word), Set(data_val1.Word))) == 0
+
+    # testing that all the unique features do indeed occur in both training and validation data
+    for col in ["Lexeme","Person","Number","Tense","Voice","Mood"]
+        @test 
length(setdiff(Set(data[:, col]), Set(data_train1[:, col]))) == 0 + @test length(setdiff(Set(data[:, col]), Set(data_val1[:, col]))) == 0 + end + + # clean up + rm("data/careful_test", recursive=true) +end diff --git a/test/make_semantic_matrix_tests.jl b/test/make_semantic_matrix_tests.jl index 3a2902f..e40537f 100644 --- a/test/make_semantic_matrix_tests.jl +++ b/test/make_semantic_matrix_tests.jl @@ -4,15 +4,48 @@ using Test using DataFrames @testset "make prelinguistic semantic matrix for utterance" begin + + function test_cues(features, idx, s_obj) + some_cues = split(features[idx], "_") + tgt_vec = zeros(length(keys(s_obj.f2i))) + for cue in some_cues + i = s_obj.f2i[cue] + tgt_vec[i] = 1 + end + @test tgt_vec == s_obj.pS[idx,:] + end + utterance = DataFrame(CSV.File(joinpath( @__DIR__, "data", "utterance_mini.csv", ))) + + cues_split = [split(d, "_") for d in utterance.CommunicativeIntention] + unique_cues = Set(vcat(cues_split...)) + s_obj_train = JudiLing.make_pS_matrix(utterance) + @test length(unique_cues) == size(s_obj_train.pS, 2) + test_cues(utterance.CommunicativeIntention, 5, s_obj_train) + test_cues(utterance.CommunicativeIntention, 15, s_obj_train) + test_cues(utterance.CommunicativeIntention, 23, s_obj_train) + utterance_val = utterance[101:end, :] s_obj_val = JudiLing.make_pS_matrix(utterance_val, s_obj_train) + test_cues(utterance_val.CommunicativeIntention, 5, s_obj_val) + test_cues(utterance_val.CommunicativeIntention, 6, s_obj_val) + + @test length(unique_cues) == size(s_obj_val.pS, 2) + + s_obj_train, s_obj_val = JudiLing.make_combined_pS_matrix(utterance, utterance_val) + + @test length(unique_cues) == size(s_obj_train.pS, 2) + test_cues(utterance.CommunicativeIntention, 5, s_obj_train) + test_cues(utterance.CommunicativeIntention, 15, s_obj_train) + test_cues(utterance.CommunicativeIntention, 23, s_obj_train) + test_cues(utterance_val.CommunicativeIntention, 5, s_obj_val) + test_cues(utterance_val.CommunicativeIntention, 6, s_obj_val) end @testset "make semantic matrix" begin diff --git a/test/pyndl_tests.jl b/test/pyndl_tests.jl index 5909cf7..637f542 100644 --- a/test/pyndl_tests.jl +++ b/test/pyndl_tests.jl @@ -23,7 +23,8 @@ cue_obj_train, cue_obj_val = JudiLing.make_combined_cue_matrix(train, val, targe train, weights_latin; grams = 3, - target_col = "Word" + target_col = "Word", + verbose=true ); cue_obj_val = JudiLing.make_cue_matrix( diff --git a/test/runtests.jl b/test/runtests.jl index 5f4a753..23ef8a6 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -1,7 +1,9 @@ using SafeTestsets -@safetestset "pyndl tests" begin - include("pyndl_tests.jl") +if !Sys.iswindows() + @safetestset "pyndl tests" begin + include("pyndl_tests.jl") + end end @safetestset "input tests" begin
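Taken together, the pieces added in this diff compose into a short end-to-end workflow: split the data carefully, train NDL weights with pyndl, and translate them into JudiLing cue and semantic matrices. Below is a minimal sketch using only the functions documented above; the paths `latin.csv` and `data/latin_train_events.tab.gz` are placeholders, and the event file is assumed to have been created beforehand with pyndl's `preprocess.create_event_file` in Python.

```julia
# Minimal sketch; file paths are placeholders, not shipped test data.
using PyCall   # per docs/src/man/pyndl.md, PyCall must be loaded before JudiLing
using JudiLing

# Careful split: every feature value and every n-gram occurs in both partitions.
# Note the data_prefix-before-output_dir_path argument order.
data_train, data_val = JudiLing.loading_data_careful_split(
    "latin.csv",   # data_path
    "latin",       # data_prefix
    "careful",     # output_dir_path
    ["Lexeme", "Person", "Number", "Tense", "Voice", "Mood"],
    val_ratio = 0.1,
    n_grams_target_col = :Word,
)

# NDL weights trained with pyndl on the pre-built event file.
weights = JudiLing.pyndl("data/latin_train_events.tab.gz")

# Translate the pyndl weights into JudiLing cue and semantic matrices.
cue_obj = JudiLing.make_cue_matrix(data_train, weights,
                                   grams = 3, target_col = "Word")
S_train, S_val = JudiLing.make_S_matrix(data_train, data_val, weights,
                                        ["Lexeme", "Person", "Number", "Tense", "Voice", "Mood"])
```

If the dataset additionally has a token frequency column, `JudiLing.make_learn_seq(data_train.frequency)` produces a frequency-weighted learning sequence that can be passed to `wh_learn` via its `learn_seq` keyword.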