diff --git a/R/distinct_prototypes.R b/R/distinct_prototypes.R index 9a71fef..1412b13 100644 --- a/R/distinct_prototypes.R +++ b/R/distinct_prototypes.R @@ -8,9 +8,12 @@ #' @examples #' distinct_prototypes(list(1,2,34,1)) distinct_prototypes = function(prototypes){ - for(i in 1:(length(prototypes)-1)){ - dist_prototypes = distance_to_prototypes(prototypes[[i]], lapply((i+1):length(prototypes), function(j){prototypes[[j]]}))$dist - if(dist_prototypes == 0){return(FALSE)} + if(length(prototypes) == 1){return(TRUE)} + else{ + for(i in 1:(length(prototypes)-1)){ + dist_prototypes = distance_to_prototypes(prototypes[[i]], lapply((i+1):length(prototypes), function(j){prototypes[[j]]}))$dist + if(dist_prototypes == 0){return(FALSE)} + } + return(TRUE) } - return(TRUE) } diff --git a/R/find_prototypes.R b/R/find_prototypes.R index dca8651..2d5d85a 100644 --- a/R/find_prototypes.R +++ b/R/find_prototypes.R @@ -15,7 +15,6 @@ #' @param threshold A real positive number. When the distance between the new centroids and the previous ones is lower than this value, then we stop the algorithm. #' @param trace A boolean. If TRUE, tracing information on the progress of the algorithm is produced. Default is FALSE. #' @param bias A vector indicating the bias that came out when computing the importance sampling estimators of the membership probabilities. Each element of the vector is associated to a Voronoi cell. Default is 0 for all Voronoi cells. -#' @param index_sampling_error Required only if method_IS = "percell". Indicates which of the data samples must be used for the computation of the quantization error. #' @param all_starts A boolean indicating whether the function should return the optimal prototypes obtained for each start. #' @param seed An optional random seed. #' @param batch A boolean indicating whether the computations must be performed by batch or not. If TRUE, data, cell_numbers and density_ratio must be lists. Default is False. @@ -44,7 +43,7 @@ #' find_prototypes(nb_cells = 3, data = data, #' multistart = 2, distance_func = distance_func) -find_prototypes = function(starting_proto = NULL, nb_cells = NULL, data = NULL, multistart = 1, method_IS = "unique", sampling_cells = 1:length(data), density_ratio = rep(1, dim(data)[length(dim(data))]), budget = 10^3, threshold = 0, distance_func = function(A1,A2){return(sqrt(sum((A1-A2)^2)))},print_progress = FALSE, trace = FALSE, all_starts = FALSE,bias = NULL, index_sampling_error = 1, seed = NULL, batch = FALSE, density_function = NULL, density_biased_function = NULL, inputs_ref = NULL, data_ref = NULL, inputs_function = NULL, outputs_function = NULL){ +find_prototypes = function(starting_proto = NULL, nb_cells = NULL, data = NULL, multistart = 1, method_IS = "unique", sampling_cells = 1:length(data), density_ratio = rep(1, dim(data)[length(dim(data))]), budget = 10^3, threshold = 0, distance_func = function(A1,A2){return(sqrt(sum((A1-A2)^2)))},print_progress = FALSE, trace = FALSE, all_starts = FALSE,bias = NULL, seed = NULL, batch = FALSE, density_function = NULL, density_biased_function = NULL, inputs_ref = NULL, data_ref = NULL, inputs_function = NULL, outputs_function = NULL){ data_bool = is.null(data) if(is.null(dim(data)) & !is.null(data)){data = t(as.matrix(data))} if(!is.null(seed)){set.seed(seed)} @@ -56,10 +55,28 @@ find_prototypes = function(starting_proto = NULL, nb_cells = NULL, data = NULL, if(print_progress){print(paste("start number", it))} if(is.null(starting_proto)){ if(is.null(nb_cells)){stop("nb_cells must me provided if starting_proto is not")} - are_distinct = FALSE - while(are_distinct == FALSE){ - prototypes_it = lapply(sample(x = 1:dim(data)[length(dim(data))], size = nb_cells), function(i){asub(data,dims = length(dim(data)), idx = i)}) - are_distinct = distinct_prototypes(prototypes_it) + samples_init = c() + prototypes_it = list() + while(length(prototypes_it) < nb_cells){ + if(method_IS == "unique"){ + idx = sample(x = setdiff(1:dim(data)[length(dim(data))], samples_init), size = 1) + samples_init = c(samples_init, idx) + prototypes_prov = c(prototypes_it, list(asub(data,dims = length(dim(data)), idx = idx))) + if(distinct_prototypes(prototypes_prov)){prototypes_it = prototypes_prov} + } + else if(!data_bool & method_IS == "percell"){ + idx_sample = sample(1:length(data), size = 1) + idx = sample(x = 1:dim(data[[idx_sample]])[length(dim(data[[idx_sample]]))], size = 1) + samples_init = c(samples_init, idx) + prototypes_prov = c(prototypes_it, list(asub(data[[idx_sample]],dims = length(dim(data[[idx_sample]])), idx = idx))) + if(distinct_prototypes(prototypes_prov)){prototypes_it = prototypes_prov} + } + else if(data_bool & method_IS == "percell"){ + idx = sample(x = setdiff(1:dim(data_ref)[length(dim(data_ref))], samples_init), size = 1) + samples_init = c(samples_init, idx) + prototypes_prov = c(prototypes_it, list(asub(data_ref,dims = length(dim(data_ref)), idx = idx))) + if(distinct_prototypes(prototypes_prov)){prototypes_it = prototypes_prov} + } } } else{ @@ -100,7 +117,7 @@ find_prototypes = function(starting_proto = NULL, nb_cells = NULL, data = NULL, } if(method_IS == "percell"){ batch_size = NULL - error = quanti_error(data = data[[index_sampling_error]], prototypes = prototypes_it, density_ratio = density_ratio[[index_sampling_error]], batch_size = batch_size, distance_func = distance_func) + error = quanti_error(data = data, prototypes = prototypes_it, density_ratio = density_ratio, batch_size = batch_size, distance_func = distance_func, method_IS = method_IS, sampling_cells = sampling_cells) } else if(method_IS == "unique"){ if(batch){batch_size = dim(data[[1]])[length(dim(data[[1]]))]} diff --git a/R/fit_metamodel.R b/R/fit_metamodel.R index bcf8a45..56c5e28 100644 --- a/R/fit_metamodel.R +++ b/R/fit_metamodel.R @@ -51,7 +51,7 @@ #' control_classification = list(nodesize = 4), threshold_classification = 2) fit_metamodel = function(design_train, outputs_train, seed = NULL, ncoeff,npc,kernel="matern5_2", wf = "d4", boundary = "periodic",J=1, - regmodel = "constant", normalize = FALSE, optim = "BFGS", objective = "LL", parameters = NULL,noise=FALSE, nugget = FALSE, classification = FALSE, control_classification = NULL,threshold_classification = NULL,threshold_fpca = NULL){ + regmodel = "constant", normalize = FALSE, optim = "BFGS", objective = "LL", parameters = NULL,noise=FALSE, nugget = FALSE, classification = FALSE, control_classification = NULL,threshold_classification = 0,threshold_fpca = NULL){ if(is.null(threshold_fpca)){threshold_fpca = threshold_classification} pred_fpca = TRUE if(classification){ diff --git a/R/probas_loo.R b/R/probas_loo.R index 2bcae54..63c9029 100644 --- a/R/probas_loo.R +++ b/R/probas_loo.R @@ -86,5 +86,7 @@ probas_loo = function(outputs, density_ratio, prototypes, distance_func = functi probas_pred_df = rbind(probas_pred_df, c(as.numeric(grid_cv[i,]), probas_pred_cv)) relative_error_df = rbind(relative_error_df, c(as.numeric(grid_cv[i,]), abs(probas_pred_cv - probas_true)/probas_true)) } + colnames(relative_error_df) = c("ncoeff", "npc", 1:(ncol(relative_error_df)-2)) + colnames(probas_pred_df) = c("ncoeff", "npc", 1:(ncol(probas_pred_df)-2)) return(list(probas_pred = probas_pred_df, error = relative_error_df, outputs_pred = outputs_loo_list)) } diff --git a/R/probas_training_test.R b/R/probas_training_test.R index 92682ed..e8dc355 100644 --- a/R/probas_training_test.R +++ b/R/probas_training_test.R @@ -91,5 +91,7 @@ probas_training_test = function(outputs_train,outputs_test, density_ratio, proto probas_pred_df = rbind(probas_pred_df, c(as.numeric(grid_cv[i,]), probas_pred_cv)) relative_error_df = rbind(relative_error_df, c(as.numeric(grid_cv[i,]), abs(probas_pred_cv - probas_true)/probas_true)) } + colnames(relative_error_df) = c("ncoeff", "npc", 1:(ncol(relative_error_df)-2)) + colnames(probas_pred_df) = c("ncoeff", "npc", 1:(ncol(probas_pred_df)-2)) return(list(probas_pred = probas_pred_df, error = relative_error_df, model_tuning = model_tuning, outputs_pred = outputs_pred_list)) } diff --git a/R/rf_proba_k_fold.R b/R/rf_proba_k_fold.R index bfb294b..314f0fe 100644 --- a/R/rf_proba_k_fold.R +++ b/R/rf_proba_k_fold.R @@ -121,6 +121,8 @@ rf_probas_k_fold = function(design, outputs, threshold_classification, threshold probas_pred_df = rbind(probas_pred_df,probas_pred_cv) relative_error_df = rbind(relative_error_df, abs(probas_pred_cv - probas_true)/probas_true) } + colnames(relative_error_df) = c("ncoeff", "npc", 1:(ncol(relative_error_df)-2)) + colnames(probas_pred_df) = c("ncoeff", "npc", 1:(ncol(probas_pred_df)-2)) if(return_pred == FALSE){outputs_pred = NULL} return(list(list_search = list_search,probas_pred = probas_pred_df, error = relative_error_df, outputs_pred = outputs_pred)) } diff --git a/R/rf_proba_training_test.R b/R/rf_proba_training_test.R index 42aa15e..46d6666 100644 --- a/R/rf_proba_training_test.R +++ b/R/rf_proba_training_test.R @@ -127,6 +127,8 @@ rf_probas_training_test = function(design_train, design_test, outputs_train, out relative_error[probas_pred_cv != 0 & probas_true == 0] = Inf relative_error_df = rbind(relative_error_df, relative_error) } + colnames(relative_error_df) = c("ncoeff", "npc", 1:(ncol(relative_error_df)-2)) + colnames(probas_pred_df) = c("ncoeff", "npc", 1:(ncol(probas_pred_df)-2)) if(return_pred == FALSE){outputs_pred = NULL} return(list(list_search = list_search,probas_pred = probas_pred_df, error = relative_error_df, outputs_pred = outputs_pred)) } diff --git a/R/std_centroid.R b/R/std_centroid.R index 96810e1..5adf934 100644 --- a/R/std_centroid.R +++ b/R/std_centroid.R @@ -34,7 +34,8 @@ #' list_std_centroid = std_centroid(data = data, prototypes_list = #' prototypes_list, density_ratio = density_ratio, distance_func = distance_func #' , cells = 1:length(prototypes_list[[1]]), nv = 50) -std_centroid = function(data = NULL, prototypes_list, density_ratio = rep(1,dim(data)[length(data)]), distance_func = function(A1,A2){return(sqrt(sum((A1-A2)^2)))},cells, cell_numbers = NULL, nv,outputs_function = NULL, inputs = NULL, batch_size = nrow(inputs), return_cell_numbers = FALSE, bootstrap = NULL){ +std_centroid = function(data = NULL, prototypes_list, density_ratio = rep(1,dim(data)[length(dim(data))]), distance_func = function(A1,A2){return(sqrt(sum((A1-A2)^2)))},cells, cell_numbers = NULL, nv = NULL,outputs_function = NULL, inputs = NULL, batch_size = nrow(inputs), return_cell_numbers = FALSE, bootstrap = NULL){ + if(is.null(nv)){nv = length(density_ratio)} bool_cell_numbers = is.null(cell_numbers) std_ratio_list = list() if(is.null(data)){ @@ -92,10 +93,6 @@ std_centroid = function(data = NULL, prototypes_list, density_ratio = rep(1,dim( for(boot in 1:bootstrap){ idxs = sample(1:length(cell_numbers_it), size = length(cell_numbers_it), replace = TRUE) cell_numbers_boot = cell_numbers_it[idxs] - data <<- asub(data,dims = length(dim(data)), idx = idxs) - density_ratio <<- density_ratio[idxs] - cell_numbers <<- cell_numbers_boot - aa <<- compute_centroids_and_proba(data = asub(data,dims = length(dim(data)), idx = idxs), density_ratio = density_ratio[idxs], method_IS = "unique",cell_numbers = cell_numbers_boot, cells = cells)$centroids list_centro_boot[[boot]] = compute_centroids_and_proba(data = asub(data,dims = length(dim(data)), idx = idxs), density_ratio = density_ratio[idxs], method_IS = "unique",cell_numbers = cell_numbers_boot, cells = cells)$centroids } for(cell in 1:length(cells)){ diff --git a/R/std_proba.R b/R/std_proba.R index 906aa79..38def41 100644 --- a/R/std_proba.R +++ b/R/std_proba.R @@ -34,7 +34,8 @@ #' list_std_proba = std_proba(data = data, prototypes_list = prototypes_list, #' density_ratio = density_ratio, distance_func = distance_func, #' cells = 1:length(prototypes_list[[1]]), nv = 50) -std_proba = function(data = NULL, prototypes_list, density_ratio = rep(1,dim(data)[length(data)]), distance_func = function(A1,A2){return(sqrt(sum((A1-A2)^2)))},cells, cell_numbers = NULL, nv, outputs_function = NULL, inputs = NULL, batch_size = NULL, return_cell_numbers = FALSE, bootstrap = NULL){ +std_proba = function(data = NULL, prototypes_list, density_ratio = rep(1,dim(data)[length(dim(data))]), distance_func = function(A1,A2){return(sqrt(sum((A1-A2)^2)))},cells, cell_numbers = NULL, nv = NULL, outputs_function = NULL, inputs = NULL, batch_size = NULL, return_cell_numbers = FALSE, bootstrap = NULL){ + if(is.null(nv)){nv = length(density_ratio)} std_list = list() if(is.null(data) & is.null(cell_numbers)){ nb_batch = nrow(inputs)%/%batch_size diff --git a/man/find_prototypes.Rd b/man/find_prototypes.Rd index 701df72..f061e6f 100644 --- a/man/find_prototypes.Rd +++ b/man/find_prototypes.Rd @@ -21,7 +21,6 @@ find_prototypes( trace = FALSE, all_starts = FALSE, bias = NULL, - index_sampling_error = 1, seed = NULL, batch = FALSE, density_function = NULL, @@ -61,8 +60,6 @@ find_prototypes( \item{bias}{A vector indicating the bias that came out when computing the importance sampling estimators of the membership probabilities. Each element of the vector is associated to a Voronoi cell. Default is 0 for all Voronoi cells.} -\item{index_sampling_error}{Required only if method_IS = "percell". Indicates which of the data samples must be used for the computation of the quantization error.} - \item{seed}{An optional random seed.} \item{batch}{A boolean indicating whether the computations must be performed by batch or not. If TRUE, data, cell_numbers and density_ratio must be lists. Default is False.} diff --git a/man/fit_metamodel.Rd b/man/fit_metamodel.Rd index 8f5ca56..6fa266c 100644 --- a/man/fit_metamodel.Rd +++ b/man/fit_metamodel.Rd @@ -23,7 +23,7 @@ fit_metamodel( nugget = FALSE, classification = FALSE, control_classification = NULL, - threshold_classification = NULL, + threshold_classification = 0, threshold_fpca = NULL ) } diff --git a/man/std_centroid.Rd b/man/std_centroid.Rd index d6e6085..4e223ac 100644 --- a/man/std_centroid.Rd +++ b/man/std_centroid.Rd @@ -7,13 +7,13 @@ std_centroid( data = NULL, prototypes_list, - density_ratio = rep(1, dim(data)[length(data)]), + density_ratio = rep(1, dim(data)[length(dim(data))]), distance_func = function(A1, A2) { return(sqrt(sum((A1 - A2)^2))) }, cells, cell_numbers = NULL, - nv, + nv = NULL, outputs_function = NULL, inputs = NULL, batch_size = nrow(inputs), diff --git a/man/std_proba.Rd b/man/std_proba.Rd index 426fff5..8ff3a36 100644 --- a/man/std_proba.Rd +++ b/man/std_proba.Rd @@ -7,13 +7,13 @@ std_proba( data = NULL, prototypes_list, - density_ratio = rep(1, dim(data)[length(data)]), + density_ratio = rep(1, dim(data)[length(dim(data))]), distance_func = function(A1, A2) { return(sqrt(sum((A1 - A2)^2))) }, cells, cell_numbers = NULL, - nv, + nv = NULL, outputs_function = NULL, inputs = NULL, batch_size = NULL,