formation

charliesire · Sep 19, 2023 · 72782d4 · 72782d4
1 parent 81d17dd
commit 72782d4
Show file tree

Hide file tree

Showing 13 changed files with 49 additions and 26 deletions.
diff --git a/R/distinct_prototypes.R b/R/distinct_prototypes.R
@@ -8,9 +8,12 @@
 #' @examples
 #' distinct_prototypes(list(1,2,34,1))
 distinct_prototypes = function(prototypes){
-  for(i in 1:(length(prototypes)-1)){
-    dist_prototypes = distance_to_prototypes(prototypes[[i]], lapply((i+1):length(prototypes), function(j){prototypes[[j]]}))$dist
-    if(dist_prototypes == 0){return(FALSE)}
+  if(length(prototypes) == 1){return(TRUE)}
+  else{
+    for(i in 1:(length(prototypes)-1)){
+      dist_prototypes = distance_to_prototypes(prototypes[[i]], lapply((i+1):length(prototypes), function(j){prototypes[[j]]}))$dist
+      if(dist_prototypes == 0){return(FALSE)}
+    }
+    return(TRUE)
   }
-  return(TRUE)
 }
diff --git a/R/find_prototypes.R b/R/find_prototypes.R
@@ -15,7 +15,6 @@
 #' @param threshold A real positive number. When the distance between the new centroids and the previous ones is lower than this value, then we stop the algorithm.
 #' @param trace A boolean. If TRUE, tracing information on the progress of the algorithm is produced. Default is FALSE.
 #' @param bias A vector indicating the bias that came out when computing the importance sampling estimators of the membership probabilities. Each element of the vector is associated to a Voronoi cell. Default is 0 for all Voronoi cells.
-#' @param index_sampling_error Required only if method_IS = "percell". Indicates which of the data samples must be used for the computation of the quantization error.
 #' @param all_starts A boolean indicating whether the function should return the optimal prototypes obtained for each start.
 #' @param seed An optional random seed.
 #' @param batch A boolean indicating whether the computations must be performed by batch or not. If TRUE, data, cell_numbers and density_ratio must be lists. Default is False.
@@ -44,7 +43,7 @@
 #' find_prototypes(nb_cells = 3, data = data,
 #' multistart = 2, distance_func = distance_func)
 
-find_prototypes = function(starting_proto = NULL, nb_cells = NULL, data = NULL, multistart = 1, method_IS = "unique", sampling_cells = 1:length(data), density_ratio = rep(1, dim(data)[length(dim(data))]), budget = 10^3, threshold = 0, distance_func = function(A1,A2){return(sqrt(sum((A1-A2)^2)))},print_progress = FALSE, trace = FALSE, all_starts = FALSE,bias = NULL, index_sampling_error = 1, seed = NULL, batch = FALSE, density_function = NULL, density_biased_function = NULL, inputs_ref = NULL, data_ref = NULL, inputs_function = NULL, outputs_function = NULL){
+find_prototypes = function(starting_proto = NULL, nb_cells = NULL, data = NULL, multistart = 1, method_IS = "unique", sampling_cells = 1:length(data), density_ratio = rep(1, dim(data)[length(dim(data))]), budget = 10^3, threshold = 0, distance_func = function(A1,A2){return(sqrt(sum((A1-A2)^2)))},print_progress = FALSE, trace = FALSE, all_starts = FALSE,bias = NULL, seed = NULL, batch = FALSE, density_function = NULL, density_biased_function = NULL, inputs_ref = NULL, data_ref = NULL, inputs_function = NULL, outputs_function = NULL){
   data_bool = is.null(data)
   if(is.null(dim(data)) & !is.null(data)){data = t(as.matrix(data))}
   if(!is.null(seed)){set.seed(seed)}
@@ -56,10 +55,28 @@ find_prototypes = function(starting_proto = NULL, nb_cells = NULL, data = NULL,
     if(print_progress){print(paste("start number", it))}
     if(is.null(starting_proto)){
       if(is.null(nb_cells)){stop("nb_cells must me provided if starting_proto is not")}
-      are_distinct = FALSE
-      while(are_distinct == FALSE){
-        prototypes_it = lapply(sample(x = 1:dim(data)[length(dim(data))], size = nb_cells), function(i){asub(data,dims = length(dim(data)), idx = i)})
-        are_distinct = distinct_prototypes(prototypes_it)
+      samples_init = c()
+      prototypes_it = list()
+      while(length(prototypes_it) < nb_cells){
+        if(method_IS == "unique"){
+          idx = sample(x = setdiff(1:dim(data)[length(dim(data))], samples_init), size = 1)
+          samples_init = c(samples_init, idx)
+          prototypes_prov = c(prototypes_it, list(asub(data,dims = length(dim(data)), idx = idx)))
+          if(distinct_prototypes(prototypes_prov)){prototypes_it = prototypes_prov}
+        }
+        else if(!data_bool & method_IS == "percell"){
+          idx_sample = sample(1:length(data), size = 1)
+          idx = sample(x = 1:dim(data[[idx_sample]])[length(dim(data[[idx_sample]]))], size = 1)
+          samples_init = c(samples_init, idx)
+          prototypes_prov = c(prototypes_it, list(asub(data[[idx_sample]],dims = length(dim(data[[idx_sample]])), idx = idx)))
+          if(distinct_prototypes(prototypes_prov)){prototypes_it = prototypes_prov}
+        }
+        else if(data_bool & method_IS == "percell"){
+          idx = sample(x = setdiff(1:dim(data_ref)[length(dim(data_ref))], samples_init), size = 1)
+          samples_init = c(samples_init, idx)
+          prototypes_prov = c(prototypes_it, list(asub(data_ref,dims = length(dim(data_ref)), idx = idx)))
+          if(distinct_prototypes(prototypes_prov)){prototypes_it = prototypes_prov}
+        }
       }
     }
     else{
@@ -100,7 +117,7 @@ find_prototypes = function(starting_proto = NULL, nb_cells = NULL, data = NULL,
     }
     if(method_IS == "percell"){
       batch_size = NULL
-      error = quanti_error(data = data[[index_sampling_error]], prototypes = prototypes_it, density_ratio = density_ratio[[index_sampling_error]], batch_size = batch_size, distance_func = distance_func)
+      error = quanti_error(data = data, prototypes = prototypes_it, density_ratio = density_ratio, batch_size = batch_size, distance_func = distance_func, method_IS = method_IS, sampling_cells = sampling_cells)
     }
     else if(method_IS == "unique"){
       if(batch){batch_size = dim(data[[1]])[length(dim(data[[1]]))]}

diff --git a/R/fit_metamodel.R b/R/fit_metamodel.R
@@ -51,7 +51,7 @@
 #' control_classification = list(nodesize = 4), threshold_classification = 2)
 
 fit_metamodel = function(design_train, outputs_train, seed = NULL, ncoeff,npc,kernel="matern5_2", wf = "d4", boundary = "periodic",J=1,
-                             regmodel = "constant", normalize = FALSE, optim = "BFGS", objective = "LL", parameters = NULL,noise=FALSE, nugget = FALSE, classification = FALSE, control_classification = NULL,threshold_classification = NULL,threshold_fpca = NULL){
+                             regmodel = "constant", normalize = FALSE, optim = "BFGS", objective = "LL", parameters = NULL,noise=FALSE, nugget = FALSE, classification = FALSE, control_classification = NULL,threshold_classification = 0,threshold_fpca = NULL){
   if(is.null(threshold_fpca)){threshold_fpca = threshold_classification}
   pred_fpca = TRUE
   if(classification){

diff --git a/R/probas_loo.R b/R/probas_loo.R
@@ -86,5 +86,7 @@ probas_loo = function(outputs, density_ratio, prototypes, distance_func = functi
     probas_pred_df = rbind(probas_pred_df, c(as.numeric(grid_cv[i,]), probas_pred_cv))
     relative_error_df = rbind(relative_error_df, c(as.numeric(grid_cv[i,]), abs(probas_pred_cv - probas_true)/probas_true))
   }
+  colnames(relative_error_df) = c("ncoeff", "npc", 1:(ncol(relative_error_df)-2))
+  colnames(probas_pred_df) = c("ncoeff", "npc", 1:(ncol(probas_pred_df)-2))
   return(list(probas_pred = probas_pred_df, error = relative_error_df, outputs_pred = outputs_loo_list))
 }
diff --git a/R/probas_training_test.R b/R/probas_training_test.R
@@ -91,5 +91,7 @@ probas_training_test = function(outputs_train,outputs_test, density_ratio, proto
     probas_pred_df = rbind(probas_pred_df, c(as.numeric(grid_cv[i,]), probas_pred_cv))
     relative_error_df = rbind(relative_error_df, c(as.numeric(grid_cv[i,]), abs(probas_pred_cv - probas_true)/probas_true))
   }
+  colnames(relative_error_df) = c("ncoeff", "npc", 1:(ncol(relative_error_df)-2))
+  colnames(probas_pred_df) = c("ncoeff", "npc", 1:(ncol(probas_pred_df)-2))
   return(list(probas_pred = probas_pred_df, error = relative_error_df, model_tuning = model_tuning, outputs_pred = outputs_pred_list))
 }
diff --git a/R/rf_proba_k_fold.R b/R/rf_proba_k_fold.R
@@ -121,6 +121,8 @@ rf_probas_k_fold = function(design, outputs, threshold_classification, threshold
     probas_pred_df = rbind(probas_pred_df,probas_pred_cv)
     relative_error_df = rbind(relative_error_df, abs(probas_pred_cv - probas_true)/probas_true)
   }
+  colnames(relative_error_df) = c("ncoeff", "npc", 1:(ncol(relative_error_df)-2))
+  colnames(probas_pred_df) = c("ncoeff", "npc", 1:(ncol(probas_pred_df)-2))
   if(return_pred == FALSE){outputs_pred = NULL}
   return(list(list_search = list_search,probas_pred = probas_pred_df, error = relative_error_df, outputs_pred = outputs_pred))
 }

diff --git a/R/rf_proba_training_test.R b/R/rf_proba_training_test.R
@@ -127,6 +127,8 @@ rf_probas_training_test = function(design_train, design_test, outputs_train, out
     relative_error[probas_pred_cv != 0 & probas_true == 0] = Inf
     relative_error_df = rbind(relative_error_df, relative_error)
   }
+  colnames(relative_error_df) = c("ncoeff", "npc", 1:(ncol(relative_error_df)-2))
+  colnames(probas_pred_df) = c("ncoeff", "npc", 1:(ncol(probas_pred_df)-2))
   if(return_pred == FALSE){outputs_pred = NULL}
   return(list(list_search = list_search,probas_pred = probas_pred_df, error = relative_error_df, outputs_pred = outputs_pred))
 }
diff --git a/R/std_centroid.R b/R/std_centroid.R
@@ -34,7 +34,8 @@
 #' list_std_centroid = std_centroid(data = data, prototypes_list =
 #' prototypes_list, density_ratio = density_ratio, distance_func = distance_func
 #' , cells = 1:length(prototypes_list[[1]]), nv = 50)
-std_centroid = function(data = NULL, prototypes_list, density_ratio = rep(1,dim(data)[length(data)]), distance_func = function(A1,A2){return(sqrt(sum((A1-A2)^2)))},cells, cell_numbers = NULL, nv,outputs_function = NULL, inputs = NULL, batch_size = nrow(inputs), return_cell_numbers = FALSE, bootstrap = NULL){
+std_centroid = function(data = NULL, prototypes_list, density_ratio = rep(1,dim(data)[length(dim(data))]), distance_func = function(A1,A2){return(sqrt(sum((A1-A2)^2)))},cells, cell_numbers = NULL, nv = NULL,outputs_function = NULL, inputs = NULL, batch_size = nrow(inputs), return_cell_numbers = FALSE, bootstrap = NULL){
+  if(is.null(nv)){nv = length(density_ratio)}
   bool_cell_numbers = is.null(cell_numbers)
   std_ratio_list = list()
   if(is.null(data)){
@@ -92,10 +93,6 @@ std_centroid = function(data = NULL, prototypes_list, density_ratio = rep(1,dim(
         for(boot in 1:bootstrap){
           idxs = sample(1:length(cell_numbers_it), size = length(cell_numbers_it), replace = TRUE)
           cell_numbers_boot = cell_numbers_it[idxs]
-          data <<- asub(data,dims = length(dim(data)), idx = idxs)
-          density_ratio <<- density_ratio[idxs]
-          cell_numbers <<- cell_numbers_boot
-          aa <<- compute_centroids_and_proba(data = asub(data,dims = length(dim(data)), idx = idxs), density_ratio = density_ratio[idxs], method_IS = "unique",cell_numbers = cell_numbers_boot, cells = cells)$centroids
           list_centro_boot[[boot]] = compute_centroids_and_proba(data = asub(data,dims = length(dim(data)), idx = idxs), density_ratio = density_ratio[idxs], method_IS = "unique",cell_numbers = cell_numbers_boot, cells = cells)$centroids
         }
         for(cell in 1:length(cells)){

diff --git a/R/std_proba.R b/R/std_proba.R
@@ -34,7 +34,8 @@
 #' list_std_proba = std_proba(data = data, prototypes_list = prototypes_list,
 #'  density_ratio = density_ratio, distance_func = distance_func,
 #'  cells = 1:length(prototypes_list[[1]]), nv = 50)
-std_proba = function(data = NULL, prototypes_list, density_ratio = rep(1,dim(data)[length(data)]), distance_func = function(A1,A2){return(sqrt(sum((A1-A2)^2)))},cells, cell_numbers = NULL, nv, outputs_function = NULL, inputs = NULL, batch_size = NULL, return_cell_numbers = FALSE, bootstrap = NULL){
+std_proba = function(data = NULL, prototypes_list, density_ratio = rep(1,dim(data)[length(dim(data))]), distance_func = function(A1,A2){return(sqrt(sum((A1-A2)^2)))},cells, cell_numbers = NULL, nv = NULL, outputs_function = NULL, inputs = NULL, batch_size = NULL, return_cell_numbers = FALSE, bootstrap = NULL){
+  if(is.null(nv)){nv = length(density_ratio)}
   std_list = list()
   if(is.null(data) & is.null(cell_numbers)){
     nb_batch = nrow(inputs)%/%batch_size

diff --git a/man/find_prototypes.Rd b/man/find_prototypes.Rd
diff --git a/man/fit_metamodel.Rd b/man/fit_metamodel.Rd
diff --git a/man/std_centroid.Rd b/man/std_centroid.Rd
diff --git a/man/std_proba.Rd b/man/std_proba.Rd