diff --git a/src/esn/deepesn.jl b/src/esn/deepesn.jl
index 7ce458f..4b66975 100644
--- a/src/esn/deepesn.jl
+++ b/src/esn/deepesn.jl
@@ -44,27 +44,22 @@ temporal features. Default is an RNN model.
   - `nla_type`: The type of non-linear activation used in the reservoir.
     Default is `NLADefault()`.
-  - `states_type`: Defines the type of states used in the ESN (e.g., standard states).
-    Default is `StandardStates()`.
-  - `washout`: The number of initial timesteps to be discarded in the ESN's training phase.
-    Default is 0.
-  - `rng`: Random number generator used for initializing weights. Default is the package's
-    default random number generator.
+  - `states_type`: Defines the type of states used in the ESN
+    (e.g., standard states). Default is `StandardStates()`.
+  - `washout`: The number of initial timesteps to be discarded
+    in the ESN's training phase. Default is 0.
+  - `rng`: Random number generator used for initializing weights.
+    Default is `Utils.default_rng()`.
   - `matrix_type`: The type of matrix used for storing the training data.
     Default is inferred from `train_data`.
 
 # Example
 
 ```julia
-# Prepare your training data
-train_data = [your_training_data_here]
+train_data = rand(Float32, 3, 100)
 
 # Create a DeepESN with specific parameters
-deepESN = DeepESN(train_data, 10, 100; depth=3, washout=100)
-
-# Proceed with training and prediction (pseudocode)
-train(deepESN, target_data)
-prediction = predict(deepESN, new_data)
+deepESN = DeepESN(train_data, 3, 100; depth=3, washout=10)
 ```
"""
function DeepESN(train_data,
diff --git a/src/esn/esn.jl b/src/esn/esn.jl
index b585db2..7c3010d 100644
--- a/src/esn/esn.jl
+++ b/src/esn/esn.jl
@@ -15,33 +15,42 @@ end
 """
-    ESN(train_data; kwargs...) -> ESN
+    ESN(train_data, in_size, res_size; kwargs...) -> ESN
 
-Creates an Echo State Network (ESN) using specified parameters and training data, suitable for various machine learning tasks.
+Creates an Echo State Network (ESN).
 
-# Parameters
+# Arguments
 
-  - `train_data`: Matrix of training data (columns as time steps, rows as features).
+  - `train_data`: Matrix of training data `num_features x time_steps`.
+  - `in_size`: Size of the input layer (number of input features).
+  - `res_size`: Size of the reservoir.
   - `variation`: Variation of ESN (default: `Default()`).
   - `input_layer`: Input layer of ESN.
   - `reservoir`: Reservoir of the ESN.
   - `bias`: Bias vector for each time step.
+  - `rng`: Random number generator used for initializing weights.
+    Default is `Utils.default_rng()`.
   - `reservoir_driver`: Mechanism for evolving reservoir states (default: `RNN()`).
   - `nla_type`: Non-linear activation type (default: `NLADefault()`).
   - `states_type`: Format for storing states (default: `StandardStates()`).
   - `washout`: Initial time steps to discard (default: `0`).
   - `matrix_type`: Type of matrices used internally (default: type of `train_data`).
-
-# Returns
-
-  - An initialized ESN instance with specified parameters.
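+
+Once constructed and trained, the `ESN` itself is callable to produce
+forecasts. A hypothetical flow (assuming an `output_layer` returned by
+`train`, documented below):
+
+```julia
+prediction = esn(Generative(100), output_layer)
+```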
-
 # Examples
 
-```julia
-using ReservoirComputing
-
-train_data = rand(10, 100) # 10 features, 100 time steps
+```jldoctest
+julia> train_data = rand(Float32, 10, 100)  # 10 features, 100 time steps
+10×100 Matrix{Float32}:
+ 0.567676   0.154756  0.584611  0.294015   …  0.573946    0.894333    0.429133
+ 0.327073   0.729521  0.804667  0.263944      0.559342    0.020167    0.897862
+ 0.453606   0.800058  0.568311  0.749441      0.0713146   0.464795    0.532854
+ 0.0173253  0.536959  0.722116  0.910328      0.00224048  0.00202501  0.631075
+ 0.366744   0.119761  0.100593  0.125122      0.700562    0.675474    0.102947
+ 0.539737   0.768351  0.54681   0.648672   …  0.256738    0.223784    0.94327
+ 0.558099   0.42676   0.1948    0.735625      0.0989234   0.119342    0.624182
+ 0.0603135  0.929999  0.263439  0.0372732     0.066125    0.332769    0.25562
+ 0.4463     0.334423  0.444679  0.311695      0.0494497   0.27171     0.214925
+ 0.987182   0.898593  0.295241  0.233098      0.789699    0.453692    0.759205
+
+julia> esn = ESN(train_data, 10, 300; washout=10)
+ESN(10 => 300)
 
-esn = ESN(train_data; reservoir=RandSparseReservoir(200), washout=10)
 ```
 """
 function ESN(train_data,
@@ -86,6 +95,9 @@ function (esn::AbstractEchoStateNetwork)(prediction::AbstractPrediction,
                                          kwargs...)
 end
 
+Base.show(io::IO, esn::ESN) =
+    print(io, "ESN(", size(esn.train_data, 1), " => ", size(esn.reservoir_matrix, 1), ")")
+
 #training dispatch on esn
 """
     train(esn::AbstractEchoStateNetwork, target_data, training_method = StandardRidge(0.0))
 
@@ -98,27 +110,29 @@ Trains an Echo State Network (ESN) using the provided target data and a specifie
   - `target_data`: Supervised training data for the ESN.
   - `training_method`: The method for training the ESN (default: `StandardRidge(0.0)`).
 
-# Returns
-
-  - The trained ESN model. Its type and structure depend on `training_method` and the ESN's implementation.
-
-# Returns
-The trained ESN model. The exact type and structure of the return value depends on the
-`training_method` and the specific ESN implementation.
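+With the default `StandardRidge` the readout is a linear ridge regression
+on the collected reservoir states. A sketch of the underlying computation,
+using illustrative names (`states` for the washed-out states, `beta` for
+the regularization strength), not the package's internals:
+
+```julia
+using LinearAlgebra
+
+# W_out = Y * S' * inv(S * S' + beta * I), written with right division
+output_weights = (target_data * states') / (states * states' + beta * I)
+```
+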
+# Example
 
-```julia
-using ReservoirComputing
+```jldoctest
+julia> train_data = rand(Float32, 10, 100)  # 10 features, 100 time steps
+10×100 Matrix{Float32}:
+ 0.11437    0.425367  0.585867   0.34078   …  0.0531493  0.761425  0.883164
+ 0.301373   0.497806  0.279603   0.802417     0.49873    0.270156  0.333333
+ 0.135224   0.660179  0.394233   0.512753     0.901221   0.784377  0.687691
+ 0.510203   0.877234  0.614245   0.978405     0.332775   0.768826  0.527077
+ 0.955027   0.398322  0.312156   0.981938     0.473357   0.156704  0.476101
+ 0.353024   0.997632  0.164328   0.470783  …  0.745613   0.85797   0.465201
+ 0.966044   0.194299  0.599167   0.040475     0.0996013  0.325959  0.770103
+ 0.292068   0.495138  0.481299   0.214566     0.819573   0.155951  0.227168
+ 0.133498   0.451058  0.0761995  0.90421      0.994212   0.332164  0.545112
+ 0.214467   0.791524  0.124105   0.951805     0.947166   0.954244  0.889733
+
-# Initialize an ESN instance and target data
-esn = ESN(train_data; reservoir=RandSparseReservoir(200), washout=10)
-target_data = rand(size(train_data, 2))
+julia> esn = ESN(train_data, 10, 300; washout=10)
+ESN(10 => 300)
 
-# Train the ESN using the default training method
-trained_esn = train(esn, target_data)
+julia> output_layer = train(esn, rand(Float32, 3, 90))
+OutputLayer successfully trained with output size: 3
 
-# Train the ESN using a custom training method
-trained_esn = train(esn, target_data; training_method=StandardRidge(1.0))
 ```
 """
 function train(esn::AbstractEchoStateNetwork,
diff --git a/src/esn/esn_reservoir_drivers.jl b/src/esn/esn_reservoir_drivers.jl
index 65f6e62..7601a09 100644
--- a/src/esn/esn_reservoir_drivers.jl
+++ b/src/esn/esn_reservoir_drivers.jl
@@ -1,31 +1,26 @@
 abstract type AbstractReservoirDriver end
 
 """
-    create_states(
-        reservoir_driver::AbstractReservoirDriver,
-        train_data,
-        washout,
-        reservoir_matrix,
-        input_matrix,
-        bias_vector
-    )
+    create_states(reservoir_driver::AbstractReservoirDriver, train_data, washout,
+        reservoir_matrix, input_matrix, bias_vector)
 
-Create and return the trained Echo State Network (ESN) states according to the specified reservoir driver.
+Create and return the trained Echo State Network (ESN) states according to the
+specified reservoir driver.
 
 # Arguments
 
-  - `reservoir_driver::AbstractReservoirDriver`: The reservoir driver that determines how the ESN states evolve over time.
+  - `reservoir_driver`: The reservoir driver that determines how the ESN states evolve
+    over time.
   - `train_data`: The training data used to train the ESN.
-  - `washout::Int`: The number of initial time steps to discard during training to allow the reservoir dynamics to wash out the initial conditions.
-  - `reservoir_matrix`: The reservoir matrix representing the dynamic, recurrent part of the ESN.
-  - `input_matrix`: The input matrix that defines the connections between input features and reservoir nodes.
-  - `bias_vector`: The bias vector to be added at each time step during the reservoir update.
-
-# Returns
-
-  - A matrix of trained ESN states, where each column represents the state at a specific time step.
+  - `washout`: The number of initial time steps to discard during training to allow the
+    reservoir dynamics to wash out the initial conditions.
+  - `reservoir_matrix`: The reservoir matrix representing the dynamic, recurrent part of
+    the ESN.
+  - `input_matrix`: The input matrix that defines the connections between input features
+    and reservoir nodes.
+  - `bias_vector`: The bias vector to be added at each time step during the reservoir
+    update.
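+
+With the default `RNN` driver, each returned state column follows the
+leaky-integrator update sketched below (`W`, `W_in`, and `b` stand for
+`reservoir_matrix`, `input_matrix`, and `bias_vector`; `u` is the current
+input column and `x` the previous state):
+
+```julia
+x_new = (1 - leaky_coefficient) .* x .+
+        leaky_coefficient .* activation_function.(W * x .+ W_in * u .+ b)
+```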
-
-This function is responsible for creating and returning the states of the ESN during training based on the provided training data and parameters.
 """
 function create_states(reservoir_driver::AbstractReservoirDriver,
                        train_data,
@@ -99,7 +94,8 @@ end
     RNN(activation_function, leaky_coefficient)
-    RNN(;activation_function=tanh, leaky_coefficient=1.0)
+    RNN(;activation_function=NNlib.fast_act(tanh), leaky_coefficient=1.0)
 
-Returns a Recurrent Neural Network (RNN) initializer for the Echo State Network (ESN).
+Returns a Recurrent Neural Network (RNN) initializer for
+echo state networks (`ESN`).
 
 # Arguments
 
@@ -108,11 +104,12 @@ Returns a Recurrent Neural Network (RNN) initializer for the Echo State Network
 # Keyword Arguments
 
-  - `activation_function`: The activation function used in the RNN. Defaults to `tanh`.
-  - `leaky_coefficient`: The leaky coefficient used in the RNN. Defaults to 1.0.
+  - `activation_function`: The activation function used in the RNN.
+    Defaults to `tanh_fast`.
+  - `leaky_coefficient`: The leaky coefficient used in the RNN.
+    Defaults to 1.0.
 
-This function creates an RNN object with the specified activation function and leaky coefficient,
-which can be used as a reservoir driver in the ESN.
 """
 function RNN(; activation_function=NNlib.fast_act(tanh), leaky_coefficient=1.0)
     RNN(activation_function, leaky_coefficient)
 end
@@ -163,25 +160,33 @@ end
     MRNN(;activation_function=[tanh, sigmoid],
         leaky_coefficient=1.0,
         scaling_factor=fill(leaky_coefficient, length(activation_function)))
 
-Returns a Multiple RNN (MRNN) initializer for the Echo State Network (ESN), introduced in [^lun].
+Returns a Multiple RNN (MRNN) initializer for the Echo State Network (ESN),
+introduced in [^Lun2015].
 
 # Arguments
 
-  - `activation_function`: A vector of activation functions used in the MRNN.
+  - `activation_function`: A vector of activation functions used
+    in the MRNN.
   - `leaky_coefficient`: The leaky coefficient used in the MRNN.
-  - `scaling_factor`: A vector of scaling factors for combining activation functions.
+  - `scaling_factor`: A vector of scaling factors for combining activation
+    functions.
 
 # Keyword Arguments
 
-  - `activation_function`: A vector of activation functions used in the MRNN. Defaults to `[tanh, sigmoid]`.
-  - `leaky_coefficient`: The leaky coefficient used in the MRNN. Defaults to 1.0.
-  - `scaling_factor`: A vector of scaling factors for combining activation functions. Defaults to an array of the same size as `activation_function` with all elements set to `leaky_coefficient`.
+  - `activation_function`: A vector of activation functions used in the MRNN.
+    Defaults to `[tanh, sigmoid]`.
+  - `leaky_coefficient`: The leaky coefficient used in the MRNN.
+    Defaults to 1.0.
+  - `scaling_factor`: A vector of scaling factors for combining activation functions.
+    Defaults to an array of the same size as `activation_function` with all
+    elements set to `leaky_coefficient`.
 
-This function creates an MRNN object with the specified activation functions, leaky coefficient, and scaling factors, which can be used as a reservoir driver in the ESN.
+This function creates an MRNN object with the specified activation functions,
+leaky coefficient, and scaling factors, which can be used as a reservoir driver
+in the ESN, as sketched in the example below.
 
-# Reference:
-[^lun]: Lun, Shu-Xian, et al.
+[^Lun2015]: Lun, Shu-Xian, et al.
     "_A novel model of leaky integrator echo state network for time-series prediction._"
     Neurocomputing 159 (2015): 58-66.
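+
+A construction sketch (`sigmoid` is assumed to come from `NNlib`; the
+keywords follow the signature above, and the driver is then passed to
+`ESN` via `reservoir_driver`):
+
+```julia
+driver = MRNN(; activation_function=[tanh, NNlib.sigmoid],
+    leaky_coefficient=0.9, scaling_factor=[0.45, 0.45])
+esn = ESN(train_data, 10, 300; reservoir_driver=driver)
+```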
""" @@ -208,16 +213,6 @@ function next_state!(out, mrnn::MRNN, x, y, W, W_in, b, tmp_array) return out end -#= -function next_state!(out, mrnn::MRNN, x, y, W, W_in, b, tmp_array) - rnn_next_state = (1-mrnn.leaky_coefficient).*x - for i=1:length(mrnn.scaling_factor) - rnn_next_state += mrnn.scaling_factor[i]*mrnn.activation_function[i].((W*x).+(W_in*y).+b) - end - rnn_next_state -end -=# - function allocate_tmp(::MRNN, tmp_type, res_size) return [Adapt.adapt(tmp_type, zeros(res_size, 1)) for i in 1:2] end @@ -236,19 +231,16 @@ end """ FullyGated() -Returns a Fully Gated Recurrent Unit (FullyGated) initializer for the Echo State Network (ESN). +Returns a Fully Gated Recurrent Unit (FullyGated) initializer +for the Echo State Network (ESN). -This function creates a FullyGated object, which can be used as a reservoir driver in the ESN. -The FullyGated variant is described in the literature reference [^cho]. +Returns the standard gated recurrent unit [^Cho2014] as a driver for the +echo state network (`ESN`). -# Returns - - `FullyGated`: A FullyGated reservoir driver. - -# Reference - -[^cho]: Cho, Kyunghyun, et al. - "_Learning phrase representations using RNN encoder-decoder for statistical machine translation._" +[^Cho2014]: Cho, Kyunghyun, et al. + "_Learning phrase representations using RNN encoder-decoder + for statistical machine translation._" arXiv preprint arXiv:1406.1078 (2014). """ struct FullyGated <: AbstractGRUVariant end @@ -256,9 +248,10 @@ struct FullyGated <: AbstractGRUVariant end """ Minimal() -Returns a minimal GRU ESN initializer as described in [^Zhou]. +Returns a minimal GRU ESN initializer as described in [^Zhou2016]. -[^Zhou]: Zhou, Guo-Bing, et al. "_Minimal gated unit for recurrent neural networks._" +[^Zhou2016]: Zhou, Guo-Bing, et al. "_Minimal gated unit for recurrent + neural networks._" International Journal of Automation and Computing 13.3 (2016): 226-234. """ struct Minimal <: AbstractGRUVariant end @@ -271,23 +264,25 @@ struct Minimal <: AbstractGRUVariant end bias = fill(DenseLayer(), 2), variant = FullyGated()) -Returns a Gated Recurrent Unit (GRU) reservoir driver for Echo State Networks (ESNs). This driver is based on the GRU architecture [^Cho], which is designed to capture temporal dependencies in data and is commonly used in various machine learning applications. +Returns a Gated Recurrent Unit (GRU) reservoir driver for Echo State Network (`ESN`). +This driver is based on the GRU architecture [^Cho2014]. # Arguments - - `activation_function`: An array of activation functions for the GRU layers. By default, it uses sigmoid activation functions for the update gate, reset gate, and tanh for the hidden state. - - `inner_layer`: An array of inner layers used in the GRU architecture. By default, it uses two dense layers. - - `reservoir`: An array of reservoir layers. By default, it uses two random sparse reservoirs. - - `bias`: An array of bias layers for the GRU. By default, it uses two dense layers. - - `variant`: The GRU variant to use. By default, it uses the "FullyGated" variant. - -# Returns - -A GRUParams object containing the parameters needed for the GRU-based reservoir driver. + - `activation_function`: An array of activation functions for the GRU layers. + By default, it uses sigmoid activation functions for the update gate, reset gate, + and tanh for the hidden state. + - `inner_layer`: An array of inner layers used in the GRU architecture. + By default, it uses two dense layers. + - `reservoir`: An array of reservoir layers. 
+    By default, it uses two random sparse reservoirs.
+  - `bias`: An array of bias layers for the GRU.
+    By default, it uses two dense layers.
+  - `variant`: The GRU variant to use.
+    By default, it uses the `FullyGated()` variant.
 
-# References
-[^Cho]: Cho, Kyunghyun, et al.
+[^Cho2014]: Cho, Kyunghyun, et al.
     "_Learning phrase representations using RNN encoder-decoder for statistical machine translation._"
     arXiv preprint arXiv:1406.1078 (2014).
 """
diff --git a/src/esn/hybridesn.jl b/src/esn/hybridesn.jl
index 13c8b64..5453c81 100644
--- a/src/esn/hybridesn.jl
+++ b/src/esn/hybridesn.jl
@@ -24,8 +24,8 @@ end
 """
     KnowledgeModel(prior_model, u0, tspan, datasize)
 
-Constructs a `Hybrid` variation of Echo State Networks (ESNs) integrating a knowledge-based model
-(`prior_model`) with ESNs for advanced training and prediction in chaotic systems.
+Constructs a `Hybrid` variation of Echo State Networks (ESNs) [^Pathak2018],
+integrating a knowledge-based model (`prior_model`) with the data-driven ESN.
 
 # Parameters
 
@@ -34,15 +34,7 @@ Constructs a `Hybrid` variation of Echo State Networks (ESNs) integrating a know
   - `tspan`: Time span as a tuple, indicating the duration for model operation.
   - `datasize`: The size of the data to be processed.
 
-# Returns
-
-  - A `Hybrid` struct instance representing the combined ESN and knowledge-based model.
-
-This method is effective for chaotic processes as highlighted in [^Pathak].
-
-Reference:
-
-[^Pathak]: Jaideep Pathak et al.
+[^Pathak2018]: Jaideep Pathak et al.
     "Hybrid Forecasting of Chaotic Processes:
     Using Machine Learning in Conjunction with a Knowledge-Based Model" (2018).
 """
@@ -59,11 +51,7 @@ end
     HybridESN(model, train_data, in_size, res_size; kwargs...)
 
 Construct a Hybrid Echo State Network (ESN) model that integrates
-traditional Echo State Networks with a predefined knowledge model for
-enhanced performance on chaotic systems or complex datasets. This
-constructor allows for the creation of a customized ESN architecture by
-specifying the reservoir size, input size, and various other parameters that
-influence the network's behavior and learning capacity.
+traditional Echo State Networks with a predefined knowledge model [^Pathak2018].
 
 # Parameters
 
@@ -79,41 +67,29 @@ influence the network's behavior and learning capacity.
 # Optional Keyword Arguments
 
-  - `input_layer`: A function to initialize the input matrix. Default is `scaled_rand`.
-  - `reservoir`: A function to initialize the reservoir matrix. Default is `rand_sparse`.
-  - `bias`: A function to initialize the bias vector. Default is `zeros64`.
-  - `reservoir_driver`: The driving system for the reservoir. Default is an RNN model.
+  - `input_layer`: A function to initialize the input matrix.
+    Default is `scaled_rand`.
+  - `reservoir`: A function to initialize the reservoir matrix.
+    Default is `rand_sparse`.
+  - `bias`: A function to initialize the bias vector.
+    Default is `zeros32`.
+  - `reservoir_driver`: The driving system for the reservoir.
+    Default is an RNN model.
   - `nla_type`: The type of non-linear activation used in the reservoir.
     Default is `NLADefault()`.
-  - `states_type`: Defines the type of states used in the ESN (e.g., standard states).
-    Default is `StandardStates()`.
-  - `washout`: The number of initial timesteps to be discarded in the ESN's training phase.
-    Default is 0.
-  - `rng`: Random number generator used for initializing weights. Default is the package's
-    default random number generator.
-  - `T`: The data type for the matrices (e.g., `Float32`). Influences computational
-    efficiency and precision.
-  - `matrix_type`: The type of matrix used for storing the training data. Default is
-    inferred from `train_data`.
-
-# Returns
-
-  - A `HybridESN` instance configured according to the provided parameters and
-    suitable for further training and prediction tasks.
-
-# Example
-
-```julia
-# Define a KnowledgeModel
-km = KnowledgeModel(prior_model_function, u0, (0, 100), 1000)
-
-# Create a HybridESN
-hesn = HybridESN(km, train_data, 10, 100; washout=100)
-
-# Train and predict
-train(hesn, target_data)
-prediction = hesn(prediction_object, output_layer)
-```
+  - `states_type`: Defines the type of states used in the
+    ESN. Default is `StandardStates()`.
+  - `washout`: The number of initial timesteps to be
+    discarded in the ESN's training phase. Default is 0.
+  - `rng`: Random number generator used for initializing weights.
+    Default is `Utils.default_rng()`.
+  - `T`: The data type for the matrices (e.g., `Float32`).
+  - `matrix_type`: The type of matrix used for storing the training data.
+    Default is inferred from `train_data`.
+
+[^Pathak2018]: Jaideep Pathak et al.
+    "Hybrid Forecasting of Chaotic Processes:
+    Using Machine Learning in Conjunction with a Knowledge-Based Model" (2018).
 """
 function HybridESN(model,
                    train_data,
diff --git a/src/predict.jl b/src/predict.jl
index 1cfca9a..41a226c 100644
--- a/src/predict.jl
+++ b/src/predict.jl
@@ -9,6 +9,9 @@ struct OutputLayer{T, I, S, L} <: AbstractOutputLayer
     last_value::L
 end
 
+Base.show(io::IO, ol::OutputLayer) =
+    print(io, "OutputLayer successfully trained with output size: ", ol.out_size)
+
 #prediction types
 """
     Generative(prediction_len)
@@ -19,14 +22,12 @@ subsequent prediction steps.
 
 # Parameters
 
-  - `prediction_len::Int`: The number of future steps to predict.
+  - `prediction_len`: The number of future steps to predict.
 
 # Description
 
 The `Generative` prediction method allows a model to perform multi-step
 forecasting by using its own previous predictions as inputs for future predictions.
-This approach is especially useful in time series analysis, where each prediction
-depends on the preceding data points.
 
 At each step, the model takes the current input, generates a prediction,
 and then incorporates that prediction into the input for the next step.
@@ -51,21 +52,13 @@ of input features (`prediction_data`).
 
 # Parameters
 
-  - `prediction_data`: The input data used for prediction, typically structured as a matrix
-    where each column represents a sample, and each row represents a feature.
+  - `prediction_data`: The input data used for prediction, shaped
+    `num_features x num_samples`.
 
 # Description
 
-The `Predictive` prediction method is a standard approach
-in supervised machine learning tasks. It uses the provided input data
+The `Predictive` prediction method uses the provided input data
 (`prediction_data`) to produce corresponding labels or outputs based
-on the learned relationships in the model. Unlike generative prediction,
-this method does not recursively feed predictions into the model;
-instead, it operates on fixed input data to produce a single batch of predictions.
-
-This method is suitable for tasks like classification,
-regression, or other use cases where the input features
-and the number of steps are predefined.
+on the learned relationships in the model.
 """
 function Predictive(prediction_data)
     prediction_len = size(prediction_data, 2)