From dbc0fffb61e0c11d1bd5c120f51988d526200f49 Mon Sep 17 00:00:00 2001 From: Montasser Ghachem <37964692+monty-se@users.noreply.github.com> Date: Mon, 21 Oct 2024 23:01:01 +0200 Subject: [PATCH] feat: Introduce ivpin() function and update related documentation and code - Added new ivpin() function, an improved version of the Volume-Synchronized Probability of Informed Trading (VPIN), based on Lin and Ke (2017). - Updated documentation files including README.md, NEWS.md, cran-comments.md, and vignettes/PINstimation.rmd to reflect the introduction of ivpin(). - Added new references to inst/REFERENCES.bib for the Lin and Ke (2017) paper. - Modified relevant help files in man/ (estimate.vpin-class.Rd and PINstimation-package.Rd) to include ivpin(). - Updated the NAMESPACE to export ivpin(). - Adjusted R/model_vpin.R and R/model_factorizations.R to incorporate the logic for ivpin(). - Added validation checks in R/args_validation.R and updated messages in R/utilities_messages.R. - Refactored output classes to include ivpin() support in R/output_classes.R. 
--- .gitignore | 3 +- DESCRIPTION | 2 +- NAMESPACE | 4 + NEWS.md | 9 + R/PINstimation.R | 2 + R/args_validation.R | 2 +- R/model_factorizations.R | 58 +++ R/model_vpin.R | 896 +++++++++++++++++++++++++++++++++--- R/output_classes.R | 16 +- R/utilities_messages.R | 40 +- README.md | 9 + authors.md | 94 ---- cran-comments.md | 52 ++- inst/REFERENCES.bib | 11 + man/PINstimation-package.Rd | 12 + man/estimate.vpin-class.Rd | 8 + man/vpin.Rd | 110 ----- man/vpin_measures.Rd | 186 ++++++++ vignettes/PINstimation.rmd | 1 + 19 files changed, 1219 insertions(+), 296 deletions(-) delete mode 100644 authors.md delete mode 100644 man/vpin.Rd create mode 100644 man/vpin_measures.Rd diff --git a/.gitignore b/.gitignore index ab2ace6..adb7517 100644 --- a/.gitignore +++ b/.gitignore @@ -10,5 +10,6 @@ docs/ rjarticle/ _pkgdown.yml pkgdown/ -man/*.Rd vignettes/*.html +/doc/ +/Meta/ diff --git a/DESCRIPTION b/DESCRIPTION index b66ddf0..3792a2d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -18,7 +18,7 @@ LazyDataCompression: xz RoxygenNote: 7.3.1 Roxygen: list(markdown = TRUE) VignetteBuilder: knitr -Imports: Rdpack, knitr, methods, skellam, nloptr, furrr, future, dplyr, rmarkdown, coda +Imports: Rdpack, knitr, methods, skellam, nloptr, furrr, future, dplyr, rmarkdown, coda, magrittr RdMacros: Rdpack Depends: R (>= 3.5.0) Suggests: fansi, htmltools diff --git a/NAMESPACE b/NAMESPACE index 9e68a3e..acb2117 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -21,6 +21,7 @@ export(initials_mpin) export(initials_pin_ea) export(initials_pin_gwj) export(initials_pin_yz) +export(ivpin) export(mpin_ecm) export(mpin_ml) export(pin) @@ -39,11 +40,13 @@ importFrom(coda,geweke.diag) importFrom(coda,mcmc) importFrom(dplyr,"%>%") importFrom(dplyr,group_by) +importFrom(dplyr,mutate) importFrom(dplyr,summarize) importFrom(furrr,future_map) importFrom(future,multisession) importFrom(future,plan) importFrom(future,sequential) +importFrom(magrittr,"%>%") importFrom(methods,is) importFrom(methods,new) 
importFrom(methods,show) @@ -55,6 +58,7 @@ importFrom(stats,cutree) importFrom(stats,dist) importFrom(stats,dpois) importFrom(stats,hclust) +importFrom(stats,lag) importFrom(stats,na.omit) importFrom(stats,optim) importFrom(stats,pnorm) diff --git a/NEWS.md b/NEWS.md index 747f58b..a44c46d 100644 --- a/NEWS.md +++ b/NEWS.md @@ -4,6 +4,15 @@ - **`adjpin()`**: The function now includes the time spent on generating initial parameter sets in the total time displayed in the output. This enhancement provides a more comprehensive view of the time taken for the entire process. + +- **`ivpin()`**: This function implements an improved version of the +Volume-Synchronized Probability of Informed Trading (VPIN) based on the work of +Lin and Ke (2017). By employing a maximum likelihood estimation, `ivpin()` +enhances the stability of VPIN estimates, especially in cases with small volume +buckets or infrequent informed trades. The function captures the information +embedded in volume time, generating more consistent and reliable results. It is +designed to improve the predictability of flow toxicity in trading environments. + ## Updates - **`initials_adjpin_rnd()`**: Updated the implementation for generating random diff --git a/R/PINstimation.R b/R/PINstimation.R index 8cd798b..543c53c 100644 --- a/R/PINstimation.R +++ b/R/PINstimation.R @@ -173,6 +173,8 @@ #' \insertCite{Yan2012;textual}{PINstimation}. #' \item \link{vpin} estimates the volume-synchronized probability of informed #' trading (`VPIN`). +#' \item \link{ivpin} estimates the improved volume-synchronized probability +#' of informed trading (`IVPIN`). 
#' } #' #' @section Datasets: diff --git a/R/args_validation.R b/R/args_validation.R index a84be29..bb06802 100644 --- a/R/args_validation.R +++ b/R/args_validation.R @@ -638,7 +638,7 @@ if (vn %in% c( "fact", "verbose", "is_parallel", "correction", "ea_correction", - "fullreport")) gn <- "xlogical" + "fullreport", "improved")) gn <- "xlogical" if (vn %in% c("algorithm", "method", "detectlayers", "factorization", "frequency")) { gn <- "xcharacter" diff --git a/R/model_factorizations.R b/R/model_factorizations.R index a720799..64465d3 100644 --- a/R/model_factorizations.R +++ b/R/model_factorizations.R @@ -802,6 +802,64 @@ factorizations <- list( return(-lkhd) } + }, + + ivpin = function(data) { + # returns the factorization of the likelihood function associated with the ivpin + # model evaluated at the dataset 'data' to be used with optimization + # functions such as optim() or neldermead() + # + # Args: + # data : the dataset of Vb, Vs and t (See paper of Ke and Lin 2017) + # + # Returns: + # a function with argument 'params' + + function(params) { + + # If 'params' is not valid, return +Inf + # -------------------------------------------------------------- + if (!missing(params) && length(params) != 5) return(+Inf) + + # Prepare 'data' and initialize variables + # -------------------------------------------------------------- + + colnames(data) <- c("vb", "vs", "t") + a <- d <- mu <- eb <- es <- NULL + + # Get the names of the variable from the function .xmpin(). + # Without arguments, it returns c("a", "d", "mu", "eb", "es") + variables <- .xmpin$varnames() + for (i in 1:5) assign(variables[i], params[i]) + + # Start by constructing variables e1, e2, e3. Each of them is + # constructed daily and is stored in a column with the same name. + # The variable emax is constructed by taking the maximum among + # e1, e2 and e3; and is stored in a column with the same name.
+ # ------------------------------------------------------------- + # e1 <- rep(log(alpha * delta), nrow(data)) + data$vb * log(eps.b) + + # data$vs * log(eps.s + mu) - (eps.b + eps.s + mu) * data$t + # e2 <- rep(log(alpha * (1 - delta)), nrow(data)) + data$vb * log(eps.b + mu) + + # data$vs * log(eps.s) - (eps.b + eps.s + mu) * data$t + # e3 <- rep(log(1 - alpha), nrow(data)) + data$vb * log(eps.b) + + # data$vs * log(eps.s) - (eps.b + eps.s) * data$t + # emax <- pmax(e1, e2, e3) + + e1 <- log(a * d) + data$vb * log(eb) + + data$vs * log(es + mu) - (eb + es + mu) * data$t + e2 <- log(a * (1 - d)) + data$vb * log(eb + mu) + + data$vs * log(es) - (eb + es + mu) * data$t + e3 <- log(1 - a) + data$vb * log(eb) + + data$vs * log(es) - (eb + es) * data$t + emax <- pmax(e1, e2, e3, na.rm = TRUE) + + # Compute and return the negative of the log-likelihood function + # -------------------------------------------------------------- + lkhd <- - sum(log(exp(e1 - emax) + exp(e2 - emax) + exp(e3 - emax)) + + emax, na.rm = TRUE) + + return(lkhd) + } } ) diff --git a/R/model_vpin.R b/R/model_vpin.R index 8510b4c..a52818a 100644 --- a/R/model_vpin.R +++ b/R/model_vpin.R @@ -7,11 +7,15 @@ ## Implement the estimation of the Volume-synchronized PIN ## measure (VPIN) of Easley et al. (2011, 2012) ## +## Implement the estimation of the improved Volume-synchronized PIN +## measure (IVPIN) of Ke and Lin (2017) +## ## Author: -## Montasser Ghachem +## Montasser Ghachem (vpin) +## Alexandre Borentain Cristea and Montasser Ghachem (ivpin) ## ## Last updated: -## 2022-10-18 +## 2024-08-05 ## ## License: ## GPL 3 @@ -25,9 +29,12 @@ ## ++++++++++++++++++ ## ## vpin(): -## Estimates the Volume-Synchronized Probability of Informed -## Trading as developed in Easley (2011, 2012). +## Estimates the Volume-synchronized probability of Informed +## trading as developed in Easley (2011, 2012).
## +## ivpin(): +## Estimates the Improved Volume-synchronized probability of Informed +## trading as developed in Ke and Lin (2017). ## ++++++++++++++++++ ## ## @@ -42,14 +49,14 @@ ## +++++++++++++++++++++++++ -#' @title Estimation of Volume-Synchronized PIN model +#' @title Estimation of Volume-Synchronized PIN model (vpin) and the improved +#' volume-synchronized PIN model (ivpin) #' #' @description Estimates the Volume-Synchronized Probability of Informed #' Trading as developed in \insertCite{Easley2011;textual}{PINstimation} -#' and \insertCite{Easley2012;textual}{PINstimation}. -#' -#' @usage vpin(data, timebarsize = 60, buckets = 50, samplength = 50, -#' tradinghours = 24, verbose = TRUE) +#' and \insertCite{Easley2012;textual}{PINstimation}. \cr +#' Estimates the improved Volume-Synchronized Probability of Informed +#' Trading as developed in \insertCite{ke2017improved;textual}{PINstimation}. #' #' @param data A dataframe with 3 variables: #' \code{{timestamp, price, volume}}. @@ -62,84 +69,147 @@ #' The default value is \code{50}. #' @param tradinghours An integer referring to the length of daily #' trading sessions in hours. The default value is \code{24}. -#' -#' @param verbose A binary variable that determines whether detailed -#' information about the steps of the estimation of the VPIN model is displayed. -#' No output is produced when \code{verbose} is set to \code{FALSE}. The default -#' value is \code{TRUE}. +#' @param grid_size An integer between `1`, and `20`; +#' representing the size of the grid used in the estimation of IVPIN. The +#' default value is `5`. See more in details. +#' @param verbose A logical variable that determines whether detailed +#' information about the steps of the estimation of the VPIN (IVPIN) model is +#' displayed. No output is produced when \code{verbose} is set to \code{FALSE}. +#' The default value is \code{TRUE}. #' #' @details The dataframe data should contain at least three variables. 
Only the #' first three variables will be considered and in the following order #' \code{{timestamp, price, volume}}. #' -#' The property \code{@bucketdata} is created as in -#' \insertCite{abad2012;textual}{PINstimation}. -#' #' The argument `timebarsize` is in seconds enabling the user to implement #' shorter than `1` minute intervals. The default value is set to `1` minute #' (`60` seconds) following Easley et al. (2011, 2012). #' -#' The parameter `tradinghours` is used to eventually correct the duration per -#' bucket. The duration of a given bucket is the difference between the +#' The argument `tradinghours` is used to correct the duration per +#' bucket if the market trading session does not cover a full day \code{(24 hours)}. +#' The duration of a given bucket is the difference between the #' timestamp of the last trade `endtime` and the timestamp of the first trade -#' `stime` in the bucket. If the first trade and the last trade in a -#' bucket occur in two different days, and the market trading session does not -#' cover a full day \code{(24 hours)}; then the duration of the bucket will be -#' inflated. Assume that the daily trading session is 8 hours -#' `(tradinghours=8)`, the start time of a bucket is \code{2018-10-12 17:06:40} -#' and its end time is \code{2018-10-13 09:36:00}. A straightforward calculation -#' gives that the duration of this bucket is \code{59,360 secs}. However, this -#' duration includes the time during which the market is closed `(16 hours)`. -#' The corrected duration takes into consideration only the time of market -#' activity: `duration=59,360-16*3600= 1760 secs`, i.e., about `30 minutes`. -#' -#' @return Returns an object of class \code{estimate.vpin}. +#' `stime` in the bucket. If the first and last trades in a bucket occur +#' on different days, and the market trading session is shorter than +#' `24 hours`, the bucket's duration will be inflated. 
For example, if the daily +#' trading session is 8 hours `(tradinghours = 8)`, and the start time of a +#' bucket is \code{2018-10-12 17:06:40} and its end time is +#' \code{2018-10-13 09:36:00}, the straightforward calculation gives a duration +#' of \code{59,360 secs}. However, this duration includes 16 hours when the +#' market is closed. The corrected duration considers only the market activity +#' time: \code{duration = 59,360 - 16 * 3600 = 1,760 secs}, approximately +#' `30 minutes`. +#' +#' The argument `grid_size` determines the size of the grid for the variables +#' `alpha` and `delta`, used to generate the initial parameter sets +#' that prime the maximum-likelihood estimation step of the +#' algorithm by \insertCite{ke2017improved;textual}{PINstimation} for estimating +#' `IVPIN`. If `grid_size` is set to a value `m`, the algorithm creates a +#' sequence starting from \code{1 / (2m)} and ending at \code{1 - 1 / (2m)}, with a +#' step of `1 / m`. The default value of `5` corresponds to the grid size used by +#' \insertCite{Yan2012;textual}{PINstimation}, where the sequence starts at +#' \code{0.1 = 1 / (2 * 5)} and ends at \code{0.9 = 1 - 1 / (2 * 5)} +#' with a step of \code{0.2 = 1 / 5}. 
Increasing the value of `grid_size` +#' increases the running time and may marginally improve the accuracy of the +#' IVPIN estimates +#' +#' @return Returns an object of class \code{estimate.vpin}, which +#' contains the following slots: +#' \describe{ +#' \item{\code{@improved}}{ A logical variable that takes the value `FALSE` +#' when the classical VPIN model is estimated (using `vpin()`), and `TRUE` +#' when the improved VPIN model is estimated (using `ivpin()`).} +#' \item{\code{@bucketdata}}{ A data frame created as in +#' \insertCite{abad2012;textual}{PINstimation}.} +#' \item{\code{@vpin}}{ A vector of VPIN values.} +#' \item{\code{@ivpin}}{ A vector of IVPIN values, which remains empty when +#' the function `vpin()` is called.} +#' } #' #' @references #' #' \insertAllCited #' #' @examples -#' # There is a preloaded dataset called 'hfdata' contained in the package. -#' # It is an artificially created high-frequency trading data. The dataset -#' # contains 100 000 trades and five variables 'timestamp', 'price', -#' # 'volume', 'bid' and 'ask'. For more information, type ?hfdata. +#' # The package includes a preloaded dataset called 'hfdata'. +#' # This dataset is an artificially created high-frequency trading data +#' # containing 100,000 trades and five variables: 'timestamp', 'price', +#' # 'volume', 'bid', and 'ask'. For more information, type ?hfdata. #' #' xdata <- hfdata #' -#' # Estimate VPIN model, using the following parameter set where the time -#' # bar size is 5 minutes, i.e., 300 seconds (timebarsize = 300), 50 -#' # buckets per average daily volume (buckets = 50), and a window size of -#' # 250 for the VPIN calculation (samplength = 250). 
+#' ### Estimation of the VPIN model ### +#' +#' # Estimate the VPIN model using the following parameters: +#' # - timebarsize: 5 minutes (300 seconds) +#' # - buckets: 50 buckets per average daily volume +#' # - samplength: 250 for the VPIN calculation #' #' estimate <- vpin(xdata, timebarsize = 300, buckets = 50, samplength = 250) #' -#' # Display a description of the estimate +#' # Display a description of the VPIN estimate #' #' show(estimate) #' -#' # Plot the estimated VPIN vector -#' -#' plot(estimate@vpin, type = "l", xlab = "time", ylab = "VPIN", col = "blue") -#' -#' # Display the parameters of VPIN estimates +#' # Display the parameters of the VPIN estimates #' #' show(estimate@parameters) #' -#' # Store the computed data of the different buckets in a dataframe 'buckets'. -#' # Display the first 10 rows of the dataframe 'buckets'. +#' # Display the summary statistics of the VPIN vector +#' summary(estimate@vpin) +#' +#' # Store the computed data of the different buckets in a dataframe 'buckets' +#' # and display the first 10 rows of the dataframe. #' #' buckets <- estimate@bucketdata #' show(head(buckets, 10)) #' -#' # Store the daily VPIN values (weighted and unweighted) in a dataframe -#' # 'dayvpin'. -#' #' # Display the first 10 rows of the dataframe 'dayvpin'. -#' #' dayvpin <- estimate@dailyvpin #' show(head(dayvpin, 10)) #' +#' +#' ### Estimation of the IVPIN model ### +#' +#' # Estimate the IVPIN model using the same parameters as above. +#' # The grid_size parameter is unspecified and will default to 5. +#' +#' iestimate <- ivpin(xdata, timebarsize = 300, samplength = 250, verbose = FALSE) +#' +#' # Display the summary statistics of the IVPIN vector +#' summary(iestimate@ivpin) +#' +#' # The output of ivpin() also contains the VPIN vector in the @vpin slot. +#' # Plot the VPIN and IVPIN vectors in the same plot using the iestimate object. +#' +#' # Define the range for the VPIN and IVPIN vectors, removing NAs. 
+#' vpin_range <- range(c(iestimate@vpin, iestimate@ivpin), na.rm = TRUE) +#' +#' # Plot the VPIN vector in blue +#' plot(iestimate@vpin, type = "l", col = "blue", ylim = vpin_range, +#' ylab = "Value", xlab = "Bucket", main = "Plot of VPIN and IVPIN") +#' +#' # Add the IVPIN vector in red +#' lines(iestimate@ivpin, type = "l", col = "red") +#' +#' # Add a legend to the plot +#' legend("topright", legend = c("VPIN", "IVPIN"), col = c("blue", "red"), +#' lty = 1, +#' cex = 0.6, # Adjust the text size +#' x.intersp = 1.2, # Adjust the horizontal spacing +#' y.intersp = 2, # Adjust the vertical spacing +#' inset = c(0.05, 0.05)) # Adjust the position slightly +#' +#' @importFrom magrittr %>% +#' @importFrom dplyr group_by mutate +#' @importFrom stats lag +#' @aliases vpin ivpin +#' @name vpin_measures +NULL + + +#' @rdname vpin_measures +#' @aliases vpin ivpin #' @export vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, tradinghours = 24, verbose = TRUE) { @@ -152,8 +222,9 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, @tradinghours : length of trading days - used to correct the durations of buckets. Default value is 24. 
" + vpin_err <- uierrors$vpin() - vpin_ms <- uix$vpin(timebarsize = timebarsize) + vpin_ms <- uix$vpin(timebarsize = timebarsize, improved = FALSE) # Check that all variables exist and do not refer to non-existent variables # -------------------------------------------------------------------------- @@ -169,12 +240,105 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, rst <- .xcheck$args(arglist = largs, fn = "vpin") ux$stopnow(rst$off, m = rst$error, s = uierrors$vpin()$fn) + .vpin(data = data, timebarsize = timebarsize, buckets = buckets, + samplength = samplength, tradinghours = tradinghours, improved = FALSE, + verbose = verbose) +} + + +#' @rdname vpin_measures +#' @aliases vpin ivpin +#' @export +ivpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, + tradinghours = 24, grid_size = 5, verbose = TRUE) { + + " +@timebarsize : the size of timebars in seconds default value: 60 +@buckets : number of buckets per volume of bucket size default value: 50 +@samplength : sample length or window of buckets to calculate VPIN default + value: 50 +@tradinghours : length of trading days - used to correct the durations of + buckets. Default value is 24. +@grid_size : size of the grid used to generate initial parameter sets for + the maximum-likelihood step for the estimation of ivpin. 
+" + + vpin_err <- uierrors$vpin() + vpin_ms <- uix$vpin(timebarsize = timebarsize, improved = TRUE) + + # Check that all variables exist and do not refer to non-existent variables + # -------------------------------------------------------------------------- + allvars <- names(formals()) + environment(.xcheck$existence) <- environment() + .xcheck$existence(allvars, err = uierrors$vpin()$fn) + + # Checking the the arguments of the function are valid + # Check that all arguments are valid + # ------------------------------------------------------------------------- + largs <- list(data, timebarsize, buckets, samplength, tradinghours, grid_size, + verbose) + names(largs) <- names(formals()) + rst <- .xcheck$args(arglist = largs, fn = "vpin") + ux$stopnow(rst$off, m = rst$error, s = uierrors$vpin()$fn) + + .vpin(data = data, timebarsize = timebarsize, buckets = buckets, + samplength = samplength, tradinghours = tradinghours, + grid_size = grid_size, improved = TRUE, verbose = verbose) +} + + + + + +## +++++++++++++++++++++++++ +## ++++++| | PRIVATE FUNCTIONS | | +## +++++++++++++++++++++++++ + + + +.vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, + tradinghours = 24, improved = FALSE, grid_size = 5, + verbose = TRUE) { + + " +@timebarsize : the size of timebars in seconds default value: 60 +@buckets : number of buckets per volume of bucket size default value: 50 +@samplength : sample length or window of buckets to calculate VPIN default + value: 50 +@tradinghours : length of trading days - used to correct the durations of + buckets. Default value is 24. +@improved : selects the estimation model, the volume-synchronized PIN of + Easley et al.(2011, 2012) if FALSE, and the improved volume- + synchronized PIN of Ke and Lin (2017). +@grid_size : size of the grid used to generate initial parameter sets for + the maximum-likelihood step for the estimation of ivpin. 
+" + + vpin_err <- uierrors$vpin() + vpin_ms <- uix$vpin(timebarsize = timebarsize, improved) + + # Check that all variables exist and do not refer to non-existent variables + # -------------------------------------------------------------------------- + allvars <- names(formals()) + environment(.xcheck$existence) <- environment() + .xcheck$existence(allvars, err = uierrors$vpin()$fn) + + # Checking the the arguments of the function are valid + # Check that all arguments are valid + # ------------------------------------------------------------------------- + largs <- list(data, timebarsize, buckets, samplength, tradinghours, improved, + grid_size, verbose) + names(largs) <- names(formals()) + rst <- .xcheck$args(arglist = largs, fn = "vpin") + ux$stopnow(rst$off, m = rst$error, s = uierrors$vpin()$fn) + # Convert data into a dataframe, in case it is a matrix or an array data <- as.data.frame(data) # initialize the local variables tbv <- vbs <- bucket <- NULL estimatevpin <- new("estimate.vpin") + estimatevpin@improved = improved time_on <- Sys.time() @@ -207,7 +371,7 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, dataset$volume <- as.numeric(dataset$volume) - dataset$timestamp <- as.POSIXct(dataset$timestamp) + dataset$timestamp <- as.POSIXct(dataset$timestamp, origin = orig, tz = "UTC") rownames(dataset) <- NULL @@ -248,7 +412,122 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, tbsize <- paste(timebarsize, " sec", sep = "") - dataset$interval <- cut(dataset$timestamp, breaks = tbsize) + # We need to find the trading hours. We'll assume that trading starts + # at the hour of the day when the earliest trade occurred, and ends + # at the hour of the day when the latest trade occurred across all days. + + # We will use "1970-01-01" as the origin for POSIXct conversion, so we will + # give it a shorter name: orig. 
Similarly, we will use the timezone UTC, and + # two formats: "%H:%M:%S", which we call hf for hour format; and the format + # "%Y-%m-%d %H:%M:%S" which we call ff for full format. + orig = "1970-01-01" + hf <- "%H:%M:%S" + ff <- "%Y-%m-%d %H:%M:%S" + + + # Finding trading hours + # ---------------------------------------------------------------------------- + # Extract the time component from the timestamps + dataset$time_only <- format(dataset$timestamp, format = hf) + + # Find the earliest and latest times across all timestamps + earliest_time <- min(as.POSIXct(dataset$time_only, format = hf, + origin = orig, tz = "UTC")) + latest_time <- max(as.POSIXct(dataset$time_only, format = hf, + origin = orig, tz = "UTC")) + + # Round earliest_time down to the nearest full hour and add one millisecond + earliest_time <- as.POSIXct(floor(as.numeric(earliest_time) / 3600) * 3600, + origin = orig, tz = "UTC") + 0.001 + + # Round latest_time up to the nearest full hour and subtract one millisecond + latest_time <- as.POSIXct(ceiling(as.numeric(latest_time) / 3600) * 3600, + origin = orig, tz = "UTC") - 0.001 + + # Drop the temporary 'time_only' column + dataset$time_only <- NULL + + + # Partitioning the trading time into timebars requires considering time, + # not just trade data. Timebars, even with no trades, impact the duration + # of buckets. We'll create the 'partition' dataframe to divide the entire + # trading hours into timebars. Non-trading days are excluded, and we focus + # on unique days that have at least one trade. 
+ + # Constructing the 'partition' dataframe + # ---------------------------------------------------------------------------- + + # Extract the unique days from the dataset + unique_days <- unique(as.Date(dataset$timestamp, origin = orig, tz = "UTC")) + + # # Create a data frame with all possible intervals for each unique trading day + partition <- do.call(rbind, lapply(unique_days, function(day) { + # Create time intervals from earliest_time to latest_time for each day + day_start <- as.POSIXct(paste(day, format(earliest_time, hf))) + day_end <- as.POSIXct(paste(day, format(latest_time, hf))) + data.frame(interval = levels(cut(seq(day_start, day_end, by = tbsize), + breaks = tbsize))) + })) + + # Extract the intervals from partition as POSIXct format. + intervals <- as.POSIXct(partition$interval, format = ff, tz = "UTC") + + # Fix any issue where POSIXct conversion causes dates at 00:00:00 to become NA + intervals[1] <- as.POSIXct( + paste(unique_days[1], format(earliest_time, hf)), tz = "UTC") + + + # Partition the trading data into intervals matching those in 'partition'. + # To ensure compatibility, we find the closest interval to the earliest trade + # in the dataset. + start_intval <- max(na.omit(intervals[intervals <= min(dataset$timestamp)])) + + + # Assign each timestamp to its corresponding interval using 'cut'. + # Start from the closest start interval and end at the maximum timestamp. + # Non-trading days are automatically ignored. + end_intval <- max(dataset$timestamp) + dataset$interval <- + cut(dataset$timestamp, breaks = seq(start_intval, end_intval,by = tbsize)) + + dataset$interval <- as.POSIXct(dataset$interval, format = ff, tz = "UTC") + + + # Now we merge the dataset with 'partition' to obtain one dataset where + # the entire trading time is partitioned into timebars. For timebars with + # no trades, the price and volume will be set to 0. The timestamp will + # default to the start of the interval to avoid NA values. 
+ + # Merging the dataset with the 'partition' dataframe + # ---------------------------------------------------------------------------- + + # Merge the original dataset with the 'partition' dataframe + partition$interval <- intervals + db_full <- merge(partition, dataset, by = "interval", all.x = TRUE) + + # Fill missing data: set price and volume to 0, and set missing timestamps + # to the interval start time + db_full$price[is.na(db_full$price)] <- 0 + db_full$volume[is.na(db_full$volume)] <- 0 + db_full$timestamp[is.na(db_full$timestamp)] <- db_full$interval[ + is.na(db_full$timestamp)] + + # Assign the merged dataset 'db_full' back to 'dataset' + dataset <- db_full + + # For the dataset included in this package 'hfdata', we get the following + # output + # ...................................................................... + # interval timestamp price volume + # 2018-10-18 00:07:00 2018-10-18 00:07:00 0.0000 0 + # 2018-10-18 00:08:00 2018-10-18 00:08:00 0.0000 0 + # 2018-10-18 00:09:00 2018-10-18 00:09:00 0.0000 0 + # 2018-10-18 00:10:00 2018-10-18 00:10:00 0.0000 0 + # 2018-10-18 00:11:00 2018-10-18 00:11:33 15.4357 6 + # 2018-10-18 00:12:00 2018-10-18 00:12:00 0.0000 0 + # 2018-10-18 00:13:00 2018-10-18 00:13:10 15.4754 6 + # 2018-10-18 00:14:00 2018-10-18 00:14:00 0.0000 0 + # ---------------------------------------------------------------------------- # I.2 CREATE THE T-MINUTE TIMEBAR DATASET CALLED 'MINUTEBARS' @@ -280,7 +559,7 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, aggregate(volume ~ interval, dataset[1:chunk, ], sum), by = "interval" ) - tempbars$interval <- as.POSIXct(tempbars$interval, tz = "") + tempbars$interval <- as.POSIXct(tempbars$interval, origin = orig, tz = "UTC") temptime_off <- Sys.time() @@ -303,7 +582,7 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, minutebars <- merge(minutebars, aggregate(volume ~ interval, dataset, sum), by = "interval") - minutebars$interval <- 
as.POSIXct(minutebars$interval, tz = "") + minutebars$interval <- as.POSIXct(minutebars$interval, origin = orig, tz = "UTC") colnames(minutebars) <- c("interval", "dp", "tbv") minutebars <- minutebars[complete.cases(minutebars), ] @@ -403,9 +682,8 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, minutebars$id <- seq.int(nrow(minutebars)) # Define the precision factor (x) and calculate the threshold - - x <- 10 - threshold <- (1 - 1 / x) * vbs + x <- 1 + threshold <- vbs #(1 - 1 / x) * vbs #vbs # Find all timebars with volume higher than the threshold and store them in # largebars @@ -451,8 +729,39 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # ------------------------------- # 2019-04-02 04:53 53.0 340. + # ---------------------------------------------------------------------------- + # A new addition inspired by ivpin + # ---------------------------------------------------------------------------- + # In addition to dividing the volume of the time bars into bars with volume + # of 'threshold', we will also divide the duration of the time bar size if + # the time bar spans over multiple buckets. + + # As discussed above, the time bar will be divided between a remainder (340) + # and 5 bars of volume size of 1000. The time will be divided similarly, i.e., + # proportionally. If 'timebarsize' is equal to 60, then the duration given to + # the remainder bar will be (340/5340)*60, while each of the 5 bars will get + # a duration of (1000/5340)*60 each. The difference now, is that we will update + # the timestamp in the variable 'interval': The remainder bar will keep the + # same timestamp, while the first of 5 bars will see its timestamp increase by + # (340/5340)*60 seconds, the second one, by (1340/5340)*60, the third one by + # (2340/5340)*60... and so on, this is inline with the spirit of the ivpin + # study that assume that trades are uniformly distributed over the span of the + # time bar. 
The variable 'duration' will contain the duration just discussed. + The duration will be calculated as above for the large time bars, i.e., for + the time bars with volume larger than vbs. The remaining time bars will get + a duration equal to 'timebarsize', 60 in our example. + + minutebars$duration <- timebarsize + # print(sum(minutebars$duration)) + if(!is.null(largebnum)) { + # First we update the duration, and then the volume corresponding to the + # bar holding the remainder of volume. + + minutebars[largebnum, ]$duration <- timebarsize * + ((minutebars[largebnum, ]$tbv %% threshold) / minutebars[largebnum, ]$tbv) + minutebars[largebnum, ]$tbv <- minutebars[largebnum, ]$tbv %% threshold # We will now replicate the time bar with the same price movement (dp), the @@ -467,7 +776,9 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # All what is left now is to change the value of 'tbv' in 'largebars' to # (threshold/x) and then replicate the timebars using the corresponding value # in n_rep + # The value of the duration is calculated analogously. + largebars$duration = ((threshold / x) / largebars$tbv) * timebarsize + largebars$tbv <- threshold / x + largebars <- largebars[rep(seq_len(nrow(largebars)), n_rep), ] @@ -493,8 +804,96 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, minutebars <- rbind(minutebars, largebars) minutebars <- minutebars[order(minutebars$interval), ] + + # NEW: + # Now we update the timestamps of the time bars after they have been divided + # This is a bit technical but it shall become clear once operated.
+ + # Let's look at this timebar + # minute dp tbv duration + # ----------------------------------------------- + # 2018-10-18 04:31:33 0.0228 3690 60 + + # It has a volume of 3690 larger than 'threshold' that is equal to 3500.849 + # Therefore it will be divided in one remainder time bar of volume 189.1505 + # (3690 %% 3500.849 = 189.1505) and 10 (x = 10) time bars of volume 350.0849 + # (threshold/x = 3500.849/10 = 350.0849). The duration assigned to the first + # time bar (remainder) is 3.075618 ((189.1505/3690) * 60 = 3.075618); while + # the others will have a duration of 5.692437 each ((350.0849/3690) * 60). + + # minute dp tbv duration + # ----------------------------------------------- + # 2018-10-18 04:31:33 0.0228 189.1505 3.075618 + # 2018-10-18 04:31:33 0.0228 350.0849 5.692438 + # 2018-10-18 04:31:33 0.0228 350.0849 5.692438 + # 2018-10-18 04:31:33 0.0228 350.0849 5.692438 + # 2018-10-18 04:31:33 0.0228 350.0849 5.692438 + # 2018-10-18 04:31:33 0.0228 350.0849 5.692438 + # 2018-10-18 04:31:33 0.0228 350.0849 5.692438 + # 2018-10-18 04:31:33 0.0228 350.0849 5.692438 + # 2018-10-18 04:31:33 0.0228 350.0849 5.692438 + # 2018-10-18 04:31:33 0.0228 350.0849 5.692438 + # 2018-10-18 04:31:33 0.0228 350.0849 5.692438 + + # It remains now to update the time stamps, since the first time bar took + # 3.075618 to complete, the timestamp of the second time bar should be + # 2018-10-18 04:31:33 + 3.075618 = 2018-10-18 04:31:36. How to do that + # recursively? We need to find the cumulative sum of duration for each + # timestamp (the variable interval), and then take its lagged values. This + # way, time bars with duration of 60 will have a lagged cumulative duration + # of zero. Once we have the lagged cumulative duration, we can just add it + # to the timestamp (interval) and obtain a proportional and uniform + # distribution of the duration, with updated timestamps.
+ + minutebars <- minutebars %>% + group_by(interval) %>% + mutate(cum_duration = cumsum(duration), + lag_cum_duration = lag(cum_duration, default = 0)) %>% dplyr::ungroup() + + # We obtain: + + # minute dp tbv duration lag_cum_duration + # -------------------------------------------------------------- + # 2018-10-18 04:31:33 0.0228 189. 3.08 0 + # 2018-10-18 04:31:33 0.0228 350. 5.69 3.08 + # 2018-10-18 04:31:33 0.0228 350. 5.69 8.77 + # 2018-10-18 04:31:33 0.0228 350. 5.69 14.5 + # 2018-10-18 04:31:33 0.0228 350. 5.69 20.2 + # 2018-10-18 04:31:33 0.0228 350. 5.69 25.8 + # 2018-10-18 04:31:33 0.0228 350. 5.69 31.5 + # 2018-10-18 04:31:33 0.0228 350. 5.69 37.2 + # 2018-10-18 04:31:33 0.0228 350. 5.69 42.9 + # 2018-10-18 04:31:33 0.0228 350. 5.69 48.6 + # 2018-10-18 04:31:33 0.0228 350. 5.69 54.3 + + + minutebars$interval <- minutebars$interval + minutebars$lag_cum_duration + minutebars <- data.frame(minutebars) + + # Now, we obtain a uniform distribution of the volume and the duration + # of large time bars over multiple smaller time bars. The result looks as + # follows: + + # minute dp tbv duration + # ------------------------------------------------ + # 2018-10-18 04:31:33 0.0228 189.1505 3.075618 + # 2018-10-18 04:31:36 0.0228 350.0849 5.692438 + # 2018-10-18 04:31:42 0.0228 350.0849 5.692438 + # 2018-10-18 04:31:48 0.0228 350.0849 5.692438 + # 2018-10-18 04:31:53 0.0228 350.0849 5.692438 + # 2018-10-18 04:31:59 0.0228 350.0849 5.692438 + # 2018-10-18 04:32:05 0.0228 350.0849 5.692438 + # 2018-10-18 04:32:10 0.0228 350.0849 5.692438 + # 2018-10-18 04:32:16 0.0228 350.0849 5.692438 + # 2018-10-18 04:32:22 0.0228 350.0849 5.692438 + # 2018-10-18 04:32:27 0.0228 350.0849 5.692438 + + minutebars$cum_duration <- minutebars$lag_cum_duration <- NULL + + + # Finally, we reassign new identifiers to timebars (id) to make it easier to - # identify them in coming tasks + # identify them in coming tasks. 
minutebars$id <- seq.int(nrow(minutebars)) rm(largebars, largebnum) @@ -610,7 +1009,8 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # Now that we have found all the identifiers; for each minutebar having an # identifier in xtrarnum, change the value volume (tbv) to the value of excess # volume (exvol). - + minutebars[xtrarnum, ]$duration <- minutebars[xtrarnum, ]$duration * + (minutebars[xtrarnum, ]$exvol/minutebars[xtrarnum, ]$tbv) minutebars[xtrarnum, "tbv"] <- minutebars[xtrarnum, "exvol"] # -------------------------------------------------------------------------- @@ -624,9 +1024,26 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # will just change the bucket number to the one before it and set tbv to # tbv-exvol. + # Before that, we find how much time is spent in the last timebar in each + # bucket, which is proportional to (tbv-exvol). Once we find it, we add it + # to the timestamp (interval); so that the duration calculation at the end + # becomes correct. The same timestamp should be that of the first timebar in + # the next bucket. + + + xtrarows$interval <- xtrarows$interval + xtrarows$duration * + ((xtrarows$tbv - xtrarows$exvol)/xtrarows$tbv) + xtrarows$duration <- xtrarows$duration * + ((xtrarows$tbv - xtrarows$exvol)/xtrarows$tbv) + minutebars[xtrarnum, ]$interval <- xtrarows$interval + xtrarows$tbv <- xtrarows$tbv - xtrarows$exvol xtrarows$bucket <- xtrarows$bucket - 1 + + # Now it is time to drop the duration variable of the dataframe minutebars + # minutebars$duration <- NULL + # -------------------------------------------------------------------------- # V.4 CALCULATE ACCUMULATED BUCKET VOLUME FOR THE ORIGINAL DATASET # -------------------------------------------------------------------------- @@ -636,8 +1053,8 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # minutebars next to one another.
xtrarows$interval <- as.POSIXct(xtrarows$interval, - origin = "1970-01-01", tz = "") - xtrarows <- xtrarows[c("interval", "dp", "tbv", "id", "runvol", "bucket", + origin = orig, tz = "UTC") + xtrarows <- xtrarows[c("interval", "dp", "tbv", "id", "duration", "runvol", "bucket", "exvol")] minutebars <- rbind(minutebars, xtrarows) @@ -649,6 +1066,9 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, minutebars$runvol <- cumsum(minutebars$tbv) minutebars$exvol <- minutebars$runvol - (minutebars$bucket - 1) * vbs + # Reinitialize rownames of minutebars. + rownames(minutebars) <- NULL + rm(xtrarnum, xtrarows) ############################################################################ @@ -697,8 +1117,8 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # +++++++++++++++++++ bucketdata <- setNames( - aggregate(cbind(bvol, svol) ~ bucket, minutebars, sum), - c("bucket", "agg.bvol", "agg.svol")) + aggregate(cbind(bvol, svol, duration) ~ bucket, minutebars, sum), + c("bucket", "agg.bvol", "agg.svol", "duration")) bucketdata$aoi <- abs(bucketdata$agg.bvol - bucketdata$agg.svol) bucketdata$starttime <- aggregate(interval ~ bucket, @@ -714,7 +1134,7 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, # USER MESSAGE # -------------------------------------------------------------------------- - ux$show(c= verbose, m = vpin_ms$step8) + ux$show(c= verbose & !improved, m = vpin_ms$step8) # -------------------------------------------------------------------------- # TASK DESCRIPTON @@ -762,7 +1182,7 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, (samplength * vbs) bucketdata$vpin[samplength] <- bucketdata$cumoi[samplength] / (samplength * vbs) - bucketdata$duration <- as.numeric( + bucketdata$bduration <- as.numeric( difftime(bucketdata$endtime, bucketdata$starttime, units = "secs")) # -------------------------------------------------------------------------- @@ -783,8 +1203,8 @@ vpin <- 
function(data, timebarsize = 60, buckets = 50, samplength = 50, } if (tradinghours > 0 & tradinghours < 24) { - bucketdata$duration <- apply(bucketdata, 1, function(x) - corduration(x[6], x[5], x[10])) + bucketdata$bduration <- apply(bucketdata, 1, function(x) + corduration(x[6], x[5], x[11])) } # Store the vector of vpin in the results vector @@ -800,7 +1220,7 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, dailyvpin <- setNames(aggregate(vpin ~ day, bucketdata, function(x) mean(x, na.rm = TRUE)), c("day", "dvpin")) - dailyvpin$day <- as.Date(dailyvpin$day) + dailyvpin$day <- as.Date(dailyvpin$day, origin = orig, tz = "UTC") temp <- setNames(aggregate(cbind(duration, vpindur) ~ day, bucketdata, function(x) sum(x, na.rm = TRUE)), @@ -822,9 +1242,345 @@ vpin <- function(data, timebarsize = 60, buckets = 50, samplength = 50, time_off <- Sys.time() + if(!improved) { + + estimatevpin@runningtime <- ux$timediff(time_on, time_off) + + ux$show(c= verbose, m = vpin_ms$complete) + + return(estimatevpin) + + } + + bucketdata$duration <- bucketdata$duration /mean(bucketdata$duration) + + ############################################################################ + # STEP 8 : CALCULATING IVPIN VECTOR FOLLOWING KE & LIN 2017 + ############################################################################ + + # -------------------------------------------------------------------------- + # USER MESSAGE + # -------------------------------------------------------------------------- + + ux$show(c= verbose & improved, m = vpin_ms$step8) + + # -------------------------------------------------------------------------- + # TASK DESCRIPTION + # -------------------------------------------------------------------------- + + # The calculation of the IVPIN vector uses the vector of order imbalances + # (OI = agg.bvol - agg.svol), the sample length (samplength), the volume + # bucket size (vbs), and the vector of duration (duration). + + # Easley et al. 
(2012b) derived the VPIN estimator based on two moment conditions + # from Poisson processes: + # E[|VB - VS|] ≈ alpha * mu and E[|VB + VS|] = 2 * epsilon + alpha * mu. + # Ke et al. (2017) suggested expressing these conditions as: + # (EQ1) E[|VB - VS| | t; theta] ≈ alpha * mu * t, and + # (EQ2) E[|VB + VS| | t; theta] = (2 * epsilon + alpha * mu) * t. + # These equations correspond to Equations (3) on page 364 of Ke et al. (2017). + + # In our implementation: + # - The variable "total_arrival_rate" or 'tar' is calculated as |VB + VS|/t. + # Its average would, therefore, represent 2 * epsilon + alpha * mu. + # - The variable "informed_arrival_rate" or 'iar' is calculated as |VB - VS|/t. + # Its average would, therefore, approximate alpha * mu. + + # These two variables 'tar' and 'iar' allow us to estimate mu using (EQ1), + # where mu = mean(iar) / alpha. + # Once mu is estimated, we can determine epsilon using (EQ2), where + # eps = mean(tar - iar) / 2. + + # In the first step, we calculate for each bucket, mean(iar) and mean(tar - iar), + # which correspond to the average total arrival rate and uninformed arrival rate + # for the last 'samplength' buckets. + + # Let's start by calculating the variables 'tar' and 'iar' + bucketdata$tar <- with (bucketdata, (agg.bvol + agg.svol)/duration) + bucketdata$iar <- with (bucketdata, abs(agg.bvol - agg.svol)/duration) + + # To calculate the mean 'iar' and 'tar' over the last 'samplength' buckets, + # we use the following strategy. + # Assume samplength is 50. The formula for the first average 'tar' + # observation is sum(tar[i], i = 1 to 50) / 50. + # The formula for the 5th average 'tar' is sum(tar[i], i = 5 to 54) / 50. + + # Note that sum(tar[i], i = 5 to 54) = sum(tar[i], i = 1 to 54) - sum(tar[i], i = 1 to 4). + # The first one is the cumulative sum at 54 and the second one is the cumulative + # sum at 4. So sum(tar[i], i = 5 to 54) = cumsum[54] - cumsum[4]. 
+ # The general formula is sum(tar[i], i = n to 50 + n) = cumsum[50 + n] - cumsum[n]. + + # So, to calculate the mean 'tar' and mean 'iar' for the bucket 50, we shift + # the cumsum by 1 position. + + # -------------------------------------------------------------------------- + # VIII.1 CALCULATING AVERAGE TAR AND IAR VECTOR + # -------------------------------------------------------------------------- + + # Calculate the cumulative sum of 'TAR' and 'IAR': cumTAR and cumIAR + + if (nrow(bucketdata) < samplength) { + + estimatevpin@success <- FALSE + + estimatevpin@errorMessage <- vpin_err$largesamplength + + ux$show(c= verbose, m = vpin_ms$aborted) + + ux$show(ux$line()) + + ux$stopnow(m = vpin_err$largesamplength, s = vpin_err$fn) + + } + + bucketdata$cumTAR <- cumsum(bucketdata$tar) + bucketdata$cumIAR <- cumsum(bucketdata$iar) + + bucketdata$lagcumTAR <- head(c(rep(NA, samplength - 1), 0, bucketdata$cumTAR), + nrow(bucketdata)) + bucketdata$lagcumIAR <- head(c(rep(NA, samplength - 1), 0, bucketdata$cumIAR), + nrow(bucketdata)) + + # Recall our objective: + # These two variables 'tar' and 'iar' allow us to estimate mu using (EQ1), + # where mu = mean(iar) / alpha. + # Once mu is estimated, we can determine epsilon using (EQ2), where + # eps = mean(tar - iar) / 2. + # These averages are taken over the last 'samplength' buckets. + + # The variable avIAR = mean(iar) for the last 'samplength' (say 50) buckets + # can be easily obtained for bucket k by taking the sum of 'iar' over buckets + # k-49 to bucket k, and dividing this sum by 50. + # This sum at bucket k can be easily obtained by cumIAR[k] - lagcumIAR[k]. + + # The variable avUAR = mean(tar - iar) for the last 'samplength' (say 50) + # buckets, can be easily obtained for bucket k by subtracting the sum of 'iar' + # over the buckets k-49 to bucket k from the sum of 'tar' over buckets k-49 to + # bucket k, and dividing this sum by 50. 
+ # These sums at bucket k can be easily obtained by cumTAR[k] - lagcumTAR[k] + # and cumIAR[k] - lagcumIAR[k]. + + bucketdata$avIAR <- (bucketdata$cumIAR - bucketdata$lagcumIAR)/samplength + bucketdata$avTAR <- (bucketdata$cumTAR - bucketdata$lagcumTAR)/samplength + bucketdata$avUAR <- bucketdata$avTAR - bucketdata$avIAR + + # Now, we have obtained the variables we need to calculate mu and alpha. + # (EQ1) becomes mu (at bucket k) = avIAR[k] / alpha. + # (EQ2) becomes eps (at bucket k) = avUAR[k] / 2. + + + # We can, therefore, delete the other variables + bucketdata$cumIAR <- bucketdata$lagcumIAR <- bucketdata$avTAR <- NULL + bucketdata$cumTAR <- bucketdata$lagcumTAR <- NULL + bucketdata$tar <- bucketdata$iar <- NULL + + # -------------------------------------------------------------------------- + # VIII.2 MAXIMUM-LIKELIHOOD ESTIMATION + # -------------------------------------------------------------------------- + + # Implementation of this step involves the following: + # We will loop over the buckets, collecting the necessary elements for the + # maximum likelihood estimation for each bucket. These elements include: + # 1. A dataframe composed of agg.bvol, agg.svol, and duration for the last + # 'samplength' buckets. + # 2. The values of avIAR and avUAR. + + # The value of avIAR will help us create a grid of initial parameter sets. + # The values of alpha and delta are chosen within the interval (0, 1) following + # the grid algorithm of Yang and Zhang (2012) and used by Ke and Lin (2017). + # We use avIAR to estimate mu, given by (EQ1) above: mu = avIAR / alpha. + # The value of eps is given by (EQ2) and is simply: eps = avUAR / 2. + # For each bucket, we start by creating a set of initial parameter sets in + # a dataframe called 'initials'. + + # We extract the relevant dataframe for the maximum likelihood estimation, + # which consists of the variables agg.bvol, agg.svol, and duration for the + # last 'samplength' buckets.
+ # mldata <- buckets[(k - samplength + 1):k, c("agg.bvol", "agg.svol", "duration")] + + # For the dataframe 'initials', we apply a function 'findMLE'. The function + # sapply() takes the values of the initial parameters, row by row, and sends + # them alongside the dataframe mldata to the function findMLE(). + + # The function findMLE receives the initial parameter set, the dataframe + # mldata, and finds the likelihood-maximizing estimate. It returns a vector of + # 4 estimates (alpha, delta, mu, eps) and the value of the likelihood. + # Once we have all results, we pick the row that has the highest likelihood. + # We then use the corresponding variables to calculate the value of ivpin using + # Equation (19) in Ke and Lin (2017), page 370. + + # ivpin = (alpha* * mu*) / (eps.b* + eps.s* + mu*) + + # Define the function findMLE + + findMLE <- function(params, data, s = FALSE){ + + # Load the factorization of the likelihood function for the IVPIN model. + # Details of the factorization can be found in footnote 8 on page 369 of the paper. + + mlefn <- factorizations$ivpin(data) + + # Calculate the default value for the function in case the estimation fails + # or if the returned likelihood is infinite. This default value consists of + # the initial parameter set and the likelihood value calculated from these + # initial parameters. The estimation is considered to have failed if the + # convergence value is below zero. 
+ + default_value <- c(params, mlefn(params), 0) + + optimal <- tryCatch({ + low <- c(0, 0, 0, 0, 0) + up <- c(1, 1, Inf, Inf, Inf) + estimates <- suppressWarnings( + nloptr::lbfgs(params, + fn = mlefn, + lower = low, + upper = up)) + if(estimates$convergence < 0){ + return(default_value) + } + + c(estimates$par, estimates$value, 1) + }) + + return(optimal) + + } + + # Initialize the progress bar + if (verbose) { + pb_ivpin <- ux$progressbar(0, maxvalue = nrow(bucketdata)- samplength + 1) + cat(uix$vpin()$progressbar) + } + + # Create an NA vector of ivpin values + bucketdata$ivpin <- NA + + # create a variable to store the optimal value from the previous optimization + previously_optimal <- NULL + + + for(k in samplength:nrow(bucketdata)){ + + # Obtain mldata which consists of Vb, Vs and t + mldata <- bucketdata[(k-samplength+1):k, c("agg.bvol", "agg.svol", "duration")] + + # If the variable previously_optimal is not NULL, we use it as the initial + # value for the optimization. If the optimization is successful, i.e., the + # last element of the output from the function findMLE is equal to 1, then + # the first five values of the output represent the new optimal values for + # the current bucket, and also become the new "previously_optimal" values for + # the next iteration. We then move to the next bucket. If the optimization is + # not successful, we continue with the grid search.
+ + + if (!is.null(previously_optimal)) { + optimal <- as.list(findMLE(previously_optimal, mldata)) + names(optimal) <- c("alpha", "delta", "mu", "eb", "es", "likelihood", "conv") + + if(optimal$conv >= 0){ + bucketdata$ivpin[k] <- with(optimal, (alpha * mu)/(eb + es + mu)) + previously_optimal <- unlist(optimal[1:5]) + # Update the progressbar + if (verbose) setTxtProgressBar(pb_ivpin, k) + next + } + } + + ############################################################################ + # THIS IS THE GRID ######################################################### + ############################################################################ + + # Obtain currentavUAR and currentavIAR, which represent the current average + # uninformed arrival rate and the current average informed arrival rate. We + # use them to calculate the values of mu, eb, and es. + # Recall from (EQ1) mu = avIAR / alpha and from (EQ2) eb = es = avUAR / 2. + + currentavUAR <- bucketdata$avUAR[k] + currentavIAR <- bucketdata$avIAR[k] + + # Now, we create the grid of initial parameter sets using a variable called + # grid_size, which determines the granularity of the partition of the parameter + # space for alpha and delta. + + # Generate the set of values for alpha and delta: + # (0.1, 0.3, 0.5, 0.7, 0.9) when grid_size is equal to 5. + hstep <- 1 / (2 * grid_size) + grid <- seq(hstep, 1 - hstep, 2 * hstep) + alpha_values <- delta_values <- grid + + # Take the Cartesian product of the values of alpha and delta, and store them + # in a dataframe called 'initials'. + + initials <- as.data.frame(expand.grid(alpha_values, delta_values)) + colnames(initials) <- c("alpha", "delta") + + # Now we add the values of mu, eb, and es using (EQ1) and (EQ2) + initials$mu <- currentavIAR/initials$alpha + initials$eps.b <- initials$eps.s <- currentavUAR/2 + + # Now, we have everything we need to find the likelihood-maximizing + # parameters starting from each initial parameter set. 
+ # Apply the function findMLE using apply(), pass mldata as an additional + # argument, and store the results in a dataframe called 'results'. + # This dataframe will contain six columns: the first five are the optimal + # parameters (alpha*, delta*, mu*, eps.b*, eps.s*), and the last one is the + # likelihood value 'likelihood'. If the optimization of the likelihood + # function fails, the initial parameters are returned along with the + # likelihood value calculated at these parameters. This ensures that we do + # not have NA or infinite values. + + results <- as.data.frame(t(apply( + initials, 1, function(row){findMLE(row, mldata)}))) + colnames(results) <- c("alpha", "delta", "mu", "eb", "es", "likelihood", "conv") + + # We select the entries for which the variable 'conv' is equal to 1, indicating + # that the optimization algorithm has converged, and save them in a dataframe + # called 'optresults'. If this dataframe is empty, we use the dataframe + # of the initial parameter sets with the corresponding likelihood values. + + optresults <- results[results$conv == 1,] + if (nrow(optresults) == 0) optresults <- results + # if (k > samplength) { + # print(optresults) + # browser() + # } + + + # Once the results are returned, we select the row from 'optresults' that has + # the highest likelihood value, i.e., the likelihood-maximizing estimate. + + optimalrow <- results[which.max(results$likelihood),] + + # We calculate the value of IVPIN using Equation 19 on page 370 of + # Ke and Lin (2017). + + bucketdata$ivpin[k] <- with(optimalrow, (alpha * mu)/(eb + es + mu)) + + # We store the value of the optimal parameters in the variable 'previously_optimal' + # to use it in the next iteration. 
+ + previously_optimal <- unlist(optimalrow[1:5]) + + # Update the progressbar + + if (verbose) setTxtProgressBar(pb_ivpin, k) + + } + + ux$show(c= verbose, m = paste("\n", vpin_ms$step9, sep="")) + + # Store the vector of ivpin in the ivpin results vector + estimatevpin@ivpin <- bucketdata$ivpin + + estimatevpin@bucketdata <- bucketdata + + time_off <- Sys.time() + estimatevpin@runningtime <- ux$timediff(time_on, time_off) ux$show(c= verbose, m = vpin_ms$complete) return(estimatevpin) + } diff --git a/R/output_classes.R b/R/output_classes.R index e956118..9a638e5 100644 --- a/R/output_classes.R +++ b/R/output_classes.R @@ -217,6 +217,10 @@ setMethod( #' has succeeded, \code{FALSE} otherwise. #' @slot errorMessage (`character`) returns an error message if the `VPIN` #' estimation has failed, and is empty otherwise. +#' @slot improved (`logical`) returns the value \code{TRUE} when the model used +#' is the improved volume-synchronized probability of informed trading of Ke and +#' Lin (2017), and \code{FALSE} when the model used is the volume-synchronized +#' probability of informed trading of Easley et al.(2011,2012). #' @slot parameters (`numeric`) returns a numeric vector of estimation #' parameters (tbSize, buckets, samplength, VBS, #days), where `tbSize` is the #' size of timebars (in seconds); `buckets` is the number of buckets per average @@ -236,6 +240,8 @@ setMethod( #' the `VPIN` vector. #' @slot vpin (`numeric`) returns the vector of the volume-synchronized #' probabilities of informed trading. +#' @slot ivpin (`numeric`) returns the vector of the improved volume- +#' synchronized probabilities of informed trading as in Ke and Lin (2017). #' @slot dailyvpin (`dataframe`) returns the daily `VPIN` values. 
Two #' variants are provided for any given day: \code{dvpin} corresponds to #' the unweighted average of vpin values, and \code{dvpin.weighted} @@ -245,13 +251,13 @@ setMethod( setClass( "estimate.vpin", slots = list( - success = "logical", errorMessage = "character", parameters = "numeric", - bucketdata = "data.frame", vpin = "numeric", dailyvpin = "data.frame", - runningtime = "numeric" + success = "logical", errorMessage = "character", improved = "logical", + parameters = "numeric", bucketdata = "data.frame", vpin = "numeric", + ivpin = "numeric", dailyvpin = "data.frame", runningtime = "numeric" ), prototype = list( - success = TRUE, errorMessage = "", parameters = 0, - bucketdata = data.frame(), vpin = 0, dailyvpin = data.frame(), + success = TRUE, errorMessage = "", improved = FALSE, parameters = 0, + bucketdata = data.frame(), vpin = 0, ivpin = NaN, dailyvpin = data.frame(), runningtime = 0 ) ) diff --git a/R/utilities_messages.R b/R/utilities_messages.R index 643e882..3f95522 100644 --- a/R/utilities_messages.R +++ b/R/utilities_messages.R @@ -218,10 +218,12 @@ uix <- list( return(ui) }, - vpin = function(timebarsize = 0) { + vpin = function(timebarsize = 0, improved = FALSE) { + ui <- list() - ui$start <- "[+] VPIN Estimation started." + ui$start <- ifelse(improved, "[+] IVPIN Estimation started.", + "[+] VPIN Estimation started.") ui$step1 <- " |-[1] Checking and preparing the data..." @@ -242,11 +244,17 @@ uix <- list( ui$step7 <- " |-[7] Calculating aggregate bucket data..." - ui$step8 <- " |-[8] Calculating VPIN vector..." + ui$step8 <- ifelse(improved, " |-[8] Finding ML estimates for the ivpin model parameters...", + " |-[8] Calculating VPIN vector...") + ui$step9 <- " |-[9] Calculating IVPIN vector..." 
+ + ui$complete <- ifelse(improved, "[+] IVPIN estimation completed", + "[+] VPIN estimation completed") - ui$complete <- "[+] VPIN estimation completed" + ui$aborted <- ifelse(improved, "[+] IVPIN estimation aborted!", + "[+] VPIN estimation aborted!") - ui$aborted <- "[+] VPIN estimation aborted!" + ui$progressbar <- " of buckets treated" return(ui) @@ -1521,8 +1529,10 @@ uiclasses <- list( vpin = function(object) { ui <- list() + improved <- object@improved - badge_txt <- " VPIN model " + model <- ifelse(improved, "IVPIN", "VPIN") + badge_txt <- ifelse(improved, " IVPIN model ", " VPIN model ") ui$badge <- paste( "\n", ux$color(fg = 37, bg = 46, x = badge_txt), " ", sep = "") @@ -1530,29 +1540,33 @@ uiclasses <- list( ui$line <- "----------------------------------" ui$outcome <- if (object@success) - "VPIN estimation completed successfully" else - "VPIN estimation failed" + paste(model," estimation completed successfully", sep ="") else + paste(model," estimation failed", sep = "") ui$vpinfunctions <- paste( + ifelse(improved, "Type object@ivpin to access the IVPIN vector.\n",""), "Type object@vpin to access the VPIN vector.", - "\nType object@bucketdata to access data used to construct", - "the VPIN vector.", - "\nType object@dailyvpin to access the daily VPIN vectors.") + "\nType object@bucketdata to access data used to construct ", + "the ", model, " vector.", + "\nType object@dailyvpin to access the daily VPIN vectors.", sep = "") + vpinsummary <- ifelse(improved, unclass(summary(object@ivpin)), + unclass(summary(object@vpin))) vpinsummary <- unclass(summary(object@vpin)) + if(improved) vpinsummary <- unclass(summary(object@ivpin)) vpinnames <- names(vpinsummary) vpinsummary <- data.frame(t(ux$round3(vpinsummary))) colnames(vpinsummary) <- vpinnames ui$vpinsummary <- vpinsummary - ui$summarycaption <- "\r[+] VPIN descriptive statistics" + ui$summarycaption <- paste("\r[+] ", model, " descriptive statistics", sep = "") vpinparams <- 
data.frame(matrix(object@parameters, ncol = 5)) colnames(vpinparams) <- names(object@parameters) rownames(vpinparams) <- NULL ui$vpinparams <- vpinparams - ui$paramscaption <- "\r[+] VPIN parameters" + ui$paramscaption <- paste("\r[+] ", model, " parameters", sep = "") ui$error <- paste("\n\n", uierrors$vpin()$failed, "\n\n") diff --git a/README.md b/README.md index d0ccffe..8c0857c 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,15 @@ can be achieved by solely the use of raw trade level data. [![Like PINstimation? - Support our Work!](https://img.shields.io/badge/Like%20PINstimation%20-%20Support%20Our%20Work!-red?style=flat&logo=ko-fi&logoColor=white)](https://ko-fi.com/pinstimation) +## New features in Version 0.1.3 + +* We have updated the `initials_adjpin()` function, which generates initial parameter sets for the adjusted PIN model, to align with the algorithm outlined in Ersan and Ghachem (2024). + +* The function `adjpin()` now includes the time spent on generating initial parameter sets in the total time displayed in the output. + +* The function `ivpin()` has been introduced, which implements an improved version of the Volume-Synchronized Probability of Informed Trading (VPIN), based on Ke and Lin (2017). The function uses maximum likelihood estimation to provide more stable VPIN estimates, particularly for small volume buckets or infrequent informed trades, and improves the predictability of flow toxicity. + + ## New features in Version 0.1.2 * We introduce a new function called `classify_trades()` that enables users to diff --git a/authors.md b/authors.md deleted file mode 100644 index 2d9b7a9..0000000 --- a/authors.md +++ /dev/null @@ -1,94 +0,0 @@ -# Authors - - - - - -
- - - - - diff --git a/cran-comments.md b/cran-comments.md index cab081e..ebb1926 100644 --- a/cran-comments.md +++ b/cran-comments.md @@ -1,3 +1,53 @@ +## Resubmission 3 + +This is a resubmission. In this version I have added some new features and fixed a few bugs. + + +## New Features + +* We have updated the `initials_adjpin()` function, which generates initial parameter sets for the adjusted PIN model, to align with the algorithm outlined in Ersan and Ghachem (2024). + +* We have introduced the `ivpin()` function, which implements an improved version of the Volume-Synchronized Probability of Informed Trading (VPIN), based on Ke and Lin (2017). The function uses maximum likelihood estimation to provide more stable VPIN estimates, particularly for small volume buckets or infrequent informed trades, and improves the predictability of flow toxicity. + + +## Bugfixes + +* We have rectified an issue in the `mpin_ecm()` function. In cases where an observation in the E-step of the ECM algorithm had a zero probability of belonging to any cluster, we decided to assign it a uniform probability of belonging to each cluster, calculated as `1` divided by the total number of clusters (`cls`). Previously, the code erroneously assumed a fixed number of clusters (`6`), which has now been replaced with the variable `cls` to accommodate varying cluster sizes. + +* We have enhanced the format and performance of polynomial root calculations +within the conditional-maximization steps of the ECM algorithm. These enhancements +are implemented in the `solve_eqx` function. + +* We've addressed two concerns related to the dependency package future. Firstly, previous code assigned variables to the global environment for parallel calls of the function `.get_lagged_value`, potentially yielding unexpected results. The updated code employs lexical scoping using the `local()` function to manage the variable `.lwbound` between function calls.
Secondly, the previous code set the maximum size of futures to `+Inf` upon package loading, potentially impacting other packages. The option to adjust this value is now left to the user. + +### Test environments + +* local windows 11 , R 4.2.1 +* macOS-latest (release) (on GitHub) +* windows-latest (release) (on GitHub) +* windows-latest (4.1) (on GitHub) +* ubuntu-latest (devel) (on GitHub) +* ubuntu-latest (release) (on GitHub) +* ubuntu-latest (oldrel-1) (on GitHub) +* ubuntu-latest (oldrel-2) (on GitHub) +* Windows Server 2022, R-release, 64 bit - R 4.2.1 +* Windows Server 2022, R-devel, 64 bit - R 4.3 +* macOS 10.13.6 High Sierra, R-release, brew - R 4.3 +* macOS 10.13.6 High Sierra, R-release, CRAN's setup - R 4.3 + +### R CMD check results + +0 errors | 0 warnings | 0 notes + +### Reverse dependencies + +There are currently no downstream dependencies for this package. + + +----- +----- + + ## Resubmission 2 This is a resubmission. In this version I have added some new features and fixed a few bugs. @@ -15,7 +65,7 @@ trade is buyer-initiated, or `FALSE` if it is seller-initiated. automatically aggregated into daily trade data. However, with the updated version, users can now specify the desired frequency, such as every 15 minutes. -### New Bugfixes +### Bugfixes * We identified and corrected an error in the `mpin_ecm()` function. 
Previously, the function would sometimes produce inconsistent results as the posterior diff --git a/inst/REFERENCES.bib b/inst/REFERENCES.bib index 9c82ee7..b42f3e4 100644 --- a/inst/REFERENCES.bib +++ b/inst/REFERENCES.bib @@ -388,3 +388,14 @@ @article{griffin2021 year={2021}, publisher={Elsevier} } + +@article{ke2017improved, + title={An improved version of the volume-synchronized probability of informed trading}, + author={Ke, Wen-Chyan and Lin, Hsiou-Wei William and others}, + journal={Critical Finance Review}, + volume={6}, + number={2}, + pages={357--376}, + year={2017}, + publisher={Now Publishers, Inc.} +} diff --git a/man/PINstimation-package.Rd b/man/PINstimation-package.Rd index feb2ab6..a965b28 100644 --- a/man/PINstimation-package.Rd +++ b/man/PINstimation-package.Rd @@ -2,6 +2,7 @@ % Please edit documentation in R/PINstimation.R \docType{package} \name{PINstimation-package} +\alias{PINstimation} \alias{PINstimation-package} \title{An R package for estimating the probability of informed trading} \description{ @@ -178,6 +179,8 @@ using the initial parameter sets from the grid-search algorithm of \insertCite{Yan2012;textual}{PINstimation}. \item \link{vpin} estimates the volume-synchronized probability of informed trading (\code{VPIN}). +\item \link{ivpin} estimates the improved volume-synchronized probability +of informed trading (\code{IVPIN}). } } @@ -223,6 +226,15 @@ simulation of the aggregate daily trading data. 
\references{ \insertAllCited +} +\seealso{ +Useful links: +\itemize{ + \item \url{https://www.pinstimation.com} + \item \url{https://github.com/monty-se/PINstimation} + \item Report bugs at \url{https://github.com/monty-se/PINstimation/issues} +} + } \author{ Montasser Ghachem \href{mailto:montasser.ghachem@pinstimation.com}{montasser.ghachem@pinstimation.com} \cr diff --git a/man/estimate.vpin-class.Rd b/man/estimate.vpin-class.Rd index 35fa158..95c956b 100644 --- a/man/estimate.vpin-class.Rd +++ b/man/estimate.vpin-class.Rd @@ -29,6 +29,11 @@ has succeeded, \code{FALSE} otherwise.} \item{\code{errorMessage}}{(\code{character}) returns an error message if the \code{VPIN} estimation has failed, and is empty otherwise.} +\item{\code{improved}}{(\code{logical}) returns the value \code{TRUE} when the model used +is the improved volume-synchronized probability of informed trading of Ke and +Lin (2017), and \code{FALSE} when the model used is the volume-synchronized +probability of informed trading of Easley et al. (2011, 2012).} + \item{\code{parameters}}{(\code{numeric}) returns a numeric vector of estimation parameters (tbSize, buckets, samplength, VBS, #days), where \code{tbSize} is the size of timebars (in seconds); \code{buckets} is the number of buckets per average @@ -49,6 +54,9 @@ the \code{VPIN} vector.} \item{\code{vpin}}{(\code{numeric}) returns the vector of the volume-synchronized probabilities of informed trading.} +\item{\code{ivpin}}{(\code{numeric}) returns the vector of the improved volume- +synchronized probabilities of informed trading as in Ke and Lin (2017).} + \item{\code{dailyvpin}}{(\code{dataframe}) returns the daily \code{VPIN} values.
Two variants are provided for any given day: \code{dvpin} corresponds to the unweighted average of vpin values, and \code{dvpin.weighted} diff --git a/man/vpin.Rd b/man/vpin.Rd deleted file mode 100644 index da5d9ba..0000000 --- a/man/vpin.Rd +++ /dev/null @@ -1,110 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/model_vpin.R -\name{vpin} -\alias{vpin} -\title{Estimation of Volume-Synchronized PIN model} -\usage{ -vpin(data, timebarsize = 60, buckets = 50, samplength = 50, - tradinghours = 24, verbose = TRUE) -} -\arguments{ -\item{data}{A dataframe with 3 variables: -\code{{timestamp, price, volume}}.} - -\item{timebarsize}{An integer referring to the size of timebars -in seconds. The default value is \code{60}.} - -\item{buckets}{An integer referring to the number of buckets in a -daily average volume. The default value is \code{50}.} - -\item{samplength}{An integer referring to the sample length -or the window size used to calculate the \code{VPIN} vector. -The default value is \code{50}.} - -\item{tradinghours}{An integer referring to the length of daily -trading sessions in hours. The default value is \code{24}.} - -\item{verbose}{A binary variable that determines whether detailed -information about the steps of the estimation of the VPIN model is displayed. -No output is produced when \code{verbose} is set to \code{FALSE}. The default -value is \code{TRUE}.} -} -\value{ -Returns an object of class \code{estimate.vpin}. -} -\description{ -Estimates the Volume-Synchronized Probability of Informed -Trading as developed in \insertCite{Easley2011;textual}{PINstimation} -and \insertCite{Easley2012;textual}{PINstimation}. -} -\details{ -The dataframe data should contain at least three variables. Only the -first three variables will be considered and in the following order -\code{{timestamp, price, volume}}. - -The property \code{@bucketdata} is created as in -\insertCite{abad2012;textual}{PINstimation}. 
- -The argument \code{timebarsize} is in seconds enabling the user to implement -shorter than \code{1} minute intervals. The default value is set to \code{1} minute -(\code{60} seconds) following Easley et al. (2011, 2012). - -The parameter \code{tradinghours} is used to eventually correct the duration per -bucket. The duration of a given bucket is the difference between the -timestamp of the last trade \code{endtime} and the timestamp of the first trade -\code{stime} in the bucket. If the first trade and the last trade in a -bucket occur in two different days, and the market trading session does not -cover a full day \code{(24 hours)}; then the duration of the bucket will be -inflated. Assume that the daily trading session is 8 hours -\code{(tradinghours=8)}, the start time of a bucket is \code{2018-10-12 17:06:40} -and its end time is \code{2018-10-13 09:36:00}. A straightforward calculation -gives that the duration of this bucket is \code{59,360 secs}. However, this -duration includes the time during which the market is closed \verb{(16 hours)}. -The corrected duration takes into consideration only the time of market -activity: \verb{duration=59,360-16*3600= 1760 secs}, i.e., about \verb{30 minutes}. -} -\examples{ -# There is a preloaded dataset called 'hfdata' contained in the package. -# It is an artificially created high-frequency trading data. The dataset -# contains 100 000 trades and five variables 'timestamp', 'price', -# 'volume', 'bid' and 'ask'. For more information, type ?hfdata. - -xdata <- hfdata - -# Estimate VPIN model, using the following parameter set where the time -# bar size is 5 minutes, i.e., 300 seconds (timebarsize = 300), 50 -# buckets per average daily volume (buckets = 50), and a window size of -# 250 for the VPIN calculation (samplength = 250). 
- -estimate <- vpin(xdata, timebarsize = 300, buckets = 50, samplength = 250) - -# Display a description of the estimate - -show(estimate) - -# Plot the estimated VPIN vector - -plot(estimate@vpin, type = "l", xlab = "time", ylab = "VPIN", col = "blue") - -# Display the parameters of VPIN estimates - -show(estimate@parameters) - -# Store the computed data of the different buckets in a dataframe 'buckets'. -# Display the first 10 rows of the dataframe 'buckets'. - -buckets <- estimate@bucketdata -show(head(buckets, 10)) - -# Store the daily VPIN values (weighted and unweighted) in a dataframe -# 'dayvpin'. - -# Display the first 10 rows of the dataframe 'dayvpin'. - -dayvpin <- estimate@dailyvpin -show(head(dayvpin, 10)) - -} -\references{ -\insertAllCited -} diff --git a/man/vpin_measures.Rd b/man/vpin_measures.Rd new file mode 100644 index 0000000..fa55e00 --- /dev/null +++ b/man/vpin_measures.Rd @@ -0,0 +1,186 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/model_vpin.R +\name{vpin_measures} +\alias{vpin_measures} +\alias{vpin} +\alias{ivpin} +\title{Estimation of Volume-Synchronized PIN model (vpin) and the improved +volume-synchronized PIN model (ivpin)} +\usage{ +vpin( + data, + timebarsize = 60, + buckets = 50, + samplength = 50, + tradinghours = 24, + verbose = TRUE +) + +ivpin( + data, + timebarsize = 60, + buckets = 50, + samplength = 50, + tradinghours = 24, + grid_size = 5, + verbose = TRUE +) +} +\arguments{ +\item{data}{A dataframe with 3 variables: +\code{{timestamp, price, volume}}.} + +\item{timebarsize}{An integer referring to the size of timebars +in seconds. The default value is \code{60}.} + +\item{buckets}{An integer referring to the number of buckets in a +daily average volume. The default value is \code{50}.} + +\item{samplength}{An integer referring to the sample length +or the window size used to calculate the \code{VPIN} vector. 
+The default value is \code{50}.} + +\item{tradinghours}{An integer referring to the length of daily +trading sessions in hours. The default value is \code{24}.} + +\item{verbose}{A logical variable that determines whether detailed +information about the steps of the estimation of the VPIN (IVPIN) model is +displayed. No output is produced when \code{verbose} is set to \code{FALSE}. +The default value is \code{TRUE}.} + +\item{grid_size}{An integer between \code{1}, and \code{20}; +representing the size of the grid used in the estimation of IVPIN. The +default value is \code{5}. See more in details.} +} +\value{ +Returns an object of class \code{estimate.vpin}, which +contains the following slots: +\describe{ +\item{\code{@improved}}{ A logical variable that takes the value \code{FALSE} +when the classical VPIN model is estimated (using \code{vpin()}), and \code{TRUE} +when the improved VPIN model is estimated (using \code{ivpin()}).} +\item{\code{@bucketdata}}{ A data frame created as in +\insertCite{abad2012;textual}{PINstimation}.} +\item{\code{@vpin}}{ A vector of VPIN values.} +\item{\code{@ivpin}}{ A vector of IVPIN values, which remains empty when +the function \code{vpin()} is called.} +} +} +\description{ +Estimates the Volume-Synchronized Probability of Informed +Trading as developed in \insertCite{Easley2011;textual}{PINstimation} +and \insertCite{Easley2012;textual}{PINstimation}. \cr +Estimates the improved Volume-Synchronized Probability of Informed +Trading as developed in \insertCite{ke2017improved;textual}{PINstimation}. +} +\details{ +The dataframe data should contain at least three variables. Only the +first three variables will be considered and in the following order +\code{{timestamp, price, volume}}. + +The argument \code{timebarsize} is in seconds enabling the user to implement +shorter than \code{1} minute intervals. The default value is set to \code{1} minute +(\code{60} seconds) following Easley et al. (2011, 2012). 
+ +The argument \code{tradinghours} is used to correct the duration per +bucket if the market trading session does not cover a full day \code{(24 hours)}. +The duration of a given bucket is the difference between the +timestamp of the last trade \code{endtime} and the timestamp of the first trade +\code{stime} in the bucket. If the first and last trades in a bucket occur +on different days, and the market trading session is shorter than +\verb{24 hours}, the bucket's duration will be inflated. For example, if the daily +trading session is 8 hours \code{(tradinghours = 8)}, and the start time of a +bucket is \code{2018-10-12 17:06:40} and its end time is +\code{2018-10-13 09:36:00}, the straightforward calculation gives a duration +of \code{59,360 secs}. However, this duration includes 16 hours when the +market is closed. The corrected duration considers only the market activity +time: \code{duration = 59,360 - 16 * 3600 = 1,760 secs}, approximately +\verb{30 minutes}. + +The argument \code{grid_size} determines the size of the grid for the variables +\code{alpha} and \code{delta}, used to generate the initial parameter sets +that prime the maximum-likelihood estimation step of the +algorithm by \insertCite{ke2017improved;textual}{PINstimation} for estimating +\code{IVPIN}. If \code{grid_size} is set to a value \code{m}, the algorithm creates a +sequence starting from \code{1 / (2m)} and ending at \code{1 - 1 / (2m)}, with a +step of \code{1 / m}. The default value of \code{5} corresponds to the grid size used by +\insertCite{Yan2012;textual}{PINstimation}, where the sequence starts at +\code{0.1 = 1 / (2 * 5)} and ends at \code{0.9 = 1 - 1 / (2 * 5)} +with a step of \code{0.2 = 1 / 5}. Increasing the value of \code{grid_size} +increases the running time and may marginally improve the accuracy of the +IVPIN estimates. +} +\examples{ +# The package includes a preloaded dataset called 'hfdata'.
+# This dataset is an artificially created high-frequency trading data +# containing 100,000 trades and five variables: 'timestamp', 'price', +# 'volume', 'bid', and 'ask'. For more information, type ?hfdata. + +xdata <- hfdata + +### Estimation of the VPIN model ### + +# Estimate the VPIN model using the following parameters: +# - timebarsize: 5 minutes (300 seconds) +# - buckets: 50 buckets per average daily volume +# - samplength: 250 for the VPIN calculation + +estimate <- vpin(xdata, timebarsize = 300, buckets = 50, samplength = 250) + +# Display a description of the VPIN estimate + +show(estimate) + +# Display the parameters of the VPIN estimates + +show(estimate@parameters) + +# Display the summary statistics of the VPIN vector +summary(estimate@vpin) + +# Store the computed data of the different buckets in a dataframe 'buckets' +# and display the first 10 rows of the dataframe. + +buckets <- estimate@bucketdata +show(head(buckets, 10)) + +# Display the first 10 rows of the dataframe 'dayvpin'. +dayvpin <- estimate@dailyvpin +show(head(dayvpin, 10)) + + +### Estimation of the IVPIN model ### + +# Estimate the IVPIN model using the same parameters as above. +# The grid_size parameter is unspecified and will default to 5. + +iestimate <- ivpin(xdata, timebarsize = 300, samplength = 250, verbose = FALSE) + +# Display the summary statistics of the IVPIN vector +summary(iestimate@ivpin) + +# The output of ivpin() also contains the VPIN vector in the @vpin slot. +# Plot the VPIN and IVPIN vectors in the same plot using the iestimate object. + +# Define the range for the VPIN and IVPIN vectors, removing NAs. 
+vpin_range <- range(c(iestimate@vpin, iestimate@ivpin), na.rm = TRUE) + +# Plot the VPIN vector in blue +plot(iestimate@vpin, type = "l", col = "blue", ylim = vpin_range, + ylab = "Value", xlab = "Bucket", main = "Plot of VPIN and IVPIN") + +# Add the IVPIN vector in red +lines(iestimate@ivpin, type = "l", col = "red") + +# Add a legend to the plot +legend("topright", legend = c("VPIN", "IVPIN"), col = c("blue", "red"), + lty = 1, + cex = 0.6, # Adjust the text size + x.intersp = 1.2, # Adjust the horizontal spacing + y.intersp = 2, # Adjust the vertical spacing + inset = c(0.05, 0.05)) # Adjust the position slightly + +} +\references{ +\insertAllCited +} diff --git a/vignettes/PINstimation.rmd b/vignettes/PINstimation.rmd index f3dab5d..c6c896f 100644 --- a/vignettes/PINstimation.rmd +++ b/vignettes/PINstimation.rmd @@ -43,6 +43,7 @@ Loading the package ```{r installation, results = 'hide', message=FALSE, warning=FALSE} library(PINstimation) +library(dplyr) ```