-
Notifications
You must be signed in to change notification settings - Fork 30
/
Copy pathunivRegLog.R
115 lines (99 loc) · 3.87 KB
/
univRegLog.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
################################################################################
#' Column-wise logistic regression
#'
#' Slopes of column-wise logistic regressions of each column
#' of a `big.matrix`, with some other associated statistics.
#' Covariates can be added to correct for confounders.
#'
#' @inheritParams bigstatsr-package
#'
#' @param tol Relative tolerance to assess convergence of the coefficient.
#' Default is `1e-8`.
#' @param maxiter Maximum number of iterations before giving up.
#' Default is `20`. Usually, convergence is reached within 3 or 4 iterations.
#' If there is not convergence,
#' [glm][stats::glm] is used instead for the corresponding column.
#'
#' @return A data.frame with as elements:
#' 1. the slopes of each regression,
#' 2. the standard errors of each slope,
#' 3. the number of iteration for each slope. If is `NA`, this means that the
#' algorithm didn't converge, and [glm][stats::glm] was used instead.
#' 4. the z-scores associated with each slope,
#' 5. the p-values associated with each z-score.
#' @example examples/example-univLogReg.R
#' @seealso [glm][stats::glm]
#' @export
#' @import foreach
big_univLogReg <- function(X, y01.train, ind.train = seq(nrow(X)),
covar.train = NULL, ncores2 = 1,
tol = 1e-8, maxiter = 20) {
check_X(X, ncores2 = ncores2)
stopifnot(sort(unique(y01.train)) == 0:1)
is.seq <- (ncores2 == 1)
if (!is.seq) X.desc <- describe(X)
n <- length(ind.train)
stopifnot(n == length(y01.train))
if (is.null(covar.train)) {
covar.train <- cbind(rep(0, n), rep(1, n))
} else {
covar.train <- cbind(0, 1, covar.train)
}
stopifnot(n == nrow(covar.train))
# no intercept because already in covar.train
mod0 <- stats::glm(y01.train ~ covar.train[, -1] - 1,
family = "binomial")
p0 <- mod0$fitted
w0 <- p0 * (1 - p0)
z0 <- log(p0 / (1 - p0)) + (y01.train - p0) / w0
rm(mod0, p0)
range.parts <- CutBySize(ncol(X), nb = ncores2)
if (is.seq) {
registerDoSEQ()
} else {
cl <- parallel::makeCluster(ncores2)
doParallel::registerDoParallel(cl)
}
res.all <- foreach(ic = seq_len(ncores2), .combine = 'cbind') %dopar% {
lims <- range.parts[ic, ]
if (is.seq) {
X.part <- X
} else {
X.part <- sub.big.matrix(X.desc, firstCol = lims[1], lastCol = lims[2])
}
# https://www.r-bloggers.com/too-much-parallelism-is-as-bad/
multi <- !is.seq && detect_MRO()
if (multi) nthreads.save <- RevoUtilsMath::setMKLthreads(1)
res <- IRLS(X.part@address, covar.train, y01.train, z0, w0,
ind.train, tol, maxiter)
if (multi) RevoUtilsMath::setMKLthreads(nthreads.save)
indNoConv <- which(res$conv >= maxiter)
if ((l <- length(indNoConv)) > 0) {
printf(paste("For %d columns, IRLS has not converged",
"using glm for those instead.\n", sep = "; "), l)
for (j in indNoConv) {
mod <- stats::glm(y01.train ~ X.part[ind.train, j] +
covar.train[, -1] - 1,
family = "binomial",
control = list(epsilon = tol, maxit = 100))
if (mod$converged) {
coeffs <- summary(mod)$coefficients
} else {
coeffs <- c(NA, NA)
}
res$betas[j] <- coeffs[1]
res$std[j] <- coeffs[2]
}
res$conv[indNoConv] <- NA
}
rbind(res$betas, res$std, res$conv)
}
if (!is.seq) parallel::stopCluster(cl)
if (nbNA <- sum(is.na(res.all[1, ])))
warning(sprintf("For %d columns, glm has not converged either.", nbNA))
z.scores <- res.all[1, ] / res.all[2, ]
p.values <- 2 * stats::pnorm(abs(z.scores), lower.tail = FALSE)
data.frame(estim = res.all[1, ], std.err = res.all[2, ], niter = res.all[3, ],
z.score = z.scores, p.value = p.values)
}
################################################################################