Skip to content

Commit

Permalink
add new function for atac-seq + testing and comments
Browse files Browse the repository at this point in the history
  • Loading branch information
rochevin committed Aug 20, 2021
1 parent 789391c commit 1a7d8f6
Show file tree
Hide file tree
Showing 102 changed files with 40,053 additions and 37,997 deletions.
11 changes: 6 additions & 5 deletions R/DeepG4.R
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,10 @@
#' head(predictions)
DeepG4 <- function(X = NULL,X.atac = NULL,Y=NULL,model=NULL,lower.case=F,treshold = 0.5,seq.size = 201,retrain=FALSE,retrain.path="",log_odds=F){
model.regular.size.accepted <- 201
if(seq.size != model.regular.size.accepted & retrain != TRUE){
message("Please don't manually set seq.size unless you want to use a custom model")
seq.size <- model.regular.size.accepted
}
tabv = c("T"=4,"G"=3,"C"=2,"A"=1)
#Check if X is provided
if (is.null(X)) {
Expand Down Expand Up @@ -81,7 +85,7 @@ DeepG4 <- function(X = NULL,X.atac = NULL,Y=NULL,model=NULL,lower.case=F,treshol
}
}
default_model <- ifelse(is.null(X.atac),system.file("extdata", "DeepG4_classic_rescale_BW_sampling_02_03_2021/2021-03-02T16-17-28Z/best_model.h5", package = "DeepG4"),
system.file("extdata", "DeepG4_ATAC_rescale_BW_sampling_02_03_2021/2021-03-02T16-01-34Z/best_model.h5", package = "DeepG4"))
system.file("extdata", "DeepG4_ATAC_rescale_BW_by_bg_5kb_seuil_2_19_04_2021/2021-07-06T07-59-11Z/best_model.h5", package = "DeepG4"))
log_odds_index <- ifelse(is.null(X.atac),7,9)
## Check sequences sizes
message("Check sequences sizes...")
Expand Down Expand Up @@ -183,10 +187,7 @@ DeepG4 <- function(X = NULL,X.atac = NULL,Y=NULL,model=NULL,lower.case=F,treshol
message("Loading model...")
if(is.null(model)){
model <- default_model
if(seq.size != model.regular.size.accepted){
message("Please don't manually set seq.size unless you want to use a custom model")
seq.size <- model.regular.size.accepted
}

}else{
if(class(model) != "character"){
stop("model must be a path to a keras model in hdf5 format",
Expand Down
14 changes: 14 additions & 0 deletions R/DeepG4InputFromBED.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,17 @@
#' Pre-Computing function of DeepG4. To use before DeepG4 main function
#'
#' @param BED An object of class GRanges.
#' @param ATAC A character path of bigWig/bedGraph file, or an object of class GRanges/SimpleRleList.
#' @param is.bw a boolean. Set to \code{TRUE} if you want to use rtracklayer::import.bw fonction, \code{FALSE} use rtracklayer::import.bedGraph.
#' @param GENOME a BSgenome object containing the DNA sequence of genome of interest.
#' @param use.bg a boolean. Set to \code{TRUE} you want to normalize the accessibility using a windows background of windows_bg.
#' @param windows_bg numeric value who define the windows use to get background signal.
#' @param treshold_bg numeric value who set the treshold signal/background.
#'
#' @return a list(DNAStringSet,vector) with fasta sequence from GENOME of BED, and vector of accessibility of BED.
#' @export
#'
#' @examples
DeepG4InputFromBED <- function(BED = NULL,ATAC = NULL,is.bw = TRUE,GENOME = NULL,use.bg = TRUE,windows_bg=5000,treshold_bg = 2){
if (is.null(GENOME)) {
stop("GENOME must be provided (see ?DeePG4InputFromBED for accepted formats).",
Expand Down
127 changes: 118 additions & 9 deletions R/DeepG4Scan.R
Original file line number Diff line number Diff line change
Expand Up @@ -4,18 +4,11 @@
#' @param k size of the sliding windows.
#' @param treshold numeric value who define the treshold to use to consider a sequence asc ontaining an active G4.
#' @param threads numeric value who define the number of threads used in DeepG4Scan (Generate sub sequences)
#' @param lower.case a boolean. Set to \code{TRUE} if elements of X are in lower case (default to FALSE).
#'
#' @return a data.frame with the position of potential active G4 across input sequences.
#' @export
#'
#' @examples
DeepG4Scan <- function(X=NULL,k=20,treshold = 0.5,threads = 1){
.callDeepG4Scan <- function(X=NULL,k=20,treshold = 0.5,threads = 1,lower.case=F){
seq.size = 201
#Check if X is provided
if (is.null(X)) {
stop("X must be provided (see ?DeepG4 for accepted formats).",
call. = FALSE)
}
# Packages check
if (!requireNamespace("keras", quietly = TRUE)) {
stop("Package \"keras\" needed for this function to work. Please install it.",
Expand Down Expand Up @@ -84,4 +77,120 @@ DeepG4Scan <- function(X=NULL,k=20,treshold = 0.5,threads = 1){
return(results)
}

#' Scanning of potential active G4 on a sequence or a list of sequences using a sliding window of size k.
#'
#' @param X An object of class GRanges.
#' @param X.ATAC An object of class GRanges containing DNA accessibility.
#' @param k size of the sliding windows.
#' @param treshold numeric value who define the treshold to use to consider a sequence asc ontaining an active G4.
#' @param threads numeric value who define the number of threads used in DeepG4Scan (Generate sub sequences)
#' @param lower.case a boolean. Set to \code{TRUE} if elements of X are in lower case (default to FALSE).
#' @param GENOME a BSgenome object containing the DNA sequence of genome of interest.
#' @param use.bg a boolean. Set to \code{TRUE} you want to normalize the accessibility using a windows background of windows_bg.
#' @param windows_bg numeric value who define the windows use to get background signal.
#' @param treshold_bg numeric value who set the treshold signal/background.
#'
#' @return a data.frame with the position of potential active G4 across input sequences.
#' @examples
.callDeepG4ScanATAC <- function(X=NULL,X.ATAC=NULL,k=20,treshold = 0.5,threads = 1,lower.case=F,GENOME = NULL,use.bg=T,windows_bg=5000,treshold_bg = 2){
seq.size = 201
# Packages check
if (!requireNamespace("keras", quietly = TRUE)) {
stop("Package \"keras\" needed for this function to work. Please install it.",
call. = FALSE)
}
if (!requireNamespace("Biostrings", quietly = TRUE)) {
stop("Package \"Biostrings\" needed for this function to work. Please install it.",
call. = FALSE)
}
if (!requireNamespace("GenomicRanges", quietly = TRUE)) {
stop("Package \"GenomicRanges\" needed for this function to work. Please install it.",
call. = FALSE)
}
if (is.null(GENOME)) {
stop("GENOME must be provided (see ?DeepG4Scan for accepted formats).",
call. = FALSE)
}
if (class(GENOME) != "BSgenome") {
stop("GENOME must be a BSgenome object.",
call. = FALSE)
}
if (is.null(X.ATAC)) {
stop("X.ATAC must be provided (see ?DeepG4Scan for accepted formats).",
call. = FALSE)
}
if(class(X.ATAC) == "SimpleRleList"){
X.ATAC <- as(X.ATAC,"GRanges")
}else if(class(X.ATAC) == "GRanges"){
if(!"score" %in% colnames(IRanges::values(X.ATAC))){
stop("X.ATAC must be a GRanges object with a metadata column 'score'",
call. = FALSE)
}
if(class(X.ATAC$score) != "numeric"){
stop("score column must be a numeric vector.",
call. = FALSE)
}
}else{
stop("X.ATAC must be a character (path for a file in BedGraph/BigWig format), or a SimpleRleList/GRanges class",
call. = FALSE)
}
seq.X <- unique(as.character(seqnames(X)))
seq.X.atac <- unique(as.character(seqnames(X.ATAC)))
X.in.atac <- !seq.X %in%seq.X.atac
if(any(X.in.atac)){
message(paste0("Warning: Some of your sequences have chromosome who's missing in the Accessibility file : ",paste(seq.X[X.in.atac],collapse=" "),", and will be croped."))
X <- X[!as.character(seqnames(X)) %in% seq.X[X.in.atac]]
}
binbed <- rtracklayer::import.bed(system.file("extdata", "random_region_for_scaling_min_max.bed", package = "DeepG4"))
message("Normalise accessibility ...")
X.ATAC <- NormBW(X.ATAC,binbed)
message(message("Extract sub-sequences / sub-accessibility using ",threads," threads..."))
results <- ExtractSubSequencefromBed(x=X,x.atac=X.ATAC,GENOME=GENOME,nb.threads=threads,use.bg=T)
message(paste0("From ",length(X)," positions to",nrow(results),"."))
X <- Biostrings::DNAStringSet(as.vector(results$seq))
x.atac <- results$score
message("Check sequences composition...")
resFreq <- Biostrings::letterFrequency(X,"N",as.prob = T)
testNFreq <- as.vector(resFreq>0.1)
if(any(testNFreq)){
message(paste0("Warning: Some of your sequences have a N frequency > 0.1 and will be removed.\nDeepG4 has difficulty to handle sequences with a N rate > 10%"))
X <- X[!testNFreq]
x.atac <- x.atac[!testNFreq]
if(length(X)<1){
stop("Not enough sequences to continue ...",
call. = FALSE)
}
}
## Predictions using DeepG4
message("prediction using DeepG4 main function...")
predictions <- DeepG4(X = X,X.atac = x.atac,lower.case = lower.case)
results <- results[!testNFreq,]
results$score <- predictions[,1]
results <- results[results$score>treshold,]
if(nrow(results)== 0){
stop(paste0("No sequences with a score <",treshold),
call. = FALSE)
}
return(results)
}


#' Standard method to call
#'
#' @param X One of character, list, DNAString, DNAStringSet, DNAStringSetList, GRanges
#' @param ...
#'
#' @return
#' @export
#'
#' @details
#' This function is a method who launch ?callDeepG4Scan or .callDeepG4ScanATAC based on the arguments who are passed on. See ?.callDeepG4ScanATAC or ?.callDeepG4Scan.

setGeneric("DeepG4Scan", function(X,...) standardGeneric("DeepG4Scan"))

setMethod("DeepG4Scan",c(X="character"), .callDeepG4Scan)
setMethod("DeepG4Scan",c(X="list"), .callDeepG4Scan)
setMethod("DeepG4Scan",c(X="DNAString"), .callDeepG4Scan)
setMethod("DeepG4Scan",c(X="DNAStringSet"), .callDeepG4Scan)
setMethod("DeepG4Scan",c(X="DNAStringSetList"), .callDeepG4Scan)
setMethod("DeepG4Scan",c(X="GRanges"), .callDeepG4ScanATAC)
2 changes: 1 addition & 1 deletion R/ExtractMotifFromModel.R
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ ExtractMotifFromModel <- function(X = NULL,Y=NULL,lower.case=F,top_kernel = 20,m
}
if(model.atac){
# IF TRUE, we use the model with accessibility and input will be differents
model <- system.file("extdata", "DeepG4_ATAC_rescale_BW_sampling_02_03_2021/2021-03-02T16-01-34Z/best_model.h5", package = "DeepG4")
model <- system.file("extdata", "DeepG4_ATAC_rescale_BW_by_bg_5kb_seuil_2_19_04_2021/2021-07-06T07-59-11Z/best_model.h5", package = "DeepG4")
#Load model with keras (tensorflow must be installed as well)
model <- keras::load_model_hdf5(model)
Convolution <- keras::keras_model(inputs = model$input[[1]],
Expand Down
13 changes: 5 additions & 8 deletions R/ExtractSubSequence.R
Original file line number Diff line number Diff line change
@@ -1,20 +1,17 @@
#' Title
#' ExtractSubSequencefromBed : Internal function for DeepG4Scan.
#'
#' @param x
#' @param k
#' @param seq.size
#' @param x An object of class GRanges.
#' @param k size of the sliding windows.
#' @param seq.size numeric value representing the sequence size accepted by our model. Don't change it unless you want to use our function with a custom model.
#'
#' @return
#'
#' @examples
#' @return A data.frame with position of subsequence and DNA sequence.
ExtractSubSequence <- function(x=NULL,k = 20,seq.size = 201){
extend <- ((seq.size-1)/2)
center <- seq(1,length(x),k)
start <- center - extend
end <- center + extend
Viewseq <- Biostrings::Views(x, start=start[start>0&end<=length(x)], end=end[start>0&end<=length(x)])
sequences <- Biostrings::DNAStringSet(Viewseq)

results <- cbind(as.data.frame(IRanges::ranges(Viewseq)),seq=as.character(Viewseq))
return(results)
}
46 changes: 46 additions & 0 deletions R/ExtractSubSequencefromBed.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,46 @@
#' ExtractSubSequencefromBed : Internal function for DeepG4Scan.
#'
#' @param x An object of class GRanges.
#' @param x.atac A SimpleRleList object containing DNA accessibility.
#' @param k size of the sliding windows.
#' @param GENOME a BSgenome object containing the DNA sequence of genome of interest.
#' @param seq.size numeric value representing the sequence size accepted by our model. Don't change it unless you want to use our function with a custom model.
#' @param nb.threads number of threads to use, default 1.
#' @param use.bg a boolean. Set to \code{TRUE} you want to normalize the accessibility using a windows background of windows_bg.
#' @param windows_bg numeric value who define the windows use to get background signal.
#' @param treshold_bg numeric value who set the treshold signal/background.
#'
#' @return A data.frame with position of subsequence, DNA sequence and accessibility.
#' @export
#'
#' @examples
ExtractSubSequencefromBed <- function(x=NULL,x.atac=NULL,k = 20,GENOME=NULL,seq.size = 201,nb.threads=1,use.bg=T,windows_bg=5000,treshold_bg = 2){
extend <- ((seq.size-1)/2)
windows = IRanges::slidingWindows(x, width=k,step = k)
res <- mclapply(1:length(windows),function(i){

swindows <- windows[[i]]
end(swindows) <- BiocGenerics::start(swindows) + extend
start(swindows) <- BiocGenerics::start(swindows) - extend

cov <- x.atac[[as.character(GenomeInfoDb::seqnames(x[i]))]]

score <- IRanges::Views(cov, start = BiocGenerics::start(swindows), end = BiocGenerics::end(swindows))
score <- rowMeans(as.matrix(score))
score[is.na(score)] <- 0

if(use.bg){
swindows.bg <- resize(swindows,windows_bg,fix="center")
score.bg <- IRanges::Views(cov, start = BiocGenerics::start(swindows.bg), end = BiocGenerics::end(swindows.bg))
score.bg <- rowMeans(as.matrix(score.bg))
score.bg[is.na(score.bg)] <- 0
my_test <- (score/score.bg)<treshold_bg
score[my_test] <- 0
}
results <- cbind(seqnames = as.character(seqnames(x[i])),as.data.frame(IRanges::ranges(swindows)),score=score,seq=as.character(Biostrings::getSeq(GENOME,swindows)))

return(results)
},mc.cores=nb.threads)

return(do.call("rbind",res))
}
6 changes: 3 additions & 3 deletions R/NormBW.R
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
#' Title
#' Normalize a coverage file (from bigWig or bedGraph) using scales::rescale from a specific set of ranges to set values between [0,1]
#'
#' @param x
#' @param binbed
#' @param x An object of class GRanges, the coverage file to be normalized
#' @param binbed An object of class GRanges, the genomic position used to normalize x.
#'
#' @return
#' @export
Expand Down
18 changes: 11 additions & 7 deletions R/getScoreBW.R
Original file line number Diff line number Diff line change
@@ -1,18 +1,22 @@
#' Title
#' getScoreBW extract DNA accessibility from WIG to BED positions.
#'
#' @param WIG
#' @param BED
#' @param WIG An object of class GRanges containing DNA accessibility.
#' @param BED An object of class GRanges.
#'
#' @return
#' @export
#' @return A SimpleRleList object containing DNA accessibility.
#' @export A numeric vector of DNA accessibility
#'
#' @examples
getScoreBW <- function (WIG, BED)
getScoreBW <- function (WIG, BED,forScan=F)
{
res <- do.call("rbind",lapply(split(BED, droplevels(GenomeInfoDb::seqnames(BED))), function(zz) {
cov <- WIG[[unique(as.character(GenomeInfoDb::seqnames(zz)))]]
score <- IRanges::Views(cov, start = BiocGenerics::start(zz), end = BiocGenerics::end(zz))
return(as.matrix(score))
}))
return(rowMeans(res))
if(forScan){
return(res)
}else{
return(rowMeans(res))
}
}
Loading

0 comments on commit 1a7d8f6

Please sign in to comment.