Skip to content

Commit

Permalink
Add code to retrain our model in DeepG4 function and comments in READ…
Browse files Browse the repository at this point in the history
…ME for using it.
  • Loading branch information
rochevin committed Jul 2, 2020
1 parent cdd9b34 commit 146b020
Show file tree
Hide file tree
Showing 8 changed files with 608 additions and 20 deletions.
1 change: 1 addition & 0 deletions .Rbuildignore
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
^.*\.Rproj$
^\.Rproj\.user$
^README\.Rmd$
2 changes: 1 addition & 1 deletion R/DNAToNumerical.R
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
#' @param seq.size Set the sequence maximal length value authorized by our model (default to 201).
#'
#' @return An array of dimension \code{nrow(x),ncol(x),length(tabv)}
#'
#' @export
#' @examples
#' x <- Biostrings::DNAStringSet(c("ACGT"))
#' x_onehot <- DNAToNumerical(x)
Expand Down
129 changes: 110 additions & 19 deletions R/DeepG4.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,34 @@
DeepG4 <- function(X = NULL,Y=NULL,model = NULL,tabv = c("N"=5,"T"=4,"G"=3,"C"=2,"A"=1),lower.case=F,seq.size = 201,treshold = 0.5){
#' DeepG4 main function to predict a probability to form a G4, given a DNA sequence.
#'
#' @param X a character, a list or a DNAStringSet/DNAStringSetList of DNA sequences.
#' @param Y a numeric vector of 1 and 0 values (default to NULL).
#' @param model a path to a keras model in hdf5 format (default to NULL). Don't change it unless you want to use our function with a custom model.
#' @param tabv a named vector of numeric values representing the DNA to numerical conversion. Don't change it unless you want to use our function with a custom model.
#' @param lower.case boolean. Set to \code{TRUE} if elements of X are in lower case (default to FALSE).
#' @param seq.size numeric value representing the sequence size accepted by our model. Don't change it unless you want to use our function with a custom model.
#' @param treshold numeric value that defines the threshold used to build the confusion matrix (default to 0.5).
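#' @param retrain boolean. Set to \code{TRUE} if you want to retrain the model with your own dataset; \code{Y} must be provided in that case (default to FALSE).
#' @param retrain.path path of the file where the retrained model will be saved. If left empty, the model is saved as \code{DeepG4_retrained_<current date>.hdf5}.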
#' @details
#' This function is a wrapper to help people to get a prediction given any DNA sequence(s) of type ACGTN with our DeepG4 model.
#' You don't have to use it to get a DeepG4 prediction: if you're familiar with keras and tensorflow, you can access our model in hdf5 format using \code{system.file("extdata", "model.hdf5", package = "DeepG4")}.
#' In complement, \code{\link{DNAToNumerical}} can help you to get the one-hot conversion needed by our model as input.
#' Sequences longer than \code{seq.size} will be cropped, and sequences shorter than \code{seq.size} will be filled with zero padding.
#' @return if \code{Y = NULL}, return DeepG4 prediction for each value of X.
#' if \code{Y} is provided, return a list with list(prediction for each value of X,a ggplot2 object representing AUC,a ggplot2 object representing confusion matrix,some metrics)
#' @export
#'
#' @examples
#' library(Biostrings)
#' library(DeepG4)
#'
#' sequences <- system.file("extdata", "test_G4_data.fa", package = "DeepG4")
#' sequences <- readDNAStringSet(sequences)
#'
#' predictions <- DeepG4(sequences)
#' head(predictions)
DeepG4 <- function(X = NULL,Y=NULL,model = NULL,tabv = c("N"=5,"T"=4,"G"=3,"C"=2,"A"=1),lower.case=F,seq.size = 201,treshold = 0.5,retrain=FALSE,retrain.path=""){
model.regular.size.accepted <- 201
#Check if X is provided
if (is.null(X)) {
stop("X must be provided (see ?DeepG4 for accepted formats).",
Expand All @@ -15,7 +45,8 @@ DeepG4 <- function(X = NULL,Y=NULL,model = NULL,tabv = c("N"=5,"T"=4,"G"=3,"C"=2
}
# Check sequences and convert into one-hot
## Check model class and convert into DNAStringSet object if needed
if(!class(X) %in%c("DNAStringSet","DNAStringSetList")){

if(!class(X)[[1]] %in%c("DNAStringSet","DNAStringSetList")){
if(class(X) == "character"){
X <- Biostrings::DNAStringSet(X)
}else if(class(X) == "list"){
Expand All @@ -24,9 +55,9 @@ DeepG4 <- function(X = NULL,Y=NULL,model = NULL,tabv = c("N"=5,"T"=4,"G"=3,"C"=2
stop("X must be a character, a list or a DNAStringSet/DNAStringSetList class",
call. = FALSE)
}
}else if(class(X) =="DNAStringSetList"){
}else if(class(X)[[1]] =="DNAStringSetList"){
X <- unlist(Biostrings::DNAStringSetList(X))
}else{
}else if(!class(X)[[1]] =="DNAStringSet"){
stop("X must be a character, a list or a DNAStringSet/DNAStringSetList",
call. = FALSE)
}
Expand All @@ -48,27 +79,84 @@ DeepG4 <- function(X = NULL,Y=NULL,model = NULL,tabv = c("N"=5,"T"=4,"G"=3,"C"=2
})
X <- array(unlist(X_by_size), dim = c(length(X),seq.size,length(tabv)))
}
# Try to load our saved model or custom model if !is.null(model)
message("Loading model...")
if(is.null(model)){
model <- system.file("extdata", "model.hdf5", package = "DeepG4")
}else{
if(class(model) != "character"){
stop("model must be a path to a keras model in hdf5 format",
if(retrain){
# IF RETRAIN = TRUE
message("retrain == TRUE")
message("Model will be retrain using user input...")
# Check Y
if(is.null(Y)){
stop("Y must be set if you want retrain our model.",
call. = FALSE)
}
if(class(Y) != "numeric"){
stop("Y must be a numeric vector of 1 and 0 values",
call. = FALSE)
}
if(FALSE %in% (unique(Y) %in% c(0,1))){
stop("Y must be a numeric vector of 1 and 0 values",
call. = FALSE)
}
# Build the model
# Try to load our saved model or custom model if !is.null(model)
message("Loading model...")
if(is.null(model)){
model <- system.file("extdata", "model.hdf5", package = "DeepG4")
}else{
if(class(model) != "character"){
stop("model must be a path to a keras model in hdf5 format",
call. = FALSE)
}
}
#Load model with keras (tensorflow must be installed as well)
model <- keras::load_model_hdf5(model)
model <- keras::from_config(get_config(model))
keras::compile(model,
optimizer = 'rmsprop',
loss = 'binary_crossentropy',
metrics = list('accuracy')
)
# Retrain the model
history <- keras::fit(model,
X,
Y,
epochs = 20,
batch_size = 128,
validation_split = 0.2,
verbose= 1)
res <- stats::predict(model,X)
if(retrain.path == ""){
retrain.path <- paste0("DeepG4_retrained_",Sys.Date(),".hdf5")
}
keras::save_model_hdf5(model,retrain.path)
}else{
#IF RETRAIN = FALSE
# Try to load our saved model or custom model if !is.null(model)
message("Loading model...")
if(is.null(model)){
model <- system.file("extdata", "model.hdf5", package = "DeepG4")
if(seq.size != model.regular.size.accepted){
message("Please don't manually set seq.size unless you want to use a custom model")
seq.size <- model.regular.size.accepted
}
}else{
if(class(model) != "character"){
stop("model must be a path to a keras model in hdf5 format",
call. = FALSE)
}
}
#Load model with keras (tensorflow must be installed as well)
model <- keras::load_model_hdf5(model)
res <- stats::predict(model,X)
}
#Load model with keras (tensorflow must be installed as well)
model <- keras::load_model_hdf5(model)
res <- stats::predict(model,X)
return(res)
#If Y is provided, instead of returning prediction, return accuracy / AUC
if(!is.null(Y)){
# If Y is provided, instead of returning prediction, return accuracy / AUC
if(is.null(Y)){
return(res)
}else{
if(class(Y) != "numeric"){
stop("Y must be a numeric vector of 1 and 0 values",
call. = FALSE)
}
if(!unique(Y) %in% c(0,1)){
if(FALSE %in% (unique(Y) %in% c(0,1))){
stop("Y must be a numeric vector of 1 and 0 values",
call. = FALSE)
}
Expand All @@ -91,7 +179,9 @@ DeepG4 <- function(X = NULL,Y=NULL,model = NULL,tabv = c("N"=5,"T"=4,"G"=3,"C"=2
}
#Plot AUC
plot_ROC <- ggplot2::autoplot(yardstick::roc_curve(prediction_table,truth,pred_prob))
#Get metrics
table_metrics <- yardstick::metrics(prediction_table,truth,estimate)
#Plot confusion matrix
confusion_matrix <- as.data.frame(yardstick::conf_mat(prediction_table,truth, estimate)[[1]])

confusion_matrix <- ggplot2::ggplot(confusion_matrix,ggplot2::aes(Prediction, Truth, fill = Freq)) +
Expand All @@ -101,7 +191,8 @@ DeepG4 <- function(X = NULL,Y=NULL,model = NULL,tabv = c("N"=5,"T"=4,"G"=3,"C"=2
ggplot2::labs(
title = "Confusion matrix"
) + ggplot2::theme_minimal(base_size=18)
return(list(res,plot_ROC,confusion_matrix,table_metrics))
}

return(list(plot_ROC,confusion_matrix,table_metrics))

}
Empty file added R/RetrainModel.R
Empty file.
109 changes: 109 additions & 0 deletions README.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,109 @@
---
output: github_document
---

<!-- README.md is generated from README.Rmd. Please edit that file -->

```{r, include = FALSE}
knitr::opts_chunk$set(
collapse = TRUE,
comment = "#>",
fig.path = "man/figures/README-",
out.width = "100%"
)
```
# __DeepG4__: A deep learning approach to predict active G-quadruplexes

<!-- badges: start -->
<!-- badges: end -->

__DeepG4__ is a deep learning model built with keras+tensorflow that aims to predict the probability that DNA sequences form G-quadruplex secondary structures. __DeepG4__ is wrapped in an R package, but can work with any language that has an implementation of keras and tensorflow (see below).

## Abstract

DNA is a complex molecule carrying the instructions an organism needs to develop, live and reproduce. In 1953, Watson and Crick discovered that DNA is composed of two chains forming a double-helix. Later on, other structures of DNA were discovered and shown to play important roles in the cell, in particular G-quadruplex (G4). Following genome sequencing, several bioinformatic algorithms were developed to map G4s in vitro based on a canonical sequence motif, G-richness and G-skewness or alternatively sequence features including k-mers. Here, we propose instead a convolutional neural network (DeepG4) to map active G4s (forming both in vitro and in vivo). DeepG4 is very accurate to predict active G4s, while state-of-the-art algorithms fail. Moreover, DeepG4 identifies key DNA motifs that are predictive of G4 activity. We found that active G4 motifs do not follow a very flexible sequence pattern as current algorithms seek for. Instead, active G4s are determined by numerous specific motifs. Moreover, among those motifs, we identified known transcription factors which could play important roles in G4 activity by either directly contributing to G4 structure themselves or by participating in G4 formation in the vicinity. Lastly, variant analysis suggests that SNPs altering predicted G4 activity could affect transcription and chromatin, e.g. gene expression, H3K4me3 mark and DNA methylation. Thus, DeepG4 paves the way for future studies assessing the impact of known disease-associated variants on DNA secondary structure and provides a mechanistic interpretation of SNP impact on transcription and chromatin.

## Installation

You can install the development version from [GitHub](https://github.com/) with:

``` r
# install.packages("devtools")
devtools::install_github("morphos30/DeepG4")
```
## Usage with DeepG4 R package

```r
library(Biostrings)
library(DeepG4)

sequences <- system.file("extdata", "test_G4_data.fa", package = "DeepG4")
sequences <- readDNAStringSet(sequences)

predictions <- DeepG4(sequences)
head(predictions)
```

## Using our model directly with keras in R

Using our model directly with keras is very simple and the code is very similar, but you first have to convert your sequences into a one-hot encoding. Our function `DNAToNumerical` does this conversion for you.

```r

library(Biostrings)
library(DeepG4)
library(keras)

sequences <- system.file("extdata", "test_G4_data.fa", package = "DeepG4")
sequences <- readDNAStringSet(sequences)

model <- system.file("extdata", "model.hdf5", package = "DeepG4")
model <- load_model_hdf5(model)

sequences <- DNAToNumerical(sequences)

predictions <- predict(model,sequences)
```

## Using DeepG4 with a new active G4 dataset

If you want to use our model architecture but retrain it with your own dataset, you can do so by running our function `DeepG4` with `retrain = TRUE`:

```r

library(Biostrings)
library(DeepG4)
library(rsample)

# Read positive and negative sets of sequences
sequences.pos <- readDNAStringSet("Peaks_BG4_G4seq_HaCaT_GSE76688_hg19_201b.Fa")
sequences.ctrl <- readDNAStringSet("Peaks_BG4_G4seq_HaCaT_GSE76688_hg19_201b_Ctrl_gkmSVM.Fa")
sequences <- c(sequences.pos,sequences.ctrl)
# Generate classes
Y <- c(rep(1,length(sequences.pos)),rep(0,length(sequences.ctrl)))

```

It's a good idea to split your dataset in train/test to evaluate the model performance on the testing dataset.

```r
# Sample dataset and get test and train dataset
smp_size <- floor(0.70 * length(sequences))
train_ind <- sample(seq_len(length(sequences)), size = smp_size)
x.train <- sequences[train_ind]
x.test <- sequences[-train_ind]
y.train <- Y[train_ind]
y.test <- Y[-train_ind]
```
Then train your model on your training dataset :

```r
training <- DeepG4(x.train,y.train,retrain=TRUE,retrain.path = "DeepG4_retrained.hdf5")
```

You can now evaluate it with your testing dataset :

```r
predictions <- DeepG4(x.test,y.test,model = "DeepG4_retrained.hdf5")
predictions
```
Loading

0 comments on commit 146b020

Please sign in to comment.