From 32dbbbaa3aee863d7c9cf76a5aac88eb85170ade Mon Sep 17 00:00:00 2001 From: Kent Riemondy Date: Wed, 13 Mar 2024 13:06:29 -0600 Subject: [PATCH] use txdbmaker package for makeTxDb... functions --- DESCRIPTION | 5 +- R/filter_rse.R | 81 ++++++++++++----------- man/filter_clustered_variants.Rd | 39 +++++------ man/filter_splice_variants.Rd | 38 +++++------ tests/testthat/test_rse.R | 107 ++++++++++++++++--------------- vignettes/raer.Rmd | 2 +- 6 files changed, 141 insertions(+), 131 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e9e82ce..26fe55b 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,7 +1,7 @@ Package: raer Type: Package Title: RNA editing tools in R -Version: 1.1.4 +Version: 1.1.5 Authors@R: c( person("Kent", "Riemondy", , "kent.riemondy@gmail.com", role = c("aut", "cre"), comment = c(ORCID = "0000-0003-0750-1273")), @@ -55,7 +55,8 @@ Suggests: scuttle, AnnotationHub, covr, - raerdata + raerdata, + txdbmaker LinkingTo: Rhtslib SystemRequirements: diff --git a/R/filter_rse.R b/R/filter_rse.R index 877b5c8..667b3ac 100644 --- a/R/filter_rse.R +++ b/R/filter_rse.R @@ -106,25 +106,27 @@ get_splice_sites <- function(txdb, slop = 4) { #' @family se-filters #' #' @examples -#' library(GenomicFeatures) -#' rse_adar_ifn <- mock_rse() -#' -#' # mock up a txdb with genes -#' gr <- GRanges(c( -#' "DHFR:310-330:-", -#' "DHFR:410-415:-", -#' "SSR3:100-155:-", -#' "SSR3:180-190:-" -#' )) -#' gr$source <- "raer" -#' gr$type <- "exon" -#' gr$source <- NA -#' gr$phase <- NA_integer_ -#' gr$gene_id <- c(1, 1, 2, 2) -#' gr$transcript_id <- rep(c("1.1", "2.1"), each = 2) -#' txdb <- makeTxDbFromGRanges(gr) -#' -#' filter_splice_variants(rse_adar_ifn, txdb) +#' if(require("txdbmaker")) { +#' rse_adar_ifn <- mock_rse() +#' +#' # mock up a txdb with genes +#' gr <- GRanges(c( +#' "DHFR:310-330:-", +#' "DHFR:410-415:-", +#' "SSR3:100-155:-", +#' "SSR3:180-190:-" +#' )) +#' gr$source <- "raer" +#' gr$type <- "exon" +#' gr$source <- NA +#' gr$phase <- NA_integer_ +#' gr$gene_id <- c(1, 1, 2, 2) +#' gr$transcript_id <- rep(c("1.1", "2.1"), each = 2) +#' txdb <- txdbmaker::makeTxDbFromGRanges(gr) +#' +#' filter_splice_variants(rse_adar_ifn, txdb) +#' } +#' #' #' @returns `SummarizedExperiment::SummarizedExperiment` with sites #' adjacent to splice sites removed. @@ -183,27 +185,28 @@ filter_splice_variants <- function(rse, txdb, #' variants #' #' @examples -#' library(GenomicFeatures) -#' -#' rse_adar_ifn <- mock_rse() -#' rse <- rse_adar_ifn[seqnames(rse_adar_ifn) == "SPCS3"] -#' -#' # mock up a txdb with genes -#' gr <- GRanges(c( -#' "SPCS3:100-120:-", -#' "SPCS3:325-350:-" -#' )) -#' gr$source <- "raer" -#' gr$type <- "exon" -#' gr$source <- NA -#' gr$phase <- NA_integer_ -#' gr$gene_id <- c(1, 2) -#' gr$transcript_id <- c("1.1", "2.1") -#' txdb <- makeTxDbFromGRanges(gr) -#' -#' rse <- filter_multiallelic(rse) -#' filter_clustered_variants(rse, txdb, variant_dist = 10) +#' if(require("txdbmaker")){ +#' rse_adar_ifn <- mock_rse() +#' rse <- rse_adar_ifn[seqnames(rse_adar_ifn) == "SPCS3"] +#' +#' # mock up a txdb with genes +#' gr <- GRanges(c( +#' "SPCS3:100-120:-", +#' "SPCS3:325-350:-" +#' )) +#' gr$source <- "raer" +#' gr$type <- "exon" +#' gr$source <- NA +#' gr$phase <- NA_integer_ +#' gr$gene_id <- c(1, 2) +#' gr$transcript_id <- c("1.1", "2.1") +#' txdb <- txdbmaker::makeTxDbFromGRanges(gr) +#' +#' rse <- filter_multiallelic(rse) +#' filter_clustered_variants(rse, txdb, variant_dist = 10) #' +#' } +#' #' @family se-filters #' #' @return `SummarizedExperiment::SummarizedExperiment` with sites removed from diff --git a/man/filter_clustered_variants.Rd b/man/filter_clustered_variants.Rd index f3add80..bd81a4d 100644 --- a/man/filter_clustered_variants.Rd +++ b/man/filter_clustered_variants.Rd @@ -35,26 +35,27 @@ multiple allele types are present within a given distance in genomic or transcriptome coordinate space. } \examples{ -library(GenomicFeatures) +if(require("txdbmaker")){ + rse_adar_ifn <- mock_rse() + rse <- rse_adar_ifn[seqnames(rse_adar_ifn) == "SPCS3"] + + # mock up a txdb with genes + gr <- GRanges(c( + "SPCS3:100-120:-", + "SPCS3:325-350:-" + )) + gr$source <- "raer" + gr$type <- "exon" + gr$source <- NA + gr$phase <- NA_integer_ + gr$gene_id <- c(1, 2) + gr$transcript_id <- c("1.1", "2.1") + txdb <- txdbmaker::makeTxDbFromGRanges(gr) + + rse <- filter_multiallelic(rse) + filter_clustered_variants(rse, txdb, variant_dist = 10) -rse_adar_ifn <- mock_rse() -rse <- rse_adar_ifn[seqnames(rse_adar_ifn) == "SPCS3"] - -# mock up a txdb with genes -gr <- GRanges(c( - "SPCS3:100-120:-", - "SPCS3:325-350:-" -)) -gr$source <- "raer" -gr$type <- "exon" -gr$source <- NA -gr$phase <- NA_integer_ -gr$gene_id <- c(1, 2) -gr$transcript_id <- c("1.1", "2.1") -txdb <- makeTxDbFromGRanges(gr) - -rse <- filter_multiallelic(rse) -filter_clustered_variants(rse, txdb, variant_dist = 10) +} } \seealso{ diff --git a/man/filter_splice_variants.Rd b/man/filter_splice_variants.Rd index 33b5a4b..c0a8b99 100644 --- a/man/filter_splice_variants.Rd +++ b/man/filter_splice_variants.Rd @@ -25,25 +25,27 @@ Remove editing sites found in regions proximal to annotated splice junctions. } \examples{ -library(GenomicFeatures) -rse_adar_ifn <- mock_rse() - -# mock up a txdb with genes -gr <- GRanges(c( - "DHFR:310-330:-", - "DHFR:410-415:-", - "SSR3:100-155:-", - "SSR3:180-190:-" -)) -gr$source <- "raer" -gr$type <- "exon" -gr$source <- NA -gr$phase <- NA_integer_ -gr$gene_id <- c(1, 1, 2, 2) -gr$transcript_id <- rep(c("1.1", "2.1"), each = 2) -txdb <- makeTxDbFromGRanges(gr) +if(require("txdbmaker")) { + rse_adar_ifn <- mock_rse() + + # mock up a txdb with genes + gr <- GRanges(c( + "DHFR:310-330:-", + "DHFR:410-415:-", + "SSR3:100-155:-", + "SSR3:180-190:-" + )) + gr$source <- "raer" + gr$type <- "exon" + gr$source <- NA + gr$phase <- NA_integer_ + gr$gene_id <- c(1, 1, 2, 2) + gr$transcript_id <- rep(c("1.1", "2.1"), each = 2) + txdb <- txdbmaker::makeTxDbFromGRanges(gr) + + filter_splice_variants(rse_adar_ifn, txdb) +} -filter_splice_variants(rse_adar_ifn, txdb) } \seealso{ diff --git a/tests/testthat/test_rse.R b/tests/testthat/test_rse.R index eafac33..d9c1db0 100644 --- a/tests/testthat/test_rse.R +++ b/tests/testthat/test_rse.R @@ -1,6 +1,5 @@ pkgs <- c( - "GenomicRanges", "GenomicFeatures", - "SummarizedExperiment", "rtracklayer" + "GenomicRanges", "SummarizedExperiment", "rtracklayer", "GenomicFeatures" ) msg <- lapply(pkgs, function(x) { @@ -16,7 +15,7 @@ sites <- import(bedfn) res <- pileup_sites(bamfn, fafn) test_that("annot_snps works", { - if (require(SNPlocs.Hsapiens.dbSNP144.GRCh38)) { + if (require("SNPlocs.Hsapiens.dbSNP144.GRCh38")) { gr <- GRanges(rep("22", 10), IRanges( seq(10510077, @@ -43,7 +42,7 @@ test_that("annot_snps works", { expect_true(is(res, "RangedSummarizedExperiment")) expect_true("RefSNP_id" %in% colnames(mcols(res))) - if (require(BSgenome.Hsapiens.NCBI.GRCh38)) { + if (require("BSgenome.Hsapiens.NCBI.GRCh38")) { res <- annot_snps(se, dbsnp = SNPlocs.Hsapiens.dbSNP144.GRCh38, genome = BSgenome.Hsapiens.NCBI.GRCh38 @@ -121,62 +120,66 @@ mock_txdb <- function() { gr$phase <- NA_integer_ gr$gene_id <- c(1, 1, 2, 2) gr$transcript_id <- rep(c("1.1", "2.1"), each = 2) - makeTxDbFromGRanges(gr) + txdbmaker::makeTxDbFromGRanges(gr) } test_that("get_splice_sites and filtering works", { - txdb <- mock_txdb() - spl_sites <- get_splice_sites(txdb, - slop = 3 - ) - expect_true(all(width(spl_sites) == 6)) - expect_error(get_splice_sites(spl_sites)) - - rse_adar_ifn <- mock_rse() - expect_message(rse <- filter_splice_variants(rse_adar_ifn, txdb)) - n_removed <- nrow(subsetByOverlaps(rse_adar_ifn, rse, invert = TRUE)) - expect_equal(n_removed, 5L) - expect_true("site_DHFR_328_2" %in% rownames(rse_adar_ifn)) - expect_false("site_DHFR_328_2" %in% rownames(rse)) + if("txdbmaker" %in% rownames(installed.packages())) { + txdb <- mock_txdb() + spl_sites <- get_splice_sites(txdb, + slop = 3 + ) + expect_true(all(width(spl_sites) == 6)) + expect_error(get_splice_sites(spl_sites)) + + rse_adar_ifn <- mock_rse() + expect_message(rse <- filter_splice_variants(rse_adar_ifn, txdb)) + n_removed <- nrow(subsetByOverlaps(rse_adar_ifn, rse, invert = TRUE)) + expect_equal(n_removed, 5L) + expect_true("site_DHFR_328_2" %in% rownames(rse_adar_ifn)) + expect_false("site_DHFR_328_2" %in% rownames(rse)) + } }) test_that("removing clustered variants works", { - txdb <- mock_txdb() - - rse_adar_ifn <- mock_rse() - expect_message(rse <- filter_multiallelic(rse_adar_ifn)) - - gr <- rowRanges(rse) - nd <- distanceToNearest(gr) - gr1 <- gr[mcols(nd)$distance > 100] - expect_warning(expect_warning(gr2 <- filter_clustered_variants(rse, txdb))) - expect_true(identical(rowRanges(gr2), gr1)) - - # check that transcript based removal works - # DHFR_328 -> DHFR_423 overlaps 310-330 -> 410-430 splice - ex <- exons(txdb) - rse_ex <- subsetByOverlaps(rse, ex) - - expect_warning( - gr2 <- filter_clustered_variants(rse_ex, - txdb, - regions = "transcript", - variant_dist = 20 + if("txdbmaker" %in% rownames(installed.packages())) { + txdb <- mock_txdb() + + rse_adar_ifn <- mock_rse() + expect_message(rse <- filter_multiallelic(rse_adar_ifn)) + + gr <- rowRanges(rse) + nd <- distanceToNearest(gr) + gr1 <- gr[mcols(nd)$distance > 100] + expect_warning(expect_warning(gr2 <- filter_clustered_variants(rse, txdb))) + expect_true(identical(rowRanges(gr2), gr1)) + + # check that transcript based removal works + # DHFR_328 -> DHFR_423 overlaps 310-330 -> 410-430 splice + ex <- exons(txdb) + rse_ex <- subsetByOverlaps(rse, ex) + + expect_warning( + gr2 <- filter_clustered_variants(rse_ex, + txdb, + regions = "transcript", + variant_dist = 20 + ) ) - ) - - expt <- rowRanges(subsetByOverlaps(rse_ex, gr2, invert = T)) - ds <- distanceToNearest(mapToTranscripts(expt, txdb, - extractor.fun = exonsBy - )) - expect_true(all(mcols(ds)$distance < 20)) - - expect_warning( - gr2 <- filter_clustered_variants(rse_ex, txdb, - variant_dist = 25 + + expt <- rowRanges(subsetByOverlaps(rse_ex, gr2, invert = T)) + ds <- distanceToNearest(mapToTranscripts(expt, txdb, + extractor.fun = exonsBy + )) + expect_true(all(mcols(ds)$distance < 20)) + + expect_warning( + gr2 <- filter_clustered_variants(rse_ex, txdb, + variant_dist = 25 + ) ) - ) - expect_equal(length(gr2), 3L) + expect_equal(length(gr2), 3L) + } }) test_that("calc_confidence works", { diff --git a/vignettes/raer.Rmd b/vignettes/raer.Rmd index 6e35d6e..38b4390 100644 --- a/vignettes/raer.Rmd +++ b/vignettes/raer.Rmd @@ -576,7 +576,7 @@ region of chr4. - `TxDb.Hsapiens.UCSC.hg38.knownGene`, a database of transcript models. Alternatively these can be generated from a `.gtf` file using -`makeTxDbFromGRanges()` from the `GenomicFeatures` package. +`makeTxDbFromGRanges()` from the `txdbmaker` package. - RepeatMasker annotations, which can be obtained from the `AnnotationHub()` for hg38, as shown in the [bulk RNA-seq](#bulk) tutorial.