diff --git a/.Rbuildignore b/.Rbuildignore index 8f14f0f..9ac7e8e 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -5,3 +5,4 @@ ^inst/scripts/HCATonsilDataRNA$ ^inst/scripts/rna_raw_data$ ^inst/scripts/cite_raw_data$ +^\.github$ diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml new file mode 100644 index 0000000..71bcb52 --- /dev/null +++ b/.github/workflows/R-CMD-check.yaml @@ -0,0 +1,126 @@ +on: + push: + pull_request: + branches: + - devel + schedule: + - cron: '0 8 * * 5' + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + container: ${{ matrix.config.image }} + + name: ${{ matrix.config.os }} (${{ matrix.config.bioc }} - ${{ matrix.config.image }}) + + strategy: + fail-fast: false + matrix: + config: + - { os: windows-latest, bioc: 'devel'} + - { os: macOS-latest, bioc: 'devel', curlConfigPath: '/usr/bin/'} + - { os: ubuntu-latest, bioc: 'devel'} + - { os: ubuntu-latest, image: 'bioconductor/bioconductor_docker:devel'} + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + CRAN: ${{ matrix.config.cran }} + GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }} + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + CURL_CONFIG: ${{ matrix.config.curlConfigPath }}curl-config + + steps: + - name: Check out repo + uses: actions/checkout@v2 + + - name: Set up R and install BiocManager + uses: grimbough/bioc-actions/setup-bioc@v1 + if: matrix.config.image == null + with: + bioc-version: ${{ matrix.config.bioc }} + + - name: Set up pandoc + uses: r-lib/actions/setup-pandoc@v2 + if: matrix.config.image == null + + - name: Install remotes + run: | + install.packages('remotes') + shell: Rscript {0} + + - name: Query dependencies + run: | + saveRDS(remotes::dev_package_deps(dependencies = TRUE, repos = c(getOption('repos'), BiocManager::repositories())), 'depends.Rds', version = 2) + shell: Rscript {0} + + - name: Cache R packages + if: runner.os != 'Windows' && matrix.config.image == null + uses: actions/cache@v1 + with: + path: 
${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-bioc-${{ matrix.config.bioc }}-${{ hashFiles('depends.Rds') }} + restore-keys: ${{ runner.os }}-bioc-${{ matrix.config.bioc }}- + + - name: Install system dependencies + if: runner.os == 'Linux' + env: + RHUB_PLATFORM: linux-x86_64-ubuntu-gcc + run: | + Rscript -e "remotes::install_github('r-hub/sysreqs')" + sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))") + sudo -s eval "$sysreqs" + sudo apt-get update && sudo apt-get -y install libcurl4-openssl-dev libglpk-dev libharfbuzz-dev libfribidi-dev + + - name: Install system dependencies (macOS) + if: runner.os == 'macOS' + run: | + brew install cairo + brew install --cask xquartz + brew install harfbuzz + brew install fribidi + brew install fftw + + - name: Install dependencies + run: | + local_deps <- remotes::local_package_deps(dependencies = TRUE) + deps <- remotes::dev_package_deps(dependencies = TRUE, repos = BiocManager::repositories()) + BiocManager::install(local_deps[local_deps %in% deps$package[deps$diff != 0]], Ncpu = 2L) + remotes::install_cran('rcmdcheck', Ncpu = 2L) + shell: Rscript {0} + + - name: Session info + run: | + options(width = 100) + pkgs <- installed.packages()[, "Package"] + sessioninfo::session_info(pkgs, include_base = TRUE) + shell: Rscript {0} + + - name: Build, Install, Check + uses: grimbough/bioc-actions/build-install-check@v1 + + - name: Run BiocCheck + uses: grimbough/bioc-actions/run-BiocCheck@v1 + with: + arguments: '--no-check-bioc-views --no-check-bioc-help' + error-on: 'error' + + - name: Show testthat output + if: always() + run: find check -name 'testthat.Rout*' -exec cat '{}' \; || true + shell: bash + + - name: Upload check results + if: failure() + uses: actions/upload-artifact@master + with: + name: ${{ runner.os }}-bioc-${{ matrix.config.bioc }}-results + path: check + + - name: Test coverage + if: matrix.config.os == 'macOS-latest' + run: | + install.packages("covr") + covr::codecov(token = 
"${{secrets.CODECOV_TOKEN}}") + shell: Rscript {0} diff --git a/DESCRIPTION b/DESCRIPTION index f43d7fe..e254e37 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,6 +1,6 @@ Package: HCATonsilData Title: Provide programmatic access to the tonsil cell atlas datasets -Version: 0.99.3 +Version: 0.99.4 Authors@R: c(person("Ramon", "Massoni-Badosa", role = c("aut", "cre"), email ="ramonmassoni@gmail.com", comment = c(ORCID = "0000-0001-7115-8145")), @@ -39,6 +39,9 @@ Suggests: ggplot2, testthat (>= 3.0.0), scater, + Seurat, + Signac, + iSEE, ggspavis, kableExtra, BiocStyle diff --git a/README.md b/README.md index d351543..8eca778 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,9 @@ # HCATonsilData + +[![R-CMD-check](https://github.com/massonix/HCATonsilData/actions/workflows/R-CMD-check.yaml/badge.svg)](https://github.com/massonix/HCATonsilData/actions/workflows/R-CMD-check.yaml) + + HCATonsilData is an R/ExperimentHub package that provides easy access to single-cell RNA-seq (scRNA-seq), single-cell ATAC-seq (scATAC-seq), 10X Multiome, CITE-seq and spatial transcriptomics data (Visium) derived from the tonsil cell diff --git a/vignettes/HCATonsilData.Rmd b/vignettes/HCATonsilData.Rmd index 414f263..5c27562 100644 --- a/vignettes/HCATonsilData.Rmd +++ b/vignettes/HCATonsilData.Rmd @@ -216,11 +216,11 @@ interpretation in the Glossary (see below). # Assay types -HCATonsilData provides access to 5 main types of assays: RNA, ATAC, Multiome +HCATonsilData provides access to 5 main types of assays: RNA, ATAC, Multiome, CITE-seq and Spatial. -## RNA +## scRNA-seq We can obtain the `SingleCellExperiment` object with gene expression (RNA) data as follows: @@ -231,8 +231,11 @@ table(sce$assay) ``` This object consists of 377,988 profiled with scRNA-seq (3P) -and 84,364 cells profiled with multiome. 
We can dowload a `SingleCellExperiment` -object specific to each of the main subpopulations defined at level 1 as follows: +and 84,364 cells profiled with multiome, for a total of 462,352 cells (37,378 +genes were quantified across all of these). + +We can download a `SingleCellExperiment` object specific to each of the main +subpopulations defined at level 1 as follows: ```{r} listCellTypes(assayType = "RNA") @@ -257,7 +260,7 @@ annotations_dictionary[["dict_20220619_to_20230508"]] # load also a predefined palette of colors, to match the ones used in the manuscript data("colors_20230508") -(epithelial <- HCATonsilData("RNA", "epithelial", version = "1.0")) +(epithelial_discovery <- HCATonsilData("RNA", "epithelial", version = "1.0")) scater::plotUMAP(epithelial, colour_by = "annotation_20230508") + ggplot2::scale_color_manual(values = colors_20230508$epithelial) + ggplot2::theme(legend.title = ggplot2::element_blank()) @@ -297,42 +300,62 @@ Here's a brief explanation of all the variables in the colData slot of the * UMAP_1_20220215, UMAP_2_20220215: UMAP coordinates used in figures of the preprint for each cell type. -## ATAC and Multiome +## scATAC-seq and Multiome Since there is not a popular Bioconductor package to analyze or store scATAC-seq data, we point users to the scATAC-seq and Multiome Seurat objects that we created using [Signac](https://stuartlab.org/signac/) [@stuart2021single]. 
-Here are the instructions to download the scATAC-seq object: +Here are the instructions to download the scATAC-seq object (approximately 9.3 GB in size): ```{r eval=FALSE} library("Seurat") library("Signac") -dir.create("download_dir") + +download_dir = tempdir() + options(timeout = 10000000) atac_url <- "https://zenodo.org/record/8373756/files/TonsilAtlasSeuratATAC.tar.gz" -download.file(url = atac_url, destfile = "download_dir/TonsilAtlasSeuratATAC.tar.gz") +download.file( + url = atac_url, + destfile = file.path(download_dir, "TonsilAtlasSeuratATAC.tar.gz") +) # Advice: check that the md5sum is the same as the one in Zenodo -untar("download_dir/TonsilAtlasSeuratATAC.tar.gz", exdir = "download_dir") -atac <- readRDS("download_dir/scATAC-seq/20230911_tonsil_atlas_atac_seurat_obj.rds") -atac +untar( + tarfile = file.path(download_dir, "TonsilAtlasSeuratATAC.tar.gz"), + exdir = download_dir +) +atac_seurat <- readRDS( + file.path(download_dir, "scATAC-seq/20230911_tonsil_atlas_atac_seurat_obj.rds") +) +atac_seurat ``` The multiome object contains 68,749 cells that passed both RNA and ATAC QC filters. 
-Here are the instructions to download the Multiome object: +Here are the instructions to download the Multiome object (approximately 5.7 GB in size): ```{r eval=FALSE} library("Seurat") library("Signac") -dir.create("download_dir") + +download_dir = tempdir() + options(timeout = 10000000) multiome_url <- "https://zenodo.org/record/8373756/files/TonsilAtlasSeuratMultiome.tar.gz" -download.file(url = multiome_url, destfile = "download_dir/TonsilAtlasSeuratMultiome.tar.gz") +download.file( + url = multiome_url, + destfile = file.path(download_dir, "TonsilAtlasSeuratMultiome.tar.gz") +) # Advice: check that the md5sum is the same as the one in Zenodo -untar("download_dir/TonsilAtlasSeuratMultiome.tar.gz", exdir = "download_dir") -multiome <- readRDS("download_dir/multiome/20230911_tonsil_atlas_multiome_seurat_obj.rds") -multiome +untar( + tarfile = file.path(download_dir, "TonsilAtlasSeuratMultiome.tar.gz"), + exdir = download_dir +) +multiome_seurat <- readRDS( + file.path(download_dir, "/multiome/20230911_tonsil_atlas_multiome_seurat_obj.rds") +) +multiome_seurat ``` @@ -344,13 +367,20 @@ To recall peaks or visualize chromatin tracks, it is essential to have access to the [fragments file](https://support.10xgenomics.com/single-cell-atac/software/pipelines/latest/output/fragments). We modified the original fragments files generated by cellranger-atac or cellranger-arc to include the [gem_id](https://support.10xgenomics.com/single-cell-gene-expression/software/pipelines/latest/glossary) -as prefix. These files can be downloaded as follows: +as prefix. 
These files can be downloaded as follows (approximate size of 26.0 GB; feel free to grab a coffee in the meantime): ```{r eval=FALSE} -dir.create("download_dir") +download_dir = tempdir() + fragments_url <- "https://zenodo.org/record/8373756/files/fragments_files.tar.gz" -download.file(url = fragments_url, destfile = "download_dir/fragments_files.tar.gz") -untar("download_dir/fragments_files.tar.gz", exdir = "download_dir") +download.file( + url = fragments_url, + destfile = file.path(download_dir, "fragments_files.tar.gz") +) +untar( + tarfile = file.path(download_dir, "fragments_files.tar.gz"), + exdir = download_dir +) ``` **Note**: it is paramount to update the paths to the fragments files in each @@ -384,18 +414,29 @@ the meaning of the variables that are specific to Multiome or scATAC-seq: ## CITE-seq -The CITE-seq object can be downloaded as follows: +The CITE-seq object can be downloaded as follows (approximate size of 0.4 GB): ```{r eval=FALSE} library("Seurat") -dir.create("download_dir") + +download_dir = tempdir() + options(timeout = 10000000) cite_url <- "https://zenodo.org/record/8373756/files/TonsilAtlasSeuratCITE.tar.gz" -download.file(url = cite_url, destfile = "download_dir/TonsilAtlasSeuratCITE.tar.gz") +download.file( + url = cite_url, + destfile = file.path(download_dir, "TonsilAtlasSeuratCITE.tar.gz") +) # Advice: check that the md5sum is the same as the one in Zenodo -untar("download_dir/TonsilAtlasSeuratCITE.tar.gz", exdir = "download_dir") -cite <- readRDS("download_dir/CITE-seq/20220215_tonsil_atlas_cite_seurat_obj.rds") -cite +untar( + tarfile = file.path(download_dir, "TonsilAtlasSeuratCITE.tar.gz"), + exdir = download_dir +) + +cite_seurat <- readRDS( + file.path(download_dir, "CITE-seq/20220215_tonsil_atlas_cite_seurat_obj.rds") +) +cite_seurat ``` @@ -406,15 +447,17 @@ dataframe, which can be imported as follows: ```{r eval=FALSE} scirpy_df <- read.csv( - "download_dir/CITE-seq/scirpy_tcr_output.tsv", + file = 
file.path(download_dir, "CITE-seq/scirpy_tcr_output.tsv"), header = TRUE ) + +head(scirpy_df) ``` -## Spatial +## Spatial transcriptomics -A `r BiocStyle::Biocpkg("SpatialExperiment")` of the [spatial transcriptomics]([10X Visium](https://www.10xgenomics.com/products/spatial-gene-expression)) may be retrieved via `assayType="Spatial"`. The dataset contains 8 tissue slices with ~1,000-3,000 cells each that were profiled on two separated slides, as well as a low-resolution (H&E staining) image for each slice. +A `r BiocStyle::Biocpkg("SpatialExperiment")` of the spatial transcriptomics ([10X Visium](https://www.10xgenomics.com/products/spatial-gene-expression)) dataset may be retrieved via `assayType="Spatial"`. The dataset contains 8 tissue slices with ~1,000-3,000 cells each that were profiled on two separate slides, as well as a low-resolution (H&E staining) image for each slice. ```{r eval=FALSE} library("SpatialExperiment") @@ -425,7 +468,7 @@ library("SpatialExperiment") To plot gene expression you can use the ggspavis package: ```{r eval=FALSE} -library(ggspavis) +library("ggspavis") sub <- spe[, spe$sample_id == "esvq52_nluss5"] plt <- plotVisium(sub, fill="SELENOP") + scale_fill_gradientn(colors=rev(hcl.colors(9, "Spectral"))) @@ -436,7 +479,7 @@ plt ``` -# Annotations +# Annotations of the Tonsil Data Atlas To allow users to traceback the rationale behind each and every of our annotations, we provide a detailed glossary of 121 cell types and states and related functions @@ -478,7 +521,7 @@ please [open an issue](https://github.com/massonix/HCATonsilData/issues/new) and describe your annotation. -# Interoperability +# Interoperability with other frameworks While we provide data in the form of SingleCellExperiment objects, you may want to analyze your data using a different single-cell data container. In future releases, 
In future releases, @@ -495,8 +538,18 @@ epithelial <- HCATonsilData(assayType = "RNA", cellType = "epithelial") writeH5AD(sce = epithelial, file = "epithelial.h5ad") ``` +The SingleCellExperiment objects obtained via `HCATonsilData()` can be explored in +detail using e.g. additional Bioconductor packages, such as the `iSEE` package. -# Session information +This can be as simple as executing this chunk: + +```{r launchisee, eval=FALSE} +if (require(iSEE)) { + iSEE(epithelial) +} +``` + +# Session information {-} ```{r} sessionInfo()