diff --git a/R/qc_preprocessing.R b/R/qc_preprocessing.R index 64a05d0..6475c59 100644 --- a/R/qc_preprocessing.R +++ b/R/qc_preprocessing.R @@ -58,21 +58,21 @@ geomxToolsDefaults <- getAnywhere("DEFAULTS")$objs[[1]] #' - Summary tables for segment, probe, and gene filtering ("table") qcProc <- function(object, - min.segment.reads = 1000, - percent.trimmed = 80, - percent.stitched = 80, - percent.aligned = 80, - percent.saturation = 50, - min.nuclei = 200, - min.area = 16000, - min.negative.count = 10, - max.ntc.count = 60, - min.probe.ratio = 0.1, - outlier.test.alpha = 0.01, - percent.fail.grubbs = 20, - remove.local.outliers = FALSE, - shift.counts.zero = TRUE, - print.plots = FALSE) { + min.segment.reads = 1000, + percent.trimmed = 80, + percent.stitched = 80, + percent.aligned = 80, + percent.saturation = 50, + min.nuclei = 200, + min.area = 16000, + min.negative.count = 10, + max.ntc.count = 60, + min.probe.ratio = 0.1, + outlier.test.alpha = 0.01, + percent.fail.grubbs = 20, + remove.local.outliers = FALSE, + shift.counts.zero = TRUE, + print.plots = FALSE) { # Prerun #### ## functions #### ## graphical summaries of QC statistics @@ -292,12 +292,13 @@ qcProc <- function(object, tables <- list() pData(object) <- pData(object)[, !colnames(pData(object)) %in% neg.cols] + # Summarize & Remove Segments #### ## summarize NTC values (Frequency of Segments with a given NTC count) if (!is.null(qc.params$maxNTCCount)) { tab = table(NTC_Count = sData(object)$NTC) print(kable(tab, col.names = c("NTC Count", "# of Segments"), - caption = "Summary for the NTC values")) + caption = "Summary for the NTC values")) tab = as.data.frame(tab) colnames(tab) = c("NTC_Count", "# Segments") tables$NTC_Count = tab @@ -308,6 +309,137 @@ qcProc <- function(object, rownames_to_column("Parameter") ## object size before segment removal size.before <- dim(object) + + ##### Obtain a list of all flagged rows for segment QC ##### + + # Gather annotation information for flag list export + + ## Annotation columns + annotation.data <- pData(object) + + ## Annotation column names + annotation.column.names <- colnames(annotation.data) + + ## Start the list of selected annotation columns + select.annotation.columns <- "segmentID" + + ## Check if area and nuclei are included + if("area" %in% annotation.column.names){ + select.annotation.columns <- c("area", select.annotation.columns) + } + if("nuclei" %in% annotation.column.names){ + select.annotation.columns <- c("nuclei", select.annotation.columns) + } + + ## The annotation names based on selected columns as a df + ## drop = FALSE ensures single column is still a df + select.annotation.data <- annotation.data[, select.annotation.columns, + drop = FALSE] + select.annotation.data <- rownames_to_column(select.annotation.data, + var = "SampleID") + + # Gather the QC flagged rows + + ## Gather the qc data + segment.qc.data <- object@protocolData@data + + ## The nested flag dataframe + flag.columns <- segment.qc.data %>% select(starts_with("QCFlags")) + + ## Rows with a positive flag + flagged.rows <- flag.columns[rowSums(flag.columns$QCFlags == TRUE) > 0, ] + flagged.rows <- rownames_to_column(flagged.rows, var = "SampleID") + + # Gather the additional QC data + + ## Additional QC column names + add.qc.column.names <- c("Raw", + "Trimmed (%)", + "Stitched (%)", + "Aligned (%)", + "Saturated (%)", + "NegGeoMean") + + ## Check for NTC column + if("NTC" %in% colnames(segment.qc.data)){ + add.qc.column.names <- c("NTC", add.qc.column.names) + } + + ## Additional QC data + add.qc.columns <- segment.qc.data[, add.qc.column.names] + add.qc.columns <- rownames_to_column(add.qc.columns, var = "SampleID") + + ## Convert single column matrix and dataframes into vectors + ## May cause an issue if there is more then one PKC file + matrix.column <- add.qc.columns$NegGeoMean + vector.column <- as.vector(matrix.column) + add.qc.columns$NegGeoMean <- vector.column + + add.qc.columns$TrimmedPerc <- sapply(add.qc.columns$`Trimmed (%)`, + function(x) unname(unlist(x))) + add.qc.columns$TrimmedPerc <- as.vector(add.qc.columns$TrimmedPerc[,1]) + + add.qc.columns$StitchedPerc <- sapply(add.qc.columns$`Stitched (%)`, + function(x) unname(unlist(x))) + add.qc.columns$StitchedPerc <- as.vector(add.qc.columns$StitchedPerc[,1]) + + add.qc.columns$AlignedPerc <- sapply(add.qc.columns$`Aligned (%)`, + function(x) unname(unlist(x))) + add.qc.columns$AlignedPerc <- as.vector(add.qc.columns$AlignedPerc[,1]) + + add.qc.columns$SaturatedPerc <- sapply(add.qc.columns$`Saturated (%)`, + function(x) unname(unlist(x))) + add.qc.columns$SaturatedPerc <- as.vector(add.qc.columns$SaturatedPerc[,1]) + + ## Remove the nested data frames + add.qc.columns <- add.qc.columns[, -which(names(add.qc.columns) == "Trimmed (%)")] + add.qc.columns <- add.qc.columns[, -which(names(add.qc.columns) == "Stitched (%)")] + add.qc.columns <- add.qc.columns[, -which(names(add.qc.columns) == "Aligned (%)")] + add.qc.columns <- add.qc.columns[, -which(names(add.qc.columns) == "Saturated (%)")] + + # Create the final QC flag data frame with all additional info + + ## Merge the annotation, flag, and additional qc data frames + merge.qc.flagged.df <- merge(merge(select.annotation.data, add.qc.columns, + by = "SampleID"), + flagged.rows, by = "SampleID") + + ## Reorder columns so that info is next to flag + final.column.order <- c("SampleID", + "segmentID", + "Raw", + "LowReads", + "TrimmedPerc", + "LowTrimmed", + "StitchedPerc", + "LowStitched", + "AlignedPerc", + "LowAligned", + "SaturatedPerc", + "LowSaturation", + "NegGeoMean", + "LowNegatives") + + ## Add NTC, area, and/or nuclei if part of annotation + if("NTC" %in% annotation.column.names){ + final.column.order <- c(final.column.order, "NTC", "HighNTC") + } + if("area" %in% annotation.column.names){ + final.column.order <- c(final.column.order, "area", "LowArea") + } + if("nuclei" %in% annotation.column.names){ + final.column.order <- c(final.column.order, "nuclei", "LowNuclei") + } + + ## The final QC flag df + final.flagged.segment.df <- merge.qc.flagged.df[, final.column.order] + + ## Final renaming of columns + colnames(final.flagged.segment.df)[colnames(final.flagged.segment.df) == "Raw"] <- "RawReadCount" + + + # Create the output object + ## remove flagged segments object <- object[, seg.qc.results$QCStatus == "PASS"] ## object size after segment removal @@ -315,8 +447,8 @@ qcProc <- function(object, ## print object size before and after segment removal tab <- data.frame(size.before, size.after) print(kable(tab, - col.names = c("# Before Removal", "# After Removal"), - caption = "Summary for Segment QC Removal" + col.names = c("# Before Removal", "# After Removal"), + caption = "Summary for Segment QC Removal" )) tables$Segment_QC_Removal <- tab %>% rownames_to_column("element") @@ -343,6 +475,71 @@ qcProc <- function(object, !probe.qc.results$GlobalGrubbsOutlier ) ) + + # Create a df of the probes that have been flagged + + ## Gather the rows that are outliers for gobal or local tests + global.rows <- rownames(probe.qc.results)[probe.qc.results$GlobalGrubbsOutlier] + local.rows <- rownames(probe.qc.results)[rowSums(probe.qc.results[, -2:-1]) > 0 & + !probe.qc.results$GlobalGrubbsOutlier] + + flagged.probes <- c(global.rows,local.rows) + + flagged.probe.df <- data.frame(RTS_ID = flagged.probes) + + flagged.probe.df$FlagType <- ifelse(flagged.probe.df$RTS_ID + %in% global.rows, "Global", "Local") + + # Initialize an empty list to store results for each probe + result.list <- list() + + ## Establish a list of probes flagged as local outliers and the corresponding + ## segment the probe is flagged in + for (probe.id in local.rows) { + # Get the row index for the probe ID + row.index <- which(rownames(probe.qc.results) == probe.id) + + # Check which columns are TRUE for the corresponding row + true.columns <- colnames(probe.qc.results)[probe.qc.results[row.index, -c(1, 2)] > 0] + + # Remove the extraneous text + true.columns <- gsub("LocalGrubbsOutlier\\.", "", true.columns) + + # Store the result in the list + result.list[[probe.id]] <- true.columns + } + + ## To store the segmens(s) a probe was flagged in with the local test + flagged.probe.df$LocalFlag <- NA + + ## Combine the local flag data with the main probe flag df + matching.rows <- which(flagged.probe.df$RTS_ID %in% local.rows) + flagged.probe.df$LocalFlag[matching.rows] <- + sapply(flagged.probe.df$RTS_ID[matching.rows], + function(probe) {paste(result.list[[probe]], collapse = ",")}) + + # Add the gene name to the probe flag df + + ## Columns from the probe annotation + probe.columns <- c("RTS_ID","TargetName") + + ## Gather the probe ID to gene name mapping + probe.id.gene.mapping <- fData(object)[, probe.columns] + + ## Subset the probe id to gene name mapping + probe.id.gene.mapping <- probe.id.gene.mapping[ + probe.id.gene.mapping$RTS_ID %in% flagged.probe.df$RTS_ID, ] + + ## Add the mapping to the local probe df + flagged.probe.df <- merge(flagged.probe.df, probe.id.gene.mapping, by="RTS_ID") + + ## Rearrange column order for final local flag df + probe.df.column.order <- c("TargetName", "RTS_ID", "FlagType", "LocalFlag") + final.flagged.probe.df <- flagged.probe.df[, probe.df.column.order] + + + # Finish subsetting the object to remove flagged probes + ## subset object to exclude all that did not pass Ratio & Global testing probe.qc.passed <- subset(object, @@ -383,7 +580,9 @@ qcProc <- function(object, output <- list( "object" = object, "plot" = plot.list, - "table" = tables - ) + "table" = tables, + "segment.flags" = final.flagged.segment.df, + "probe.flags" = final.flagged.probe.df + ) invisible(output) } diff --git a/tests/testthat/fixtures/Human_Colon/downloaded/colon_dccs.tar.gz b/tests/testthat/fixtures/Human_Colon/downloaded/colon_dccs.tar.gz new file mode 100644 index 0000000..ab4f5e8 Binary files /dev/null and b/tests/testthat/fixtures/Human_Colon/downloaded/colon_dccs.tar.gz differ diff --git a/tests/testthat/fixtures/Human_Colon/studyDesignHumanColon.RDS b/tests/testthat/fixtures/Human_Colon/studyDesignHumanColon.RDS index f5da238..670bd7b 100644 Binary files a/tests/testthat/fixtures/Human_Colon/studyDesignHumanColon.RDS and b/tests/testthat/fixtures/Human_Colon/studyDesignHumanColon.RDS differ diff --git a/tests/testthat/fixtures/Human_Colon_CPTR/downloaded/colon_cptr_dccs.tar.gz b/tests/testthat/fixtures/Human_Colon_CPTR/downloaded/colon_cptr_dccs.tar.gz new file mode 100644 index 0000000..ae05f1d Binary files /dev/null and b/tests/testthat/fixtures/Human_Colon_CPTR/downloaded/colon_cptr_dccs.tar.gz differ diff --git a/tests/testthat/fixtures/Human_Colon_CPTR/filteringHumanColonCPTR.RDS b/tests/testthat/fixtures/Human_Colon_CPTR/filteringHumanColonCPTR.RDS new file mode 100644 index 0000000..1af7888 Binary files /dev/null and b/tests/testthat/fixtures/Human_Colon_CPTR/filteringHumanColonCPTR.RDS differ diff --git a/tests/testthat/fixtures/Human_Colon_CPTR/qcHumanColonCPTR.RDS b/tests/testthat/fixtures/Human_Colon_CPTR/qcHumanColonCPTR.RDS new file mode 100644 index 0000000..d5e3bbc Binary files /dev/null and b/tests/testthat/fixtures/Human_Colon_CPTR/qcHumanColonCPTR.RDS differ diff --git a/tests/testthat/fixtures/Human_Kidney/downloaded/kidney_dccs.tar.gz b/tests/testthat/fixtures/Human_Kidney/downloaded/kidney_dccs.tar.gz new file mode 100644 index 0000000..1011471 Binary files /dev/null and b/tests/testthat/fixtures/Human_Kidney/downloaded/kidney_dccs.tar.gz differ diff --git a/tests/testthat/fixtures/Human_NSCLC/downloaded/nsclc_dccs.tar.gz b/tests/testthat/fixtures/Human_NSCLC/downloaded/nsclc_dccs.tar.gz new file mode 100644 index 0000000..176b5ea Binary files /dev/null and b/tests/testthat/fixtures/Human_NSCLC/downloaded/nsclc_dccs.tar.gz differ diff --git a/tests/testthat/fixtures/Human_NSCLC/studyDesignHumanNSCLC.RDS b/tests/testthat/fixtures/Human_NSCLC/studyDesignHumanNSCLC.RDS index 352357b..52ddc70 100644 Binary files a/tests/testthat/fixtures/Human_NSCLC/studyDesignHumanNSCLC.RDS and b/tests/testthat/fixtures/Human_NSCLC/studyDesignHumanNSCLC.RDS differ diff --git a/tests/testthat/fixtures/Mouse_Thymus/downloaded/thymus_dccs.tar.gz b/tests/testthat/fixtures/Mouse_Thymus/downloaded/thymus_dccs.tar.gz new file mode 100644 index 0000000..bbdf65a Binary files /dev/null and b/tests/testthat/fixtures/Mouse_Thymus/downloaded/thymus_dccs.tar.gz differ diff --git a/tests/testthat/fixtures/Mouse_Thymus/studyDesignMouseThymus.RDS b/tests/testthat/fixtures/Mouse_Thymus/studyDesignMouseThymus.RDS index c4a50b3..f2af11a 100644 Binary files a/tests/testthat/fixtures/Mouse_Thymus/studyDesignMouseThymus.RDS and b/tests/testthat/fixtures/Mouse_Thymus/studyDesignMouseThymus.RDS differ diff --git a/tests/testthat/test-qc_preprocessing.R b/tests/testthat/test-qc_preprocessing.R index 49788c2..e0e7dfe 100644 --- a/tests/testthat/test-qc_preprocessing.R +++ b/tests/testthat/test-qc_preprocessing.R @@ -1,25 +1,25 @@ test_that("Test Human Kidney dataset", { kidney.dat <- selectDatasetQC("kidney") output <- do.call(qcProc, kidney.dat) - expected.elements <- c("object", "plot","table") + expected.elements <- c("object", "plot","table","segment.flags","probe.flags") expect_equal(length(setdiff(expected.elements, names(output))), 0) }) test_that("Test Mouse Thymus Dataset", { thymus.dat <- selectDatasetQC("thymus") output <- do.call(qcProc, thymus.dat) - expected.elements <- c("object", "plot","table") + expected.elements <- c("object", "plot","table","segment.flags","probe.flags") expect_equal(length(setdiff(expected.elements, names(output))), 0) }) test_that("Test Colon Dataset", { colon.dat <- selectDatasetQC("colon") expect_warning(output <- do.call(qcProc, colon.dat), regexp = NULL) - expected.elements <- c("object", "plot","table") + expected.elements <- c("object", "plot","table","segment.flags","probe.flags") expect_equal(length(setdiff(expected.elements, names(output))), 0) }) test_that("Test Human NSCLC Dataset", { nsclc.dat <- selectDatasetQC("nsclc") expect_warning(output <- do.call(qcProc, nsclc.dat), regexp = NULL) - expected.elements <- c("object", "plot","table") + expected.elements <- c("object", "plot","table","segment.flags","probe.flags") expect_equal(length(setdiff(expected.elements, names(output))), 0) }) test_that( diff --git a/vignettes/Integration_Test_Colon.Rmd b/vignettes/Integration_Test_Colon.Rmd index b39d8fc..efe578a 100644 --- a/vignettes/Integration_Test_Colon.Rmd +++ b/vignettes/Integration_Test_Colon.Rmd @@ -70,7 +70,7 @@ This runs the DSPworkflow package to completion using the Human Colon Dataset: nuclei.col = "nuclei") # For creating fixture RDS - create.rds <- TRUE + create.rds <- FALSE if(create.rds) { study.design.human.colon <- sdesign.list$object saveRDS(study.design.human.colon, file = "tests/testthat/fixtures/Human_Colon/studyDesignHumanColon.RDS") @@ -97,8 +97,10 @@ This runs the DSPworkflow package to completion using the Human Colon Dataset: min.area = 1000, print.plots = TRUE) print(qc.output$segments.qc) + print(qc.output$segment.flags) + print(qc.output$probe.flags) - create.rds <- TRUE + create.rds <- FALSE if(create.rds) { qc.human.colon<- qc.output$object saveRDS(qc.human.colon, file = "tests/testthat/fixtures/Human_Colon/qcHumanColon.RDS") diff --git a/vignettes/Integration_Test_Kidney.Rmd b/vignettes/Integration_Test_Kidney.Rmd index 0010dd6..43fc012 100644 --- a/vignettes/Integration_Test_Kidney.Rmd +++ b/vignettes/Integration_Test_Kidney.Rmd @@ -100,6 +100,8 @@ qc.output <- qcProc(object = sdesign.list$object, min.area = 1000, print.plots = TRUE) print(qc.output$segments.qc) + print(qc.output$segment.flags) + print(qc.output$probe.flags) create.rds <- FALSE if(create.rds) { diff --git a/vignettes/Integration_Test_Mouse_Thymus.Rmd b/vignettes/Integration_Test_Mouse_Thymus.Rmd index 83d062a..e47f6ea 100644 --- a/vignettes/Integration_Test_Mouse_Thymus.Rmd +++ b/vignettes/Integration_Test_Mouse_Thymus.Rmd @@ -71,7 +71,7 @@ This runs the DSPworkflow package to completion using the Mouse Thymus Dataset: nuclei.col = "nuclei") # For creating fixture RDS - create.rds <- TRUE + create.rds <- FALSE if(create.rds) { study.design.mouse.thymus <- sdesign.list$object saveRDS(study.design.mouse.thymus, file = "tests/testthat/fixtures/Mouse_Thymus/studyDesignMouseThymus.RDS") @@ -99,8 +99,10 @@ qc.output <- qcProc(object = sdesign.list$object, min.area = 16000, print.plots = TRUE) print(qc.output$segments.qc) + print(qc.output$segment.flags) + print(qc.output$probe.flags) - create.rds <- TRUE + create.rds <- FALSE if(create.rds) { qc.mouse.thymus <- qc.output$object saveRDS(qc.mouse.thymus, file = "tests/testthat/fixtures/Mouse_Thymus/qcMouseThymus.RDS") diff --git a/vignettes/Integration_Test_NSCLC.Rmd b/vignettes/Integration_Test_NSCLC.Rmd index 2294ba7..a3f6631 100644 --- a/vignettes/Integration_Test_NSCLC.Rmd +++ b/vignettes/Integration_Test_NSCLC.Rmd @@ -104,7 +104,7 @@ qc.output <- qcProc(object = sdesign.list$object, print(qc.output$segments.qc) # For creating a fixture RDS - create.rds <- TRUE + create.rds <- FALSE if(create.rds) { qc.human.nsclc <- qc.output$object saveRDS(qc.human.nsclc, file = "tests/testthat/fixtures/Human_NSCLC/qcHumanNSCLC.RDS")