analysis_individual_figures.Rmd

---
title: "Analysis of scRNA-seq data sets"
author: "Magdalena Strauss"
bibliography: drug_resistance.bib
output:
  BiocStyle::html_document:
    code_folding: hide
    number_sections: yes
    toc: yes
---

To run this script for the ABE data: Rscript run_analysis_individual.R 2 markers_ABE sce_ABE.rds analysis_ABE.html

To run this script for the CBE data: Rscript run_analysis_individual.R 1 markers_CBE sce_CBE.rds analysis_CBE.html

run_analysis_individual.R:

params=commandArgs(trailingOnly=TRUE)
rmarkdown::render("analysis_individual.Rmd",output_file=params[4],
params=list(dataset_ind=params[1],save_file_name_markers=params[2],sce_save_file=params[3]))


# Setup and loading data


```{r,setup, output=FALSE,message=FALSE,warning=FALSE}
knitr::opts_chunk$set(warning=FALSE,message=FALSE)


library(bluster)
source("../../core_functions.R")
library("batchelor")
library("scater")
library(scran)
library(dplyr)
library(igraph)
library(RColorBrewer)
library(pheatmap)
library(Seurat)
library(destiny)
library(edgeR)
library(MASS)
library(energy)
library(ggthemes)
library(progeny)
#library(ggridges)
library(stringr)
library(ggrepel)
library(MHTmult)
set.seed(4444)

knitr::opts_chunk$set(
    message = FALSE,
    warning = FALSE,
    message = FALSE,
    #dev = c("pdf"),
    dpi=300
)


#params=c(2 ,"markers_CBE", "sce_CBE.rds", "analysis_CBE.html")
dataset_ind <- as.double(params[1])

sce_list_file <- "sce_list_gRNA_iBAR.rds"
save_file_name_markers <- params[2]
sce_save_file <- params[3]
folder_cellranger <- "../cellranger/cellranger700_count_46539_7105STDY13259924_GRCh38-2020-A"
yy <- "CBE"
if (dataset_ind==2){yy <- "ABE"}
cell_cycle_frequency_save_file <- paste0("cell_cycle_",yy,".csv")
cell_cycle_frequency_save_file2 <- paste0("cell_cycle_target_gene",yy,".csv")
proliferation_file <- paste0("proliferation",yy,".csv")
markers_file_all <- paste0("markers",yy,"_all.csv") #DE of all genes, also 
#' at non-significant levels or if 0
markers_file <- paste0("markers",yy,".csv") #significant DE expression
file_gRNA_selection <- paste0("selected_gRNAs",yy,".csv")
file_energy_distance <-  paste0("ed_",yy,".rds")
save_file_name_validation_data <- paste0("validation_lib_ann_",yy,".csv")

colourscheme_variant_class <- c("control"="lightgrey","canonical drug resistance"="darkblue","driver"="orange","drug addiction"="darkred")
```

We read in the SingleCellExperiment object, for which QC, gRNA and iBAR calling had been performed previously, and add meta-data from the pooled experiments.

```{r}
sce_list <- readRDS(sce_list_file)
sce <- sce_list[[dataset_ind]]

features <- read.delim(paste0(folder_cellranger,"/filtered_feature_bc_matrix/features.tsv.gz"),header=FALSE,stringsAsFactors = FALSE)
rowData(sce) <- features[match(rownames(sce),features$V1),]

```


```{r}
validation_lib <- data.table::fread("../../sc_HT29_debcet_annotated_270623_revised.txt",header=TRUE,fill=TRUE)
#replace the line above using supplementary table 1 from our preprint
if(yy=="ABE"){
  feature_ref <- read.table("feature_ref_gRNA_ABE.csv",sep=",",header=TRUE)
}else{
  feature_ref <- read.table("feature_ref_gRNA_CBE.csv",sep=",",header=TRUE)
}
if (yy=="CBE"){
  validation_lib <- validation_lib[validation_lib$editor=="CBE" & validation_lib$sgRNA_ID%in%feature_ref$id,]
}else{
   validation_lib <- validation_lib[validation_lib$editor=="ABE" & validation_lib$sgRNA_ID%in%feature_ref$id,]
}

rename_gRNAs <- function(x){
  v <- strsplit(x,"-")[[1]]
  w <- validation_lib$ID[match(v,validation_lib$sgRNA_ID)]
  if (length(w) > 1){
    x <- w
    w <- x[1]
    if (length(x)>1){
      for (j in 2:length(x))
      w <- paste0(w,"-",x[j])
    }
  }
  return(w)
}

meta_lfc <- function(x)
{
  v <- strsplit(x,"-")[[1]]
  w <- validation_lib$L2FC_HT29_DebCet_plasmid_1[match(v,validation_lib$sgRNA_ID)]
  if (length(w) > 1){
    x <- w
    w <- x[1]
    if (length(x)>1){
      for (j in 2:length(x))
      w <- paste0(w,"-",x[j])
    }
  }
  return(w)
}

meta_class <- function(x)
{
  v <- strsplit(x,"-")[[1]]
  w <- validation_lib$variant_class[match(v,validation_lib$ID)]
  if (length(w) > 1){
    x <- w
    w <- x[1]
    if (length(x)>1){
      for (j in 2:length(x))
      w <- paste0(w,"-",x[j])
    }
  }
  return(w)
}

meta_resistance <- function(x)
{
  v <- strsplit(x,"-")[[1]]
  w <- validation_lib$resistance_mutation[match(v,validation_lib$ID)]
  if (length(w) > 1){
    x <- w
    w <- x[1]
    if (length(x)>1){
      for (j in 2:length(x))
      w <- paste0(w,"-",x[j])
    }
  }
  return(w)
}

meta_target <- function(x)
{
  v <- strsplit(x,"-")[[1]]
  w <- validation_lib$Gene[match(v,validation_lib$ID)]
  if (length(w) > 1){
    x <- w
    w <- x[1]
    if (length(x)>1){
      for (j in 2:length(x))
      w <- paste0(w,"-",x[j])
    }
  }
  return(w)
}

meta_type <- function(x)
{
  v <- strsplit(x,"-")[[1]]
  w <- validation_lib$sgRNA_type[match(v,validation_lib$ID)]
  if (length(w) > 1){
    x <- w
    w <- x[1]
    if (length(x)>1){
      for (j in 2:length(x))
      w <- paste0(w,"-",x[j])
    }
  }
  return(w)
}


sce$gRNA <- sapply(sce$gRNA,rename_gRNAs)
sce$l2fc <- sapply(sce$gRNA,meta_lfc)
sce$target_gene <- sapply(sce$gRNA,meta_target)
sce$variant_class <- sapply(sce$gRNA,meta_class)
sce$resistance_mutation <- sapply(sce$gRNA,meta_resistance)
sce$gRNA_type <- sapply(sce$gRNA,meta_type)
```

# Distribution of gRNAs and iBARs

We plot the number of gRNAs called in a cell that passed QC. If there are several gRNAs in a cell, then each of them is counted. The figure below shows that cells with gRNAs targeting splice-essential sites proliferate much less than the other groups. The gRNAs targeting splice-essential sites were added as a positive control to check their depletion, and will be removed from downstream analysis.  

```{r}
nr_gRNA <- table(factor(unlist(lapply(sce$gRNA,function(x) strsplit(x,"-")[[1]])),levels=validation_lib$ID))
validation_lib$nr_cells <- nr_gRNA[match(validation_lib$ID,names(nr_gRNA))]
ggplot(validation_lib,mapping=aes(x=nr_gRNA,color=sgRNA_type))+geom_density(size=1.5)+scale_color_colorblind()+theme_classic(base_size=24)+xlab("number of cells")+theme(legend.position = "bottom",legend.title = element_blank())+guides(color=guide_legend(nrow=2,override.aes = list(size=3)))
```


```{r}
sce <- sce[,!(grepl("splice_essential",sce$gRNA_type))]
```

After the removal of the splice essential gRNAs, we have the following number of cells: `r ncol(sce)`.

The following is a histogram of number of cells per gRNA, with the splice essential gRNAs excluded. 

```{r}
nr_gRNA <- table(unlist(lapply(sce$gRNA,function(x) strsplit(x,"-")[[1]])))
ggplot(mapping=aes(x=nr_gRNA)) + geom_histogram()+scale_x_continuous(trans='log10') + theme_bw()+xlab("number of cells per gRNA") + ylab("number of gRNAs")
```

The following figure is a histogram of the number of different iBARs per gRNA.

```{r}
sce$gRNA_iBAR <- paste0(sce$gRNA,"-",sce$iBAR)
ggplot(mapping=aes(x=table(sce$gRNA_iBAR))) + geom_histogram()+scale_x_continuous() + theme_bw()+xlab("number of cells per iBAR") + ylab("number of iBARs")
```

The mean number of cells per iBAR-group is equal to `r mean(table(sce$gRNA_iBAR))`.

# Combining cells with the same iBAR

We combine cells with the same gRNA-iBAR combination by averaging their gene expression as follows: we combine the counts and then normalise for library size using the logNormCounts function from the scuttle Bioconductor package (@scuttle). We use the iBAR-gRNA combinations rather than iBARs. As the iBAR is only 6nt long, some identical iBAR sequence may have been inserted in multiple cells, whereas in combination with the gRNA, the iBAR creates a unique identifier for a small group of cells with the same genoytpe. Aggregating over groups of cells with the same iBAR and gRNA eliminates the problem of biases resulting from unequal numbers of descendant cells for different progenitor cells. We call a group of cells with the same iBAR-gRNA combination, and therefore the same genotype, an iBAR-group.  

```{r}
Counts <- as.matrix(counts(sce))
sce_counts_iBAR <- sapply(1:length(unique(sce$gRNA_iBAR)),function(x) return(rowSums(Counts[,sce$gRNA_iBAR==unique(sce$gRNA_iBAR)[x],drop=FALSE])))
rownames(sce_counts_iBAR) <- rownames(sce)
colnames(sce_counts_iBAR) <- unique(sce$gRNA_iBAR)
sce_iBAR <- SingleCellExperiment(assays=list(counts=sce_counts_iBAR),rowData=rowData(sce))
sce_iBAR$gRNA <- sce$gRNA[match(colnames(sce_iBAR),sce$gRNA_iBAR)]
sce_iBAR$l2fc <- sce$l2fc[match(colnames(sce_iBAR),sce$gRNA_iBAR)]
sce_iBAR$target_gene <- sce$target_gene[match(colnames(sce_iBAR),sce$gRNA_iBAR)]
sce_iBAR$variant_class <- sce$variant_class[match(colnames(sce_iBAR),sce$gRNA_iBAR)]
sce_iBAR$resistance_mutation <- sce$resistance_mutation[match(colnames(sce_iBAR),sce$gRNA_iBAR)]
sce_iBAR <- logNormCounts(sce_iBAR)
```

Number of different iBAR-groups: `r ncol(sce_iBAR)`

Computing the average number of iBAR-groups per gRNA including cells with multiple gRNAs:

```{r}
nr_gRNAs_sce_iBAR <- table(unlist(lapply(sce_iBAR$gRNA,function(x) strsplit(x,"-")[[1]])))
mean(nr_gRNAs_sce_iBAR)
```

Average number of iBAR-groups per gRNA (including cells with multiple gRNAs): `r mean(nr_gRNAs_sce_iBAR)`.

For each gRNA, we compute the number of iBAR-groups.

```{r}
validation_lib$nr_iBAR_groups <- sapply(validation_lib$ID,function(x) sum(sce_iBAR$gRNA==x))
```

## Adding further meta-data

For iBAR-groups with several gRNAs, variant classes are ranked by their impact as follows: driver/drug addiction  > canonical drug resistance > control. iBAR groups with both driver and drug addiction conferring gRNAs are discarded. Colours indicate the class with the highest impact for each cell. 

```{r}
sce_iBAR <- sce_iBAR[,!(grepl("driver",sce_iBAR$variant_class)&grepl("drug addiction",sce_iBAR$variant_class))]
sce_iBAR$variant_class_highest_impact <- "control"
sce_iBAR$variant_class_highest_impact[grepl("canonical drug resistance",sce_iBAR$variant_class)] <- "canonical drug resistance"
sce_iBAR$variant_class_highest_impact[grepl("drug addiction",sce_iBAR$variant_class)] <- "drug addiction"
sce_iBAR$variant_class_highest_impact[grepl("driver",sce_iBAR$variant_class)] <- "driver"
```

# Pathways

We compute scores for pathway activity for each iBAR-group. 

## Pathway analysis with PROGENy

First we compute PROGENy scores, which quantify pathway activation as described in @Schubert and @Holland.                     

```{r}
logcounts_sce_iBAR <- logcounts(sce_iBAR)
rownames(logcounts_sce_iBAR) <- rowData(sce_iBAR)$V2
progeny_scores <- progeny(as.matrix(logcounts_sce_iBAR),scale=FALSE)
colnames(progeny_scores) <- paste0("PROGENy_",colnames(progeny_scores))
```

We transform the PROGENy scores linearly such that their mean across the non-targeting control gRNAs is 0 and the standard deviation across the non-targeting control gRNAs is equal to 1.

```{r scale_progeny}
progeny_scores_1gRNA <- progeny_scores[sapply(rownames(progeny_scores), function(x) str_count(x,"-") == 1),]
progeny_scores_nontargeting <- progeny_scores_1gRNA[grepl("non_",rownames(progeny_scores_1gRNA)),]
mean_nontargeting <- colMeans(progeny_scores_nontargeting)
sd_nontargeting <- apply(progeny_scores_nontargeting,2,sd)
progeny_scores <- t(apply(progeny_scores,1,function(v) (v-mean_nontargeting)/sd_nontargeting))
```


## MAYA pathway analysis

First, we compute MAYA activity scores (@Landais2023). MAYA computes for each gene set a linear low dimensional representation of gene expression using principal component analysis. For each gene set and each principal component it computes a score for each iBAR-group in the data set. The method retains those scores found to be bimodal across the data set. 


```{r maya_activity_hallmark,fig.width=10,fig.height=3}
library(MAYA)
logcounts_sce_iBAR <- logcounts(sce_iBAR)
rownames(logcounts_sce_iBAR) <- rowData(sce_iBAR)[,2]
activity_summary <- MAYA_pathway_analysis(expr_mat=logcounts_sce_iBAR,
                                        modules_list = "hallmark",
                                        is_logcpm=T)

activity_summary_ordered <- activity_summary$activity_matrix[,order(sce_iBAR$variant_class_highest_impact)]
```


# Differential expression (DE) for resistance compared to non-targeting gRNAs

To test and adjust p-values across both the transcriptome and pathways, we first construct a SingleCellExperiment containing expression data for both the genes and the pathways scores. 

```{r}
logcounts_and_pathway_scores <- rbind(logcounts(sce_iBAR),t(progeny_scores[colnames(sce_iBAR),]),
                                  activity_summary_ordered[,colnames(sce_iBAR)])
sce_logcounts_and_pathway_scores <- SingleCellExperiment(assays=list(logcounts=logcounts_and_pathway_scores),
                                        colData=colData(sce_iBAR))
rownames(sce_logcounts_and_pathway_scores)[1:nrow(sce_iBAR)] <- rowData(sce_iBAR)$V2
```

We identify those gRNAs that are associated with drug resistance or drug addiction, driver mutations, or mutations to BCL2 and EGFR with potential high impact, and that are the only gRNA for at least 10 iBAR-groups. 

```{r}
gRNAs <- names(table(sce_iBAR$gRNA[!(grepl("-",sce_iBAR$gRNA))]))[table(sce_iBAR$gRNA[!(grepl("-",sce_iBAR$gRNA))])>=10]
gRNAs_NT <- gRNAs[grepl("non_targeting",gRNAs)]
gRNAs <- setdiff(gRNAs,gRNAs_NT)
gRNAs_resistance_driver <- intersect(gRNAs,validation_lib$ID[validation_lib$variant_class!="control"])
gRNAs_test <-intersect(gRNAs,c(validation_lib$ID[validation_lib$variant_class!="control"],c("BCL2_1e8877", "EGFR_156dcf", "EGFR_53b63e")))
```

The number of such gRNAs is equal to `r length(gRNAs_test)`.

We compute differential expression across all genes as well as the PROGENy and MAYA pathway scores. First, we select those gRNAs with at least one gene or pathway with a p-value less than 10^-6. Following this selection, we control the expected average false discovery rate for DE testing for the selected gRNAs as in @BG. 

```{r, eval=FALSE}
markers <- list()
sce_tempA <- sce_logcounts_and_pathway_scores
sce_tempA$gRNA[sce_tempA$gRNA %in% gRNAs_NT] <- "nontargeting"

for (j in 1:length(gRNAs_test)){
  sce_temp1 <- sce_tempA[,grepl(gRNAs_test[j],sce_tempA$gRNA)]
  sce_temp1$gRNA <- gRNAs_test[j]
  sce_temp <- cbind(sce_tempA[,sce_tempA$gRNA=="nontargeting"],sce_temp1)
  temp <- findMarkers(sce_temp,groups=sce_temp$gRNA,test.type="wilcox")
  temp_lfc <- findMarkers(sce_temp,groups=sce_temp$gRNA,test.type="t")[[gRNAs_test[j]]]
  temp <- temp[[gRNAs_test[j]]]
  temp$lfc <- temp_lfc[rownames(temp),]$summary.logFC
  temp$gRNA <- gRNAs_test[j]
  temp$target_gene <- strsplit(gRNAs_test[j],"_")[[1]][1]
  temp$DE_gene <- rownames(temp)
  markers[[j]] <- temp[order(temp$DE_gene),]
}
names(markers) <- gRNAs_test
# correct for multiple testing 
markers_selected_and_gRNAs <- Benjamini_Bogomolov_correction(markers)
selected_gRNAs <- markers_selected_and_gRNAs$selected
markers_selected <- do.call(rbind,markers_selected_and_gRNAs$markers)
markers <- do.call(rbind,markers)
write.table(markers,file=markers_file_all,sep=",",col.names = TRUE,row.names=FALSE)
write.table(markers_selected,file=markers_file,sep=",",col.names = TRUE,row.names=FALSE)
write.table(selected_gRNAs, file_gRNA_selection, sep=",",row.names = TRUE,col.names=FALSE,quote=FALSE)
```

We also save the markers for gene expression and pathway scores separately. 

```{r}
markers <- read.table(file=markers_file,sep=",",header=TRUE)
markers_progeny <- markers[markers$DE_gene %in% colnames(progeny_scores),]
markers_MAYA <- markers[markers$DE_gene %in% rownames(activity_summary_ordered),]
markers_gex <- 
  markers[!(markers$DE_gene %in% c(colnames(progeny_scores),rownames(activity_summary_ordered))),]
write.table(markers_gex[markers_gex$FDR<0.1,],file=paste0("markers_gex_",yy,".csv"),sep=",",col.names = TRUE,row.names=FALSE)
write.table(markers_progeny[markers_progeny$FDR<0.1,],file=paste0("markers_PROGENy_",yy,".csv"),sep=",",col.names = TRUE,row.names=FALSE)
write.table(markers_MAYA[markers_MAYA$FDR<0.1,],file=paste0("markers_MAYA_",yy,".csv"),sep=",",col.names = TRUE,row.names=FALSE)
```

# Energy distances

We compute energy distances (@Replogle), between targeting and non-targeting gRNAs, using iBAR-groups instead of cells.

We first reduce the dimension of our data set by using principal components in a way guided by the specific application, based on a basis of genes consisting of marker genes from both the CBE and ABE data sets with a minimum log2-fold-change of 0.25 (the same for both of the data sets). 

```{r }
markers_all_ABE <- read.table("markersABE.csv",sep=",",header=TRUE)
markers_all_CBE <- read.table("markersCBE.csv",sep=",",header=TRUE)
markers_ABE_and_CBE <- rbind(markers_all_ABE,markers_all_CBE)

markers_lfc_ABE_and_CBE <- markers_ABE_and_CBE[markers_ABE_and_CBE$FDR<0.1 & abs(markers_ABE_and_CBE$lfc) > 0.25,]
basis_of_genes_ABE_and_CBE <- unique(markers_lfc_ABE_and_CBE$DE_gene)
options(matrixStats.useNames.NA = "deprecated")
reducedDims(sce_iBAR)$PCA_basis_ABE_and_CBE <- scater::calculatePCA(sce_iBAR[rowData(sce_iBAR)$V2%in%basis_of_genes_ABE_and_CBE,],ncomponents=20,scale=TRUE)
```

Now we compute the energy distances based on the principal components computed above. We compute the energy distance between a targeting gRNA and the non-targeting gRNAs as the median of the energy distances between the targeting gRNA and each individual non-targeting gRNA with at least 10 iBAR groups. 

```{r, eval=FALSE}
ed <- rep(NA,length(gRNAs))
names(ed) <- gRNAs
sce_NT <- sce_iBAR[,sce_iBAR$gRNA%in%gRNAs_NT]
for (j in 1:length(gRNAs)){
  pc_temp_1 <- reducedDims(sce_iBAR)$PCA_basis_ABE_and_CBE[grepl(gRNAs[j],sce_iBAR$gRNA),]
  if (nrow(pc_temp_1) > 100){
        pc_temp_1 <- pc_temp_1[sample(1:nrow(pc_temp_1),100),]
      }
   temp_vec <- c()
   for (k in 1:length(unique(sce_NT$gRNA))){
      pc_WT <- reducedDims(sce_NT)$PCA_basis_ABE_and_CBE[sce_NT$gRNA==unique(sce_NT$gRNA)[k],]
      if (nrow(pc_WT) > 100){
        pc_WT <- pc_WT[sample(1:nrow(pc_WT),100),]
      }
     temp_vec <- c(temp_vec,edist(rbind(pc_WT,pc_temp_1),c(nrow(pc_WT),nrow(pc_temp_1))))
   }
   ed[j] <- median(temp_vec)
}
```

We normalise the energy distances by dividing them by the median of the energy distances found across different non-targeting gRNAs for the respective data set.

```{r, eval=FALSE}
ed_NT <- rep(NA,length(gRNAs_NT))
names(ed_NT) <- gRNAs_NT
for (j in 1:length(gRNAs_NT)){
  sce_NT_temp <- sce_NT[,!(grepl(gRNAs_NT[j],sce_NT$gRNA))]
  pc_temp_1 <- reducedDims(sce_NT)$PCA_basis_ABE_and_CBE[grepl(gRNAs_NT[j],sce_NT$gRNA),]
  if (nrow(pc_temp_1) > 100){
        pc_temp_1 <- pc_temp_1[sample(1:nrow(pc_temp_1),100),]
      }
   temp_vec <- c()
   for (k in 1:length(unique(sce_NT_temp$gRNA))){
      pc_WT <- reducedDims(sce_NT_temp)$PCA_basis_ABE_and_CBE[sce_NT_temp$gRNA==unique(sce_NT_temp$gRNA)[k],]
      if (nrow(pc_WT) > 100){
        pc_WT <- pc_WT[sample(1:nrow(pc_WT),100),]
      }
     temp_vec <- c(temp_vec,edist(rbind(pc_WT,pc_temp_1),c(nrow(pc_WT),nrow(pc_temp_1))))
   }
   ed_NT[j] <- median(temp_vec)
}
```


```{r, eval=FALSE}
ed <- ed/median(ed_NT)
saveRDS(ed,file=file_energy_distance)
```

We add the energy distances to the gRNA meta-data.

```{r}
ed <- readRDS(file_energy_distance)
validation_lib$ed <- ed[validation_lib$ID]
```


The following is a plot of the distribution of energy distance coloured by variant class.

```{r ed_variant_class}
ggplot(validation_lib,mapping=aes(x=log2(ed),color=variant_class)) + geom_density(size=1.5) + theme_classic(base_size=20) + labs(color="variant class")+theme(legend.position = "bottom",legend.title = element_blank())+guides(color=guide_legend(nrow=2,override.aes = list(size=3)))+scale_color_manual(values=colourscheme_variant_class)+xlab("log2(normalised ed)")
```

The following plot is coloured by whether the gRNA confers resistance or not (binary). Resistance includes drivers and drug addiction.

```{r ed_resistance}
ggplot(validation_lib,mapping=aes(x=log2(ed),color=resistance_mutation)) + geom_density(size=1.5) + theme_classic(base_size=20) + labs(color="resistance mutation")+theme(legend.position = "bottom",legend.title = element_blank())+guides(color=guide_legend(nrow=2,override.aes = list(size=3)))+scale_color_manual(values=c("FALSE"="grey","TRUE"="purple"),labels=c("no resistance","resistance"))+xlab("log2(normalised ed)")
```

# Binary classification of gRNAs in terms of overall impact

For the differential expression analysis, we had selected those gRNAs with at least one gene or pathway score differentially expressed with a p-value lower than 10^-6. We then corrected the p-values from differential gene expression testing for the selected gRNAs to keep the expected average false discovery rate (FDR) across gRNAs below 0.1. The following gRNAs were selected by the p-value cutoff of 10^-6. 

```{r}
selected_gRNAs <- read.table(file_gRNA_selection, sep=",")
names_selected_gRNAs <- selected_gRNAs$V1
selected_gRNAs <- selected_gRNAs$V2
names(selected_gRNAs) <- names_selected_gRNAs
```

The number of selected gRNAs is equal to `r sum(selected_gRNAs)`.

The percentage of selected among tested gRNAs (all gRNAs with at least 10 iBAR groups and which were categorised as 
conferring canonical resistance or drug addiction, or as driver mutations) is equal to `r round(100*sum(selected_gRNAs)/length(selected_gRNAs),2)`%.

Across the selected gRNAs, we compute the number of differentially expressed genes (average expected FDR < 0.1):

```{r}

gRNAs_selected <- names(selected_gRNAs)[selected_gRNAs]
nr_DE_genes <- sapply(gRNAs_selected,function(x) sum(markers_gex$FDR<0.1&markers_gex$gRNA==x))
```

The numbers of significantly differentially expressed genes are as follows:

```{r}
nr_DE_genes
```

As the level of significance is obviously influenced by the numbers of cells for the gRNA, we compare numbers of differentially expressed genes to cell numbers:

```{r nr_cells_DE_genes,fig.width=5,fig.height=5}
nr_cells <- sapply(gRNAs_selected,function(x) sum(sce_iBAR$gRNA==x))
df_nr_cells_nr_sig <- data.frame(nr_cells=nr_cells,nr_DE_genes=nr_DE_genes,gRNA=names(nr_DE_genes))
ggplot(df_nr_cells_nr_sig,aes(x=nr_cells,y=nr_DE_genes,label=gRNA)) + geom_point() + geom_text_repel(size=4)+
  scale_y_continuous(trans="log2",limits=c(1,NA)) + theme_classic() + 
  ggtitle("Relation between number of cells per gRNA and number of DE genes")
```

We also compare the energy distances to non-targeting control gRNAs for gRNAs impacting the transcriptome versus those not impacting the transcriptome. This shows that gRNAs with a significant transcriptional impact may still have energy distances to the control gRNAs that are similar to those for gRNAs that did not pass our threshold for significant transcriptional impact, especially if only a limited number of genes are differentially expressed.  

```{r ed_impacting}
df_ed_compare <- data.frame(impacting=selected_gRNAs,ed=ed[names(selected_gRNAs)])
ggplot(df_ed_compare,aes(x=ed,fill=impacting)) + geom_density(alpha=0.3) + 
  scale_fill_manual(values=c("TRUE"="darkblue","FALSE"="grey")) + theme_classic(base_size=16)
```

Now we illustrate how the energy distances are related to the number of significantly differentially expressed genes.

```{r DE_ed, fig.width=5,fig.height=5}
df_ed_compare$nr_DE_genes <- NA
df_ed_compare$gRNA <- rownames(df_ed_compare)
df_ed_compare[gRNAs_selected,]$nr_DE_genes <- nr_DE_genes
ggplot(df_ed_compare[gRNAs_selected,],mapping=aes(x=nr_DE_genes,y=ed,label=gRNA)) + geom_point()+
   geom_text_repel()+theme_classic() + ggtitle("Number of DE genes is related to energy distance")+
  scale_x_continuous(trans="log2")
```

We add a column to the validation_lib meta data with binary values indicating whether the gRNA led to significant transcriptional impact as defined by our threshold (at least one p-val < 10^-6). 

```{r}
validation_lib$transcriptional_impact_binary <- validation_lib$ID %in% gRNAs_selected
validation_lib$transcriptional_impact_binary[!(validation_lib$ID %in% names(selected_gRNAs))] <- NA
```

How many gRNAs from each class (canoncial resistance, driver, drug addiction) have a significant impact as defined by our cutoff?

```{r sig_trans_impact_per}
validation_lib_sub <- validation_lib[validation_lib$nr_iBAR_groups >= 10 & validation_lib$Gene!="non_targeting"]#subsetting to the 
#gRNAs with at least 10 cells and not non-targeting (the gRNAs for which we did DE testing)
df_classes_impact <- data.frame(class=validation_lib_sub$variant_class,impact=validation_lib_sub$transcriptional_impact_binary)
ggplot(df_classes_impact,aes(x=class,fill=impact)) + geom_bar(aes( y=..count../tapply(..count.., ..x.. ,sum)[..x..]))+
  theme_classic(base_size=16) + xlab("") + ylab("") + scale_y_continuous(labels=scales::percent) + scale_fill_manual(values=c("TRUE"="black","FALSE"="grey"))+
  labs(fill="significant transcriptional impact") + theme(legend.position = "bottom")
```

# Visualising differential expression 

In this section we visualise log2-fold changes between impacted and control gRNAs. 

## Heatmap plot of top differentially expressed genes

The following heatmap includes genes that are statistically significantly (at FDR 0.1) differentially expressed with log2-fold-change > 0.5 for at least 1 gRNA. Using hierarchical clustering, we identify two groups of genes that are distinct in terms of their patterns of up- or downregulation across gRNAs. We also cluster the gRNAs. 

```{r heatmap_gex}
markers <- read.table(file=markers_file,sep=",",header=TRUE)
markers_progeny <- markers[markers$DE_gene %in% colnames(progeny_scores),]
markers_MAYA <- markers[markers$DE_gene %in% rownames(activity_summary_ordered),]
markers_gex <- 
  markers[!(markers$DE_gene %in% c(colnames(progeny_scores),rownames(activity_summary_ordered))),]

top_marker_genes <- unique(markers_gex$DE_gene[markers_gex$FDR<=0.1&abs(markers_gex$lfc)>0.5])
logcounts_top_markers <- logcounts(sce_iBAR[match(top_marker_genes,rowData(sce_iBAR)$V2),])
# averaging gene expression on per-gRNA basis
av_logcount_per_gRNA <- sapply(1:length(gRNAs),function(x) return(rowMeans(logcounts_top_markers[,grepl(gRNAs[x],sce_iBAR$gRNA)])))
colnames(av_logcount_per_gRNA) <- gRNAs
rownames(av_logcount_per_gRNA) <- top_marker_genes
av_logcount_per_gRNA_scaled <- t(apply(av_logcount_per_gRNA,1,function(x) (x-mean(x))/sd(x)))
df_logcounts <- data.frame(variant_class=validation_lib$variant_class[match(colnames(av_logcount_per_gRNA),validation_lib$ID)])

rownames(df_logcounts) <- colnames(av_logcount_per_gRNA)
cols_logcounts <- list(variant_class=colourscheme_variant_class)

library(viridis)

callback = function(hc, mat){
    sv = svd(t(mat))$v[,1]
    dend = reorder(as.dendrogram(hc), wts = sv)
    as.hclust(dend)
}

library(dendsort)

callback = function(hc, ...){dendsort(hc)}

if (yy=="ABE"){
  cellheight=0.2
}else{cellheight=1}
heatmap_markers <- pheatmap(av_logcount_per_gRNA_scaled,annotation_col = df_logcounts,annotation_colors = cols_logcounts,show_rownames = FALSE,
        treeheight_row = 0,show_colnames = FALSE,color=colorRampPalette(c("Darkblue", "white","red"))(20),cutree_rows=2,cutree_cols = 4,clustering_callback = callback,cellheight = cellheight)
```

We add the cluster information to the gRNA meta-data. 

```{r}
gRNA_clusters <- cutree(heatmap_markers$tree_col,4)
validation_lib$cluster <- NA
validation_lib$cluster[match(names(gRNA_clusters),validation_lib$ID)] <- gRNA_clusters
```

Finally, we replot the heatmap with the same order of gRNAs, only plotting cell-cycle related genes, and restricting to gRNAs found to be resistant in the pooled screens and having a significant overall transcriptional effect. The heatmap includes only strongly affected cell cycle related genes, where we used the following cutoff for inclusion in the figure: an absolute log2-fold change of at least abs(lfc) > 0.75 and FDR < 0.001 for at least one gRNA. 

```{r heatmap_markers_top,fig.width=16,fig.height=4}
file_cc_genes <- "../../GO.0007049.csv"
cc_genes_ens <- unique(read.table(file_cc_genes,sep=",")$V1)
cc_genes <- rowData(sce)$V2[match(cc_genes_ens,rowData(sce)$V1)]
top_markers_rep <- table(markers$DE_gene[markers$FDR<10^-3&abs(markers$lfc)>0.75&markers$DE_gene%in%cc_genes])
frequent_marker_genes <- names(top_markers_rep)
av_logcount_per_gRNA_scaled_ordered <- av_logcount_per_gRNA_scaled[,heatmap_markers$tree_col$order]
av_logcount_per_gRNA_scaled_ordered <- av_logcount_per_gRNA_scaled_ordered[,colnames(av_logcount_per_gRNA_scaled_ordered)%in%validation_lib$ID[validation_lib$variant_class%in%c("driver","drug addiction","canonical drug resistance") & validation_lib$transcriptional_impact_binary]]

av_logcount_per_gRNA_scaled_ordered_sub <- av_logcount_per_gRNA_scaled_ordered[rownames(av_logcount_per_gRNA_scaled_ordered)%in%intersect(cc_genes,frequent_marker_genes),]
df_logcounts_sub <- df_logcounts
cols_logcounts_sub <- cols_logcounts
cols_logcounts_sub$cluster <- c("4"="purple","2"="blue","3"="black","1"="lightgrey")
df_logcounts_sub$cluster <- as.factor(cutree(heatmap_markers$tree_col,4)[rownames(df_logcounts_sub)])
df_logcounts_sub$log2_ed <- log2(ed[rownames(df_logcounts_sub)])
df_logcounts_sub <- df_logcounts_sub[colnames(av_logcount_per_gRNA_scaled_ordered_sub),]
# rowgaps <- c(sum(df_logcounts_sub$cluster==1),sum(df_logcounts_sub$cluster%in%c(1,2)),
#                       sum(df_logcounts_sub$cluster%in%c(1,2,3)), sum(df_logcounts_sub$cluster!=4))

pheatmap(t(av_logcount_per_gRNA_scaled_ordered_sub),annotation_row = df_logcounts_sub[,c("variant_class","cluster","log2_ed")],annotation_colors = cols_logcounts_sub,treeheight_row = 0,
        show_rownames=TRUE,color=colorRampPalette(c("Darkblue", "white","red"))(20),
        cluster_rows=FALSE,cutree_cols=2,treeheight_col=0,cellheight=13)

```

## Heatmap plot of progeny scores

We plot the heatmap of PROGENy scores for all gRNAs in the same order as for the gene expression heatmap. 

```{r}
gRNA_pathway_mean <- do.call(rbind,lapply(1:length(gRNAs),function(x) colMeans(progeny_scores[grepl(gRNAs[x],rownames(progeny_scores)),,drop=FALSE])))
rownames(gRNA_pathway_mean) <- gRNAs
gRNA_pathway_mean <- gRNA_pathway_mean[colnames(av_logcount_per_gRNA_scaled),]
colnames(gRNA_pathway_mean) <- sapply(colnames(gRNA_pathway_mean),function(x) strsplit(x,"_")[[1]][2])

df_row_pathway <- data.frame(variant_class=validation_lib$variant_class[match(gRNAs,validation_lib$ID)],
                            cluster =  validation_lib$cluster[match(gRNAs,validation_lib$ID)])

rownames(df_row_pathway) <- gRNAs
colors_pathway_heatmap <- list(variant_class=colourscheme_variant_class[unique(df_row_pathway$variant_class)],
                               cluster =  c("4"="purple","2"="blue","3"="black","1"="lightgrey"))
max_val <- max(gRNA_pathway_mean)
min_val <- min(gRNA_pathway_mean)
nr_colors <- 30
col_pathway = colorRampPalette(c("Darkblue", "white","red"))(nr_colors)

breaks_pathway = c(seq(min_val, 0, 
                      length.out=ceiling(nr_colors/2) + 1),
                  seq(max_val/nr_colors, 
                      max_val, length.out=floor(nr_colors/2)))
pheatmap(t(gRNA_pathway_mean),annotation_col=df_row_pathway,annotation_colors = colors_pathway_heatmap ,show_colnames = FALSE,cluster_cols=TRUE,cutree_cols=3,cellheight = 10,treeheight_row = 0,breaks=breaks_pathway,color=col_pathway)
```

## Heatmap illustration of differential expression of pathway scores

The following heatmap illustrates the log2-fold-changes compared to nontargeting gRNAs for the PROGENy pathway scores. The plot includes all gRNAs tested for DE (i.e. gRNAs that were not found to act like controls in pooled screens and had at least 10 iBAR groups assigned to them), even those without differentially expressed genes or pathways. 

```{r heatmap_lfc_progeny,fig.height=5}
markers_all <- read.table(markers_file_all,sep=",",header=TRUE)
markers_all_progeny <- markers_all[markers_all$DE_gene%in%colnames(progeny_scores),]
markers_all_progeny$ID <- paste0(markers_all_progeny$DE_gene,"_",markers_all_progeny$target_gene)
markers_all_progeny$FDR <- NA
markers_progeny$ID <- paste0(markers_progeny$DE_gene,"_",markers_progeny$target_gene)
markers_all_progeny$FDR[match(markers_progeny$ID,markers_all_progeny$ID)] <-   markers_progeny$FDR

df <- as.data.frame(markers_all_progeny)
df$sig <- "not_significant"
df$sig[df$FDR<0.1] <- "significant"
pp <- ggplot(df,mapping = aes(y = sapply(DE_gene,function(x) strsplit(x,"_")[[1]][2]), x = gRNA,size=sig)) +
  geom_raster(aes(fill=lfc)) +
  labs(y="", x="") +
  theme_bw(base_size=16) + theme(axis.text.x=element_text(size=14, angle=90, vjust=0.3),
                     axis.text.y=element_text(size=14),
                     plot.title=element_text(size=15)) + labs(fill="lfc")+
                     scale_fill_gradient2(
        oob = scales::squish,low="darkblue",high="red") + geom_point() +
  scale_size_manual(values=c(significant=2, not_significant=NA),guide="none")

print(pp+ggtitle("DE for PROGENy scores compared to nontargeting control"))
```

Below is the corresponding plot for the MAYA pathway scores. 

```{r heatmap_lfc_MAYA_hallmark,fig.width=10,fig.height=5}
markers_all_MAYA <- markers_all[markers_all$DE_gene%in%rownames(activity_summary_ordered),]
markers_all_MAYA$ID <- paste0(markers_all_MAYA$DE_gene,"_",markers_all_MAYA$target_gene)
markers_all_MAYA$FDR <- NA
markers_MAYA$ID <- paste0(markers_MAYA$DE_gene,"_",markers_MAYA$target_gene)
markers_all_MAYA$FDR[match(markers_MAYA$ID,markers_all_MAYA$ID)] <-   markers_MAYA$FDR
df <- as.data.frame(markers_all_MAYA)
df$sig <- "not_significant"
df$sig[df$FDR<0.1] <- "significant"
pp <- ggplot(df,mapping = aes(y = DE_gene, x = gRNA,size=sig)) +
  geom_raster(aes(fill=lfc)) +
  labs(y="", x="") +
  theme_bw() + theme(axis.text.x=element_text(size=9, angle=90, vjust=0.3),
                     axis.text.y=element_text(size=9),
                     plot.title=element_text(size=11)) + labs(fill="lfc")+
                    scale_fill_gradient2(
        oob = scales::squish,low="darkblue",high="red")+ geom_point() +
  scale_size_manual(values=c(significant=2, not_significant=NA),guide="none")

print(pp+ggtitle("DE for MAYA_hallmark scores compared to nontargeting control"))
```


# Low dimensional representation of the data

We perform PCA for further downstream analysis and visualisation (energy distance, UMAP, diffusion score).

We use the genes identified above as differentially expressed between a gRNA and non-targeting controls as a basis of genes on which to perform principal component analysis (PCA). Genes are included if for at least one gRNA they have an FDR<0.1 and an abs(lfc) > 0.25. Unlike for the energy distances, for which we used a shared basis of genes for the CBE and ABE data sets for better comparability, we now use a basis of genes identified using the `r yy` data set only. We use these genes to compute UMAP and diffusion map representations. 

```{r }
markers <-  read.table(markers_file,sep=",",header=TRUE)
markers_lfc <- markers[abs(markers$lfc)>0.25 & markers$FDR<0.1,]
basis_of_genes <- unique(markers_lfc$DE_gene)
```

```{r }
reducedDims(sce_iBAR)$PCA_basis <- calculatePCA(sce_iBAR[rowData(sce_iBAR)$V2%in%basis_of_genes,],ncomponents=20,scale=TRUE)
```


```{r}
sce_iBAR <- runUMAP(sce_iBAR,dimred="PCA_basis")
```

We now plot the UMAP representation of the data. Each iBAR-group is a dot. For iBAR-groups with several gRNAs, variant classes are ranked by their impact as follows: driver/drug addiction  > canonical drug resistance > control. iBAR groups with both driver and drug addiction conferring gRNAs are discarded. Colours indicate the class with the highest impact for each cell.

```{r, umap_all}
sce_iBAR <- sce_iBAR[,!(grepl("driver",sce_iBAR$variant_class)&grepl("drug addiction",sce_iBAR$variant_class))]
sce_iBAR$variant_class_highest_impact <- "control"
sce_iBAR$variant_class_highest_impact[grepl("canonical drug resistance",sce_iBAR$variant_class)] <- "canonical drug resistance"
sce_iBAR$variant_class_highest_impact[grepl("drug addiction",sce_iBAR$variant_class)] <- "drug addiction"
sce_iBAR$variant_class_highest_impact[grepl("driver",sce_iBAR$variant_class)] <- "driver"

plotReducedDim(cbind(sce_iBAR[,sce_iBAR$variant_class_highest_impact=="control"],sce_iBAR[,sce_iBAR$variant_class_highest_impact!="control"]),colour_by="variant_class_highest_impact",dimred="UMAP") +
  scale_colour_manual(values=colourscheme_variant_class)+theme(legend.position = "bottom",legend.title = element_blank())+guides(color=guide_legend(nrow=2,override.aes = list(size=3)))
```

# Correlation of differential expression across gRNAs

We plot the correlation of differential expression from non-targeting controls (measured by the log-fold change) across all classes of resistance targets.

```{r cor_lfc,fig.width=14,fig.height=10}
markers_all <- read.table(markers_file_all,sep=",",header=TRUE)
markers_all_gex <- markers_all[!(markers_all$DE_gene%in%c(rownames(progeny_scores),colnames(activity_summary_ordered))),]
cor_effect <- matrix(0,nrow=length(gRNAs_resistance_driver),ncol=length(gRNAs_resistance_driver))
rownames(cor_effect) <- gRNAs_resistance_driver
colnames(cor_effect) <- gRNAs_resistance_driver
for (j in 2:length(gRNAs_resistance_driver)){
  for (k in 1:(j-1)){
    cor_effect[j,k] <- cor(markers_all_gex$lfc[markers_all_gex$gRNA== gRNAs_resistance_driver[k]],
    markers_all_gex$lfc[markers_all_gex$gRNA== gRNAs_resistance_driver[j]])
  }
}
cor_effect <- cor_effect+t(cor_effect)
for (j in 1:nrow(cor_effect)){
  cor_effect[j,j] <- 1
}

df_row_cor <- data.frame(variant_class=validation_lib$variant_class[match(colnames(cor_effect),validation_lib$ID)],
                     gene=validation_lib$Gene[match(colnames(cor_effect),validation_lib$ID)])

rownames(df_row_cor) <- rownames(cor_effect)
col_genes <- brewer.pal(length(unique(df_row_cor$gene)),"Dark2")
names(col_genes) <- unique(df_row_cor$gene)
cols_cor <- list(variant_class=colourscheme_variant_class,gene=col_genes)

pheatmap(cor_effect,annotation_col = df_row_cor,annotation_colors = cols_cor)
```

We restrict the plot to drivers/drug addiction

```{r cor_DA}
gRNAs_DA <- intersect(gRNAs_resistance_driver,validation_lib$ID[validation_lib$variant_class!="canonical drug resistance"])

pheatmap(cor_effect[gRNAs_DA,gRNAs_DA],annotation_col = df_row_cor,annotation_colors = cols_cor)
```

# Progression-free survival (PFS) outcome score

We use DE expression data (log2-fold change) for patients with PFS > 6 months and patients with PFS < 6 months from the @Tian2023. 
First, we select the genes with adjusted p-values < 0.1 and abs(log2-fold change) > 0.25 for differential expression at 15 days versus pre-treatment for PFS<6m or PFS>6m.

```{r}
DEG_NR <- read.table("../../Tian_et_al_NR.csv",sep=",",header=TRUE)
DEG_R <- read.table("../../Tian_et_al_R.csv",sep=",",header=TRUE)
genes_preselected <- 
  unique(c(DEG_NR$gene[DEG_NR$p_val_adj<0.01&abs(DEG_NR$avg_log2FC)>0.25],
           DEG_R$gene[DEG_R$p_val_adj<0.01&abs(DEG_R$avg_log2FC)>0.25]))
```


We compute rank correlation between our differential expression results and Tian et al.'s DE results for patients with progression free survival (PFS) < 6 months, based on the genes selected above. 

```{r DEG_PFS_outcome}
cor_R <- rep(0,length(gRNAs_resistance_driver))
names(cor_R) <- gRNAs_resistance_driver
cor_NR <- rep(0,length(gRNAs_resistance_driver))
names(cor_NR) <- gRNAs_resistance_driver
for (j in 1:length(gRNAs_resistance_driver)){
  markers_temp <- markers_all_gex[markers_all_gex$gRNA==gRNAs_resistance_driver[j],]
  markers_temp <- markers_temp[match(DEG_R$gene,markers_temp$DE_gene),]
  markers_temp <- markers_temp[match(genes_preselected,markers_temp$DE_gene),]
  cor_R[j] <- cor(markers_temp$lfc,DEG_R$avg_log2FC[match(genes_preselected,DEG_R$gene)],method="spearman")
  cor_NR[j] <- cor(markers_temp$lfc,DEG_NR$avg_log2FC[match(genes_preselected,DEG_NR$gene)],method="spearman")

}
df_cor_R <- data.frame(cor_R=cor_R,gRNA=names(cor_R),variant_type=validation_lib$variant_class[match(names(cor_R),validation_lib$ID)])
df_cor_R <- df_cor_R[order(cor_R),]
df_cor_R$gRNA <- factor(df_cor_R$gRNA,levels=df_cor_R$gRNA)
ggplot(df_cor_R,mapping=aes(y=cor_R,x=gRNA,fill=variant_type)) + geom_bar(stat="identity") + coord_flip()+scale_fill_manual(values=colourscheme_variant_class[names(colourscheme_variant_class)!="control"])+theme_classic()+ylab("PFS_outcome score")
```


We compare PFS outcome scores across the different variant classes.

```{r PFS_outcome_score_box}
gRNAs_lfc <- sce_iBAR$gRNA[!(grepl("-",sce_iBAR$gRNA))]
gRNAs_lfc <- names(table(gRNAs_lfc))[table(gRNAs_lfc)>=10]

lfc_all <- matrix(NA,ncol=length(gRNAs_lfc),nrow=nrow(sce_iBAR))
names(lfc_all) <- gRNAs_lfc
for (j in 1:length(gRNAs_lfc)){
  sce_temp <- sce_iBAR[,sce_iBAR$gRNA%in%c(gRNAs_lfc[j],gRNAs_NT)]
  sce_temp$gRNA[sce_temp$gRNA %in% setdiff(gRNAs_NT,gRNAs_lfc[j])] <- "nontargeting"
  a <- findMarkers(sce_temp,groups=sce_temp$gRNA,test.type="t")[[gRNAs_lfc[j]]][rownames(sce_temp),]
  lfc_all[,j] <- a$summary.logFC
    
}


cor_R_all <- rep(NA,length(gRNAs_lfc))
names(cor_R_all) <- gRNAs_lfc
for (j in 1:length(gRNAs_lfc)){
  cor_R_all[j] <- cor(lfc_all[match(genes_preselected,rowData(sce_iBAR)$V2),j],DEG_R$avg_log2FC[match(genes_preselected,DEG_R$gene)],method="spearman")
  
}

df_cor_R_all <- data.frame(gRNA=names(cor_R_all),PFS_outcome_score=cor_R_all,variant_class=validation_lib$variant_class[match(names(cor_R_all),validation_lib$ID)])
df_cor_R_all$variant_class[grepl("non_targeting",df_cor_R_all$gRNA)] <- "non_targeting_control"
colourscheme_temp <- c(colourscheme_variant_class,"non_targeting_control"="black")
if (yy=="ABE"){
  colourscheme_temp <- colourscheme_temp[setdiff(names(colourscheme_temp),"driver")]
  df_cor_R_all <- df_cor_R_all[df_cor_R_all$variant_class!="driver",]
}else{
  colourscheme_temp <-colourscheme_temp[setdiff(names(colourscheme_temp),"drug addiction")]
}

median_PFS_outcome_score_per_variant_class <- df_cor_R_all[,c("PFS_outcome_score","variant_class")] %>% group_by(variant_class)%>% summarize(median_PFS_outcome_score=median(PFS_outcome_score))
median_PFS_outcome_score_per_variant_class <- median_PFS_outcome_score_per_variant_class[order(median_PFS_outcome_score_per_variant_class$median_PFS_outcome_score),]
df_cor_R_all$variant_class <- factor(df_cor_R_all$variant_class,levels=median_PFS_outcome_score_per_variant_class$variant_class)

nr_per_variant_class <- df_cor_R_all[,c("PFS_outcome_score","variant_class")] %>% group_by(variant_class)%>% dplyr::count()
print(nr_per_variant_class)

ggplot(df_cor_R_all,aes(y=PFS_outcome_score,x=variant_class,color=variant_class)) +  geom_boxplot(outliers=FALSE) + geom_jitter() + scale_color_manual(values=colourscheme_temp)+theme_classic(base_size=14)+xlab("")+theme(
  axis.text.x = element_blank())
```


We test for significance of difference of PSF outcome scores across variant classes. 

```{r wilcox_PFS_outcome_score,fig.height=2,fig.width=10}
wilcox_PFS_outcome_score <- pairwise.wilcox.test(df_cor_R_all$PFS_outcome_score,g=df_cor_R_all$variant_class,p.adjust.method = "BH")
wilcox_PFS_outcome_score_p_val <- reshape2::melt(signif(wilcox_PFS_outcome_score$p.value,2))
names(wilcox_PFS_outcome_score_p_val) <- c("variant_class_1","variant_class_2","FDR")
if (yy=="ABE"){
  wilcox_PFS_outcome_score_p_val$variant_class_1 <- factor(wilcox_PFS_outcome_score_p_val$variant_class_1,levels=c("non_targeting_control","control","canonical drug resistance","drug addiction"))
  wilcox_PFS_outcome_score_p_val$variant_class_2 <- factor(wilcox_PFS_outcome_score_p_val$variant_class_2,levels=c("non_targeting_control","control","canonical drug resistance","drug addiction"))
}else{
  wilcox_PFS_outcome_score_p_val$variant_class_1 <- factor(wilcox_PFS_outcome_score_p_val$variant_class_1,levels=c("non_targeting_control","control","canonical drug resistance","driver"))
  wilcox_PFS_outcome_score_p_val$variant_class_2 <- factor(wilcox_PFS_outcome_score_p_val$variant_class_2,levels=c("non_targeting_control","control","canonical drug resistance","driver"))
}

ggplot(wilcox_PFS_outcome_score_p_val,aes(x=variant_class_1,y=variant_class_2,fill=-log10(FDR),label=FDR)) + geom_raster()+geom_text(size=5,color="white")+theme_classic(base_size=14)+xlab("")+ylab("")+scale_fill_continuous_tableau(na.value="white",palette="Red-Gold")+ggtitle(paste0("p-values: PFS outcome score: ",yy))
```


We add the PFS_outcome score to the SingleCellExperiment, for iBAR clones with one resistance gRNA and no other gRNAs.

```{r}
sce_iBAR$PFS_outcome_score <- NA
for (j in 1:length(cor_R)){
  sce_iBAR$PFS_outcome_score[sce_iBAR$gRNA == names(cor_R)[j]] <- cor_R[j]
}
```


# Diffusion score

We compute diffusion scores as in @Haghverdi and @scsnv-seq, for those resistance-conferring gRNAs (assessment of resistance in the pooled screens) that are the unique gRNA in at least 10 iBAR groups.

```{r dm_DA,fig.height=4}
sce_iBAR_resistance <- sce_iBAR[,sce_iBAR$variant_class_highest_impact!="control"]
markers_lfc$variant_class <- validation_lib$variant_class[match(markers_lfc$gRNA,validation_lib$ID)]
markers_lfc_DA <- markers_lfc[markers_lfc$variant_class!="canonical drug resistance",]
nr_DE_genes <- table(markers_lfc_DA$gRNA)
sce_iBAR_resistance$nr_DE_genes <- 0
for (j in 1:length(nr_DE_genes)){
  sce_iBAR_resistance$nr_DE_genes[grepl(names(nr_DE_genes)[j],sce_iBAR_resistance$gRNA)] <- sce_iBAR_resistance$nr_DE_genes[grepl(names(nr_DE_genes)[j],sce_iBAR_resistance$gRNA)] + nr_DE_genes[j]
}

dm <- DiffusionMap(reducedDims(sce_iBAR_resistance)$PCA_basis)
if (cor(dm$DC1,sce_iBAR_resistance$nr_DE_genes) < 0){
  eigenvectors(dm) <- -eigenvectors(dm)
}

sce_iBAR_resistance$diffusion_score <- dm$DC1
sce_iBAR$diffusion_score <- NA
sce_iBAR[,colnames(sce_iBAR_resistance)]$diffusion_score <- sce_iBAR_resistance$diffusion_score

xx <- sample(1:length(dm$DC1))
tmp <- data.frame(DC1 = eigenvectors(dm)[xx, 1],
                  DC2 = eigenvectors(dm)[xx, 2],
                  target_gene =sce_iBAR_resistance$target_gene[xx],
                  gRNA =sce_iBAR_resistance$gRNA[xx],
                  nr_DE_genes=sce_iBAR_resistance$nr_DE_genes[xx],
                  dpt=dm$DC1[xx])
```


The following is a diffusion map coloured by the diffusion score.

```{r diffusion_map_diffusion_score}
p1c <- ggplot(tmp, aes(x = DC1, y = DC2, colour = dpt)) +
  geom_point() +
  xlab("Diffusion component 1") +
  ylab("Diffusion component 2") +
  theme_classic(base_size=11) + theme(legend.position = "bottom",legend.box="vertical",legend.margin=margin())+
  labs(color="")+scale_color_viridis_c(option="magma")+ggtitle("diffusion score")
print(p1c)

```

The following plot shows the distribution of diffusion scores for each driver/drug addiction gRNA, ordered horizontally by their mean diffusion scores. For several of the gRNAs the diffusion scores have a multi-modal distribution, indicating either differences in editing between iBAR-groups of the same gRNA, or different responses to the same edits.

```{r ggridges_plot_diffusion_score}
sce_iBAR_resistance_one_gRNA <- sce_iBAR_resistance[,!(grepl("-",sce_iBAR_resistance$gRNA))]
df_gRNA_ds_mean <- unlist(lapply(1:length(gRNAs_resistance_driver),function(x) mean(sce_iBAR_resistance_one_gRNA$diffusion_score[sce_iBAR_resistance_one_gRNA$gRNA==gRNAs_resistance_driver[x]])))
names(df_gRNA_ds_mean ) <- gRNAs_resistance_driver

levels_a <- names(df_gRNA_ds_mean)[order(df_gRNA_ds_mean,decreasing=TRUE)]


df <- data.frame(diffusion_score=sce_iBAR_resistance_one_gRNA$diffusion_score,gRNA=sce_iBAR_resistance_one_gRNA$gRNA)

split_gRNAs <- function(line_df){
  output <- line_df
  if (sum(grepl("-",line_df$gRNA))>=1)
    {
    w <- strsplit(line_df$gRNA,"-")[[1]]
      for (k in 1:length(w)){
        output <- rbind(output,unlist(c(line_df[1],w[k])))
      }
    }
  return(output)
}

split_gRNAs_df <- function(d_f){
  output <- c()
  for (j in 1:nrow(d_f)){
    output <- rbind(output,split_gRNAs(d_f[j,]))  }
  output <- output[!(grepl("-",output$gRNA)),]
}

df <- split_gRNAs_df(df)
df <- df[df$gRNA%in%gRNAs_resistance_driver,]


df$diffusion_score <- as.double(df$diffusion_score)
df$mean_diffusion_score <- df_gRNA_ds_mean[as.vector(df$gRNA)]
df$variant_class <- validation_lib$variant_class[match(as.vector(df$gRNA),validation_lib$ID)]
df$gRNA=factor(df$gRNA,levels=levels_a)

y_lab_colours <- colourscheme_variant_class[validation_lib$variant_class[match(levels_a,validation_lib$ID)]]

p2 <- ggplot(as.data.frame(df),aes(x=diffusion_score,y=gRNA,fill=mean_diffusion_score)) + ggridges::geom_density_ridges() + scale_fill_viridis_c(option="magma")+
  theme_classic()+theme(axis.text.y = element_text(color=y_lab_colours))+ylab("")
print(p2)
```

Now we re-colour the plot above by PFS_outcome score, keeping the horizontal ordering of the gRNAs as above.


```{r ggridges_plot_PFS_outcome_score}
df_unique_gRNA <- df[!(grepl("-",df$gRNA)),]
df_unique_gRNA$PFS_outcome_score <- cor_R[as.vector(df_unique_gRNA$gRNA)]

p3 <- ggplot(as.data.frame(df_unique_gRNA),aes(x=diffusion_score,y=gRNA,fill=PFS_outcome_score)) + ggridges::geom_density_ridges() + scale_fill_viridis_c()+
  theme_classic()+theme(axis.text.y = element_text(color=y_lab_colours))+ylab("")
print(p3)
```

The following scatterplot compares average diffusion scores for each gRNA to survival scores, illustrating the high negative correlation between the two scores.

```{r scatter_diffusion_PFS_outcome}
library(ggrepel)
df_scatter_diffusion_PFS_outcome <- data.frame(diffusion_score=df_gRNA_ds_mean,PFS_outcome_score=cor_R[names(df_gRNA_ds_mean)],
                                            gRNA=names(df_gRNA_ds_mean),gene=sapply(names(df_gRNA_ds_mean),function(x) strsplit(x,"_")[[1]][1]))
ggplot() + geom_point(data=df_scatter_diffusion_PFS_outcome,mapping=aes(x=diffusion_score,y=PFS_outcome_score,label=gRNA,color=gene))+ geom_smooth(data=df_scatter_diffusion_PFS_outcome,mapping=aes(x=diffusion_score,y=PFS_outcome_score),alpha=0.2)+theme_classic()+geom_text_repel(data=df_scatter_diffusion_PFS_outcome,mapping=aes(x=diffusion_score,y=PFS_outcome_score,label=gRNA,color=gene))+theme(legend.position = "None")
```

Correlation between diffusion and PFS_outcome score: `r cor(df_scatter_diffusion_PFS_outcome$diffusion_score,df_scatter_diffusion_PFS_outcome$PFS_outcome_score)`.


# Differential gene expression plots for each gRNA (driver and drug addiction)

We plot up to 20 upregulated and 20 downregulated genes (the most statistically significant ones), using the markers computed previously for all the gRNAs passing our threshold for transcriptional impact (p-value < 10^-6 for at least one gene or pathway).

```{r marker_plots_driver_da}
for (j in 1:length(gRNAs_selected)){
  markers_temp <- markers_gex[markers_gex$gRNA==gRNAs_selected[j],]
  sig <- rep("not significant",nrow(markers_temp))
  sig[markers_temp$FDR<0.1&markers_temp$lfc<0] <- "down"
  sig[markers_temp$FDR<0.1&markers_temp$lfc>0] <- "up"
  df_temp <- data.frame(lfc=markers_temp$lfc,FDR=markers_temp$FDR,gene_name=markers_temp$DE_gene,
                        sig=sig)
  df_highlight_up <- df_temp[df_temp$FDR<0.1 & df_temp$lfc>0,]
  df_highlight_up <- df_highlight_up[order(df_highlight_up$FDR)[1:(min(20,nrow(df_highlight_up)))],]

  df_highlight_down <- df_temp[df_temp$FDR<0.1 & df_temp$lfc<0,]
  df_highlight_down <- df_highlight_down[order(df_highlight_down$FDR)[1:(min(20,nrow(df_highlight_down)))],]
  pp <- ggplot(df_temp, aes(x=lfc, y=-log10(FDR),color=sig)) +
    geom_point() +
    ggtitle(gRNAs_selected[j]) +
    labs(y=expression('-Log'[10]*' FDR'), x=expression('lfc')) +
    theme_classic(base_size=20) +
    theme(legend.position="none", plot.title = element_text(size = rel(1), hjust = 0.5))+
    geom_text_repel(data=rbind(df_highlight_up,df_highlight_down),aes(x = lfc, y = -log10(FDR),label=gene_name),max.overlaps=100)+
    scale_color_manual(values=c("not significant"='lightgray',"down"='red',"up"='blue'))+ scale_y_continuous(trans=scales::pseudo_log_trans(sigma=1,base = 1.05))+
  geom_hline(yintercept = 1)
  print(pp)
}
```


# Differential expression across gRNA groups

Now we perform differential expression for PROGENy scores across the variant types. To give equal weight to each gRNA, we subsample the same number of iBAR groups for each gRNA for any gRNA with more than 20 iBAR groups.

```{r}
sce_canonical_drug_resistance <- sce_logcounts_and_pathway_scores[colnames(progeny_scores),sce_logcounts_and_pathway_scores$variant_class_highest_impact=="canonical drug resistance"]
gRNAs_canonical_drug_resistance <- validation_lib$ID[validation_lib$variant_class=="canonical drug resistance"]
sce_canonical_drug_resistance_balanced <- sce_logcounts_and_pathway_scores[,sce_logcounts_and_pathway_scores$gRNA==""]
for (j in 1:length(gRNAs_canonical_drug_resistance)){
  xx_temp <- colnames(sce_logcounts_and_pathway_scores)[grepl(gRNAs_canonical_drug_resistance[j],sce_logcounts_and_pathway_scores$gRNA)]
  if (length(xx_temp)>=20){
  xx_temp <- sample(xx_temp,20)}
  sce_canonical_drug_resistance_balanced  <- cbind(sce_canonical_drug_resistance_balanced ,sce_logcounts_and_pathway_scores[,xx_temp])
}

aa <- ifelse(yy=="ABE","drug addiction","driver") 
sce_other_impact <- sce_logcounts_and_pathway_scores[,sce_logcounts_and_pathway_scores$variant_class_highest_impact==aa] 
gRNAs_other_impact <- validation_lib$ID[validation_lib$variant_class==aa] 
sce_other_impact_balanced <- sce_logcounts_and_pathway_scores[,sce_logcounts_and_pathway_scores$gRNA==""] 
for (j in 1:length(gRNAs_other_impact)){ 
   xx_temp <- colnames(sce_logcounts_and_pathway_scores)[grepl(gRNAs_other_impact[j],sce_logcounts_and_pathway_scores$gRNA)] 
   if (length(xx_temp)>=20){
   xx_temp <- sample(xx_temp,20)}
   sce_other_impact_balanced  <- cbind(sce_other_impact_balanced ,sce_logcounts_and_pathway_scores[,xx_temp])
 }

 markers_other_impact_versus_canonical <- findMarkers(cbind(sce_canonical_drug_resistance_balanced,sce_other_impact_balanced),  groups=c(rep("canonical",ncol(sce_canonical_drug_resistance_balanced)), 
                                                               rep(aa,ncol(sce_other_impact_balanced))),test.type="wilcox")[[aa]] 
 markers_other_impact_versus_canonical_lfc <- findMarkers(cbind(sce_canonical_drug_resistance_balanced,sce_other_impact_balanced), 
                                   groups=c(rep("canonical",ncol(sce_canonical_drug_resistance_balanced)), 
                                    rep(aa,ncol(sce_other_impact_balanced))),test.type="t")[[aa]] 
 markers_other_impact_versus_canonical$lfc <- markers_other_impact_versus_canonical_lfc[rownames(markers_other_impact_versus_canonical),]$summary.logFC 
 markers_other_impact_versus_canonical$gene <- rownames(markers_other_impact_versus_canonical)

 
 markers_other_impact_versus_canonical_gex <- 
   markers_other_impact_versus_canonical[!(markers_other_impact_versus_canonical$gene %in% c(colnames(progeny_scores),rownames(activity_summary_ordered))),]
 
  sig <- rep("not significant",nrow(markers_other_impact_versus_canonical_gex)) 
 sig[markers_other_impact_versus_canonical_gex$FDR<0.1&markers_other_impact_versus_canonical_gex$lfc<0] <- "down" 
 sig[markers_other_impact_versus_canonical_gex$FDR<0.1&markers_other_impact_versus_canonical_gex$lfc>0] <- "up" 
 
 df_temp <- data.frame(lfc=markers_other_impact_versus_canonical_gex$lfc,FDR=markers_other_impact_versus_canonical_gex$FDR, 
                       gene_name=markers_other_impact_versus_canonical_gex$gene,sig=sig) 
 df_highlight_up <- df_temp[df_temp$FDR<0.1 & df_temp$lfc>0.25,] 
 df_highlight_up <- df_highlight_up[order(df_highlight_up$FDR)[1:(min(20,nrow(df_highlight_up)))],] 

 df_highlight_down <- df_temp[df_temp$FDR<0.1 & df_temp$lfc< -0.25,] 
 df_highlight_down <- df_highlight_down[order(df_highlight_down$FDR)[1:(min(20,nrow(df_highlight_down)))],] 
 pp <- ggplot(df_temp, aes(x=lfc, y=-log10(FDR),color=sig)) + 
   geom_point() + 
   ggtitle(paste0(aa," versus canonical resistance")) + 
   labs(y=expression('-Log'[10]*' FDR'), x=expression('lfc')) + 
   theme_classic(base_size=20) + 
   theme(legend.position="none", plot.title = element_text(size = rel(1), hjust = 0.5))+ 
   geom_text_repel(data=rbind(df_highlight_up,df_highlight_down),aes(x = lfc, y = -log10(FDR),label=gene_name),max.overlaps=100)+ 
   scale_color_manual(values=c("not significant"='lightgray',"down"='red',"up"='blue'))+ scale_y_continuous(trans=scales::pseudo_log_trans(sigma=1,base = 1.05))+ 
 geom_hline(yintercept = 1) 
 print(pp) 
``` 

We plot the same figure for the PROGENy pathway scores. 

```{r progeny_comparing_categories} 
 markers_other_impact_versus_canonical_progeny <- 
   markers_other_impact_versus_canonical[markers_other_impact_versus_canonical$gene %in% colnames(progeny_scores),]
 
  sig <- rep("not significant",nrow(markers_other_impact_versus_canonical_progeny)) 
 sig[markers_other_impact_versus_canonical_progeny$FDR<0.1&markers_other_impact_versus_canonical_progeny$lfc<0] <- "down" 
 sig[markers_other_impact_versus_canonical_progeny$FDR<0.1&markers_other_impact_versus_canonical_progeny$lfc>0] <- "up" 
 
 df_temp <- data.frame(lfc=markers_other_impact_versus_canonical_progeny$lfc,FDR=markers_other_impact_versus_canonical_progeny$FDR, gene_name=markers_other_impact_versus_canonical_progeny$gene,sig=sig) 
 df_highlight_up <- df_temp[df_temp$FDR<0.1 & df_temp$lfc>0.25,] 
 df_highlight_up <- df_highlight_up[order(df_highlight_up$FDR)[1:(min(20,nrow(df_highlight_up)))],] 

 df_highlight_down <- df_temp[df_temp$FDR<0.1 & df_temp$lfc< -0.25,] 
 df_highlight_down <- df_highlight_down[order(df_highlight_down$FDR)[1:(min(20,nrow(df_highlight_down)))],] 
 pp <- ggplot(df_temp, aes(x=lfc, y=-log10(FDR),color=sig)) + 
   geom_point() + 
   ggtitle(paste0(aa," versus canonical resistance")) + 
   labs(y=expression('-Log'[10]*' FDR'), x=expression('lfc')) + 
   theme_classic(base_size=20) + 
   theme(legend.position="none", plot.title = element_text(size = rel(1), hjust = 0.5))+ 
   geom_text_repel(data=rbind(df_highlight_up,df_highlight_down),aes(x = lfc, y = -log10(FDR),label=gene_name),max.overlaps=100)+ 
   scale_color_manual(values=c("not significant"='lightgray',"down"='red',"up"='blue'))+ scale_y_continuous(trans=scales::pseudo_log_trans(sigma=1,base = 1.05))+ 
 geom_hline(yintercept = 1) 
 print(pp) 
``` 


# Cell-cycle scoring

We investigate whether there is a difference in proportion of cell cycle phases across variant types.
First, we plot the relation between cell cycle and variant class.

```{r cell_cycle}
sce_iBAR <- cell_cycle_scoring(sce_iBAR)#can use the aggregated counts across cells (different number
#of cells for different iBAR groups), as 'cell_cycle_scoring' includes a normalisation step
df_cc <- data.frame(variant_class=sce_iBAR$variant_class_highest_impact,phase=sce_iBAR$phase)
cc_count <- df_cc %>% dplyr::group_by_all() %>% dplyr::count()
if (yy=="ABE"){
  cc_count <- cc_count[cc_count$variant_class!="driver",]#remove driver for ABE for lack of data
}
cc_count
ggplot(df_cc,mapping=aes(x=variant_class,fill=phase)) + geom_bar(aes( y=..count../tapply(..count.., ..x.. ,sum)[..x..])) + scale_fill_manual(values=c("G1"="darkgrey","G2M"="black","S"="purple"))+theme_bw(base_size=12) + ylab("proportion in phase")


```

Now we plot the relation between cell cycle phase and resistance as a binary variable.

```{r cell_cycle_binary}
sce_iBAR$resistance_mutation <- grepl("TRUE",sce_iBAR$resistance_mutation) #at least one resistance conferring gRNA in the cell
df_cc <- data.frame(resistance=sce_iBAR$resistance_mutation ,phase=sce_iBAR$phase)
df_cc$resistance <- ifelse(df_cc$resistance,"resistance","no resistance")

ggplot(df_cc,mapping=aes(x=resistance,fill=phase)) + geom_bar(aes( y=..count../tapply(..count.., ..x.. ,sum)[..x..])) +  scale_fill_manual(values=c("G1"="darkgrey","G2M"="black","S"="purple"))+theme_bw(base_size=12) + ylab("proportion in phase")


```

We test whether there is a significant change in the number of iBAR groups for each cell cycle phase between gRNAs associated with resistance, and those not associated with resistance.

```{r}
df_cc_count <- df_cc %>% dplyr::group_by_all() %>% dplyr::count()
table_compare_cc <- reshape2::acast(df_cc_count,phase~resistance)
print(chisq.test(table_compare_cc))
```


We add the average diffusion score (for drug addiction(ABE)/driver(CBE)) and PSF outcome score to the validation library, and saving the annotated validation library and the SingleCellExperiment. 

```{r}
validation_lib$diffusion_score <- NA
for (j in 1:length(gRNAs_resistance_driver))
{validation_lib$diffusion_score[validation_lib$ID==gRNAs_resistance_driver[j]] <- 
  mean(sce_iBAR_resistance_one_gRNA$diffusion_score[grepl(gRNAs_resistance_driver[j],sce_iBAR_resistance_one_gRNA$gRNA)],na.rm=TRUE)}
validation_lib$PFS_outcome_score <- NA
validation_lib$PFS_outcome_score[match(df_cor_R_all$gRNA,validation_lib$ID)] <- df_cor_R_all$PFS_outcome_score
write.table(validation_lib,file=paste0("validation_lib_ann_",yy,".csv"),sep=",",col.names=TRUE,row.names=FALSE)
```

```{r}
saveRDS(sce_iBAR,file=sce_save_file)
```

# Session info 

```{r}
sessionInfo()
```