boxplotteR_ggplot.R

library(ggplot2)
library(pals)

# Read column names from text file
args <- commandArgs(trailingOnly = TRUE)
column_names <- readLines(args[1])
#column_names <- readLines("columns.txt")
column_names <- gsub("%", ".", column_names)

# Read your data from a TSV file (assuming your data is in a TSV file named 'data.tsv')
data <- read.table(args[2], header = TRUE, sep = "\t")
#data <- read.table("public_samples/full_comparison/QC_table_uniq_filtered_gen_2_new_exon.csv", header = TRUE, sep = "\t")

biotype_columns <- c(    "Ala_tRNA", "Arg_tRNA", "Asn_tRNA", "Asp_tRNA", "Cys_tRNA", "Gln_tRNA", "Glu_tRNA", "Gly_tRNA", "His_tRNA", "IG_C_gene",
                         
                         "IG_C_pseudogene", "IG_D_gene", "IG_J_gene", "IG_J_pseudogene", "IG_V_gene", "IG_V_pseudogene", "IG_pseudogene", "Ile_tRNA", "Leu_tRNA", "Lys_tRNA", "Met_tRNA", "Mt_rRNA", "Mt_tRNA", "Phe_tRNA", "Pro_tRNA", "Pseudo_tRNA",
                         
                         "SeC.e._tRNA", "SeC_tRNA", "Ser_tRNA", "Sup_tRNA", "TEC", "TR_C_gene", "TR_D_gene", "TR_J_gene", "TR_J_pseudogene", "TR_V_gene", "TR_V_pseudogene", "Thr_tRNA", "Trp_tRNA", "Tyr_tRNA", "Undet_tRNA", "Val_tRNA", "lncRNA", "miRNA",
                         
                         "misc_RNA", "polymorphic_pseudogene", "processed_pseudogene", "protein_coding", "pseudogene", "rRNA", "rRNA_pseudogene", "ribozyme", "sRNA", "scRNA", "scaRNA", "snRNA", "snoRNA", "spike_in",
                         
                         "transcribed_processed_pseudogene", "transcribed_unitary_pseudogene", "transcribed_unprocessed_pseudogene", "translated_processed_pseudogene", "translated_unprocessed_pseudogene",
                         
                         "unitary_pseudogene", "unprocessed_pseudogene", "vault_RNA")

biotype_data <- data[, biotype_columns]
biotype_data$total <- rowSums(biotype_data)
biotype_data$percent_of_reads_mapping_to_spike_ins <- biotype_data$spike_in / biotype_data$total
biotype_data$percent_of_reads_mapping_to_spike_ins <- biotype_data$percent_of_reads_mapping_to_spike_ins * 100

# comparison <- data.frame (data$Sample, data$dataset, data$Exonic, biotype_data$total)
# comparison$difference <- comparison$data.Exonic - comparison$biotype_data.total
# summary(comparison$difference)
# write.csv(comparison, file = "exonic_minus_biotype.csv")

# Keep specified columns
selected_columns <- c(column_names, "dataset")
filtered_data <- data[, selected_columns]

filtered_data$percent_of_reads_mapping_to_spike_ins <- biotype_data$percent_of_reads_mapping_to_spike_ins
filtered_data$exonic_reads_minus_spike_ins <- filtered_data$exonic_reads_minus_spike_ins *100

library(dplyr)

result <- filtered_data %>%
  filter(percent_of_reads_mapping_to_spike_ins > 5) %>%  # Filter rows where variable1 is greater than 5
  group_by(dataset) %>%     # Group data by the 'dataset' column
  summarise(count = n())    # Count the number of rows in each group

#print(result)

table_filtered <- filtered_data %>%
  filter(percent_of_reads_mapping_to_spike_ins <= 5)


# Generate colors from the glasbey palette
num_datasets <- length(unique(table_filtered$dataset))
glasbey_colors <- pals::glasbey(num_datasets)
table_filtered$dataset <- factor(table_filtered$dataset, levels = c("flomics_gen_1", "flomics_gen_2", "flomics_gen_3", "block", "chen", "ngo", "roskams", "zhu", "exome_chalasani", "exome_ibarra", "exome_toden", "encode_bulkRNA", "cfdna_tao_wei")) # change this vector for the real levels of your dataset variable


column_names <- c(column_names, "percent_of_reads_mapping_to_spike_ins")
# Create a list to store ggplot objects for each column
ggplot_objects <- lapply(column_names, function(col_name) {
  ggplot(table_filtered, aes(x = dataset, y = .data[[col_name]], fill = dataset)) +
    geom_boxplot(alpha = 0.3, color = glasbey_colors, position = position_dodge(width = 0.75), outlier.shape = NA ) +
    geom_point(aes(y = .data[[col_name]], color = dataset), 
               position = position_jitterdodge(dodge.width = 0.75, jitter.width = 0.8), 
               shape = 16, size = 2) +
    labs(title =  col_name,
         x = "Dataset", y = col_name) +
    theme_minimal() +
    theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
    scale_fill_manual(values = glasbey_colors) +
    scale_color_manual(values = glasbey_colors) + # Match outline colors to fill
    guides(fill = guide_legend(override.aes = list(color = glasbey_colors))) # Override the legend

})

# Save individual ggplot boxplots as JPEG images
for (i in 1:length(ggplot_objects)) {
  col_name <- column_names[i]
  output_file <- paste0(gsub(" ", "_", tolower(col_name)), "_boxplot_with_points.png")
  ggsave(output_file, ggplot_objects[[i]], width = 8, height = 6)
}