-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathIM.Rmd
1078 lines (804 loc) · 51.3 KB
/
IM.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
---
title: "Interference Modeling in MS2-quantified multiplex proteomics"
author: "Moritz Madern"
date: "2023-02-10"
---
Data input:
1) PSM-table (e.g. MaxQuant's "msms.txt")
2) thermo raw files (*.raw)
3) rawStallion tool (https://github.com/fstanek/rawStallion)
4) isotopic impurity matrix
5) functions contained in "functions_IM.R"
Data output:
1) modified PSM-table named "PSM.txt"
This script extracts relevant information from the Thermo raw files, and subsequently models the PSM-wise total reporter ion signal. Based on the trained model, PSM-wise "estimated interference level" (EIL) values will be calculated. After between-sample normalization, the script performs interference-correction on the basis of calculated EIL values. A modified PSM-table is finally returned that contains PSM-wise EIL values as well as normalized uncorrected and normalized interference-corrected ("EIL-corrected") reporter ion intensities.
Note: This script currently supports MaxQuant as well as FragPite output as input.
```{r Load required packages and functions, echo=FALSE, message=FALSE, warning=FALSE}
## Load packages. If not installed yet, install them prior to running this script!
library(tidyverse)
library(readr)
library(pracma)
library(plot3D)
library(MASS)
library(gridExtra)
library(rlist)
library(foreach)
library(doParallel)
library(fields)
library(cowplot)
library(MSnbase) ## Bioconductor
library(limma) ## Bioconductor
library(DESeq2) ## Bioconductor
library(msqrob2) ## Bioconductor
## Source functions from functions.R (contains the larger functions of this script)
source("./functions_IM.R")
## Create Results folder
if (!file.exists("Results")){
dir.create("Results")
}
```
```{r Specify required parameters, echo=FALSE}
## Specify file path to the folder where Thermo raw files (ending with "*.raw") of the experiment are stored.
rawfilefolder_filepath = "./rawfiles"
## Specify file path to rawStallion.exe, a C#-tool for extracting noise values among other information from the *.raw files. Download here: https://github.com/fstanek/rawStallion.
rawStallion.exe_filepath = "./rawStallion/rawStallion.exe"
## Specify file path to the PSM-table generated by database searching (in the MaxQuant database search output, the corresponding table is called "msms.txt").
msms_filepath = "./msms.txt"
## Optional: Specify specific pattern of raw file names which matches the raw files to be processed. Only relevant if not all of the PSMs in your PSM table are to be processed in the script (e.g. because some come from MS3 measurements, or some need independent normalization and interference correction). If all raw files are to be processed, specify parameter as empty character "". Note that we recommend processing measurements of unmodified peptides and of any kind of PTM-enriched peptides independently from each other, as they can differ in their relative sample contribution to the total intensity.
rawfile_pattern_to_keep = "acet" # Setting for the demo. This specific setting filters for PSMs of the six raw files that were generated from measuring acetyl-enriched samples (more specifically, acetyl-enriched samples of pools P1 - P6), thereby discarding PSMs of all other raw files that were searched together.
## Specify name of the column denoting raw file identity for each PSM (=row) in the PSM-table. It will later be renamed as "Raw.file".
rawfile_columnname = "Raw.file"
## Specify name of the column denoting scan number for each PSM (=row) in the PSM-table. It will later be renamed as "Scan.number".
scannumber_columnname = "Scan.number"
## Specify name of the column denoting precursor ion charge for each PSM (=row) in the PSM-table. It will later be renamed as "Charge".
charge_columnname = "Charge"
## Specify name of the column denoting retention time for each PSM (=row) in the PSM-table. It will later be renamed as "Retention.time".
retentionTime_columnname = "Retention.time"
## Specify name of the column denoting precursor peptide amino acid sequence for each PSM (=row) in the PSM-table. It will later be renamed as "Sequence".
sequence_columnname = "Sequence"
## Specify name of the column denoting protein identity for each PSM (=row) in the PSM-table.
proteinGroup_columnname = "Proteins"
## Specify the search engine that was used to generate the PSM-table. Supported search engines so far: "MaxQuant", "FragPipe". The script might have to be adapted for other input sources.
search_engine = "MaxQuant"
## Specify the multiplexing labels used in the experiment. This parameter should be set as an object of the "ReporterIons" class used in the Bioconductor package "MSnBase", which comes with TMT6, TMT10, TMT11 and TMT16 as predefined objects. For more information, please see: https://lgatto.github.io/MSnbase/reference/ReporterIons-class.html.
reporter_ions = TMT16
## Optional: Correct for isotopic impurities of reporter ions by specifying an impurity matrix. If no impurity correction should be performed, set this parameter to NULL. Else, specify a purity correction matrix as an nxn matrix, where n is the number of channels in the kit. Columns represent reporter channels, and rows represent relative signal percentages. The order of columns and rows has to match that of the reporter_ions object specified above. For more information on how to construct an impurity matrix, please refer to the respective guide from "MSnBase": https://rdrr.io/github/lgatto/MSnbase/man/purityCorrect-methods.html.
impuritymatrix = as.matrix(read.csv(file="./impurity_matrix_tmtpro.csv" , sep=",", header=TRUE, row.names = 1))
## Specify name of the column denoting the precursor peptide modified amino acid sequence for each PSM (=row) in the PSM-table. This is required to infer specific peptide characteristics. In MaxQuant, this column is called "Modified.Sequence". An example entry is: "_(Acetyl (Protein N-term))SWQAYTDNLIGTGK_".
modifiedSequence_columnname = "Modified.sequence"
## Specify the TMT search setting as either "fixed" or "variable". If "fixed", it is assumed that sequences in the "Modified.Sequence" column will not show the TMT modification if found on the peptide. If "variable", it is assumed that a TMT-modification will be listed in the "Modified.Sequence" column at the respective position in the peptide's sequence. In that case, also specify the corresponding label pattern in the "Modified.Sequence" column as a regular expression pattern.
TMT_search_setting = "fixed"
label_pattern = ""
## Specify the pattern of acetylation (encompassing both N-terminal and Lysine acetylation) in the "Modified.Sequence" column as a regular expression pattern. This is crucial for correctly calculating the number of TMT-labels per peptide.
acetylation_pattern = "Acetyl"
## Optional: Specify an optional additional pattern in the "Modified.Sequence" column as a regular expression pattern specific to a post-translational modification (PTM). This will help accounting for differences in fragmentation efficiencies in the model. Note: It is assumed that this PTM does not confer an additional charge to the peptide under measurement conditions. If no special PTM pattern is present, specify as empty character "". Note: In my experience, neither Lysine (K) acetylation nor (STY) phosphorylation could explain much variance in terms of fragmentation efficiency. Maybe other PTMS/modifications do?
ptm_pattern = ""
## specify the between-sample normalization strategy to be applied on reporter ion intensities. Supported is "loess" and "DESeq". Normalization is required prior to interference-correction (which assumes an uniform interference background).
norm_strategy = "loess"
## Optional: Specify sample groups vector as character vector to perform filtering PSMs based on a minimum valid values threshold in at least 1 group. The order of entries should match the order of the reporter ion columns in your PSM-table.
groups = c("blank", "blank", "blank", "Group0", "Group6", "Group0", "Group6", "Group0", "Group6","blank", "Group9", "Group12", "Group9", "Group12", "Group9", "Group12")
minimum_valid_threshold = 3
## Optional: Specify the sample group names that represent empty channels in the input data (e.g. in case when using 16plex kit but only 12 channels are being used for labeling). Their respective reporter ion columns will be filtered out. This parameter requires specification of the empty group label as listed in the parameter "groups" above. If no group name corresponds to empty channels, specify as NULL.
groups_empty = "blank"
## Note: Parameters that are described as "Optional" can be ignored if certain steps in the workflow are to be skipped.
## Note: If specified correctly, no other user input is required for the remainder of the script.
## Note: If the script produces any errors, please first check if all parameters above were specified correctly.
```
```{r Read in PSM-table, echo=FALSE}
## Read in PSM-table:
df_msms <- read.delim(file=msms_filepath, header=TRUE, stringsAsFactors = FALSE, sep="\t")
writeLines("Dimensions of PSM-table upon reading in:")
dim(df_msms)
## Rename some columns names to match them to the default-values of parameters of downstream functions and plots:
names(df_msms)[names(df_msms) == rawfile_columnname] <- "Raw.file"
names(df_msms)[names(df_msms) == scannumber_columnname] <- "Scan.number"
names(df_msms)[names(df_msms) == charge_columnname] <- "Charge"
names(df_msms)[names(df_msms) == retentionTime_columnname] <- "Retention.time"
names(df_msms)[names(df_msms) == proteinGroup_columnname] <- "Proteins"
names(df_msms)[names(df_msms) == sequence_columnname] <- "Sequence"
names(df_msms)[names(df_msms) == modifiedSequence_columnname] <- "Modified.sequence"
## If search_engine is FragPipe, modify columns appropriately to how they are needed
if (search_engine == "FragPipe"){
# extract spectrum number matching fragpipe "Spectrum" column pattern and convert it into numeric vector
scannr <- substring(df_msms$Scan.number, first=1, last= nchar(df_msms$Scan.number) -2)
ind_last_point <- gregexpr(scannr, pattern="[.]") %>% sapply(., FUN="[", 2)
scannr <- substring(scannr, first=ind_last_point + 1, last=nchar(scannr))
df_msms$Scan.number <- as.numeric(scannr)
# remove "pep.xml" in the rawfile column. Also rmove "interact-" at the front.
df_msms$Raw.file <- sub(df_msms$Raw.file, pattern="[.]pep[.]xml", replacement = "")
df_msms$Raw.file <- sub(df_msms$Raw.file, pattern="interact-", replacement = "")
}
## If search_engine is MaxQuant, filter out second peptides, reverse hits and contaminants
if (search_engine == "MaxQuant"){
# get rid of second peptides:
df_msms <- df_msms[df_msms$Type != "MULTI-SECPEP",]
dim(df_msms)
# get rid of reverse hits:
df_msms <- df_msms[!df_msms$Reverse=="+",]
dim(df_msms)
# get rid of contaminants:
con_bool <- grepl(df_msms$Proteins, pattern="^CON")
df_msms <- df_msms[!con_bool,]
writeLines("\nDimension of PSM-table after filtering out second peptides, reverse and CONs:")
print(dim(df_msms))
}
## Filter for raw files with specific pattern as specified via rawfile_pattern_to_keep parameter
if (!is.null(rawfile_pattern_to_keep)){
df_msms <- df_msms[grepl(df_msms$Raw.file, pattern=rawfile_pattern_to_keep),]
}
writeLines("\nThe PSMs of these raw files will be processed:")
table(df_msms$Raw.file)
## Establish unique PSM identifier column called "unique_id"
df_msms$unique_id <- paste0(df_msms$Raw.file,"_",df_msms$Scan.number)
## Sort PSM-table by raw-file column
df_msms <- df_msms[order(df_msms$Raw.file),]
```
```{r Use rawStallion to read relevant information from Thermo raw files and write as R-readable .tsv files, echo=FALSE}
## Prepare parallel processing
cores <- detectCores(logical = TRUE)
cl <- makeCluster(cores - 2)
registerDoParallel(cl=cl)
## Execute rawStallion via command line (note: rawStallion requires a Windows OS)
invisible(
foreach(i = list.files(rawfilefolder_filepath, full.names = TRUE)) %dopar% {
system2(command = rawStallion.exe_filepath, args = i)
}
)
print("Done.")
stopCluster(cl)
## Note: This code section should produce two tsv-files per Thermo raw file (contained in the folder "rawfilefolder_filepath") using rawStallion. If this code does not work, you might need to install some Windows extension, e.g. Windows .Net Desktop Runtime. I recommend running rawStallion.exe in the command line first (not via R) to see if and what exactly is missing! Conveniently, Windows will complain which exact extension is missing.
## Note: To run rawStallion via the command line, type the following line in the Windows command line:
# [rawStallion.exe-filepath] [rawfile-filepath]
# where [rawStallion.exe-filepath] is the filepath to the rawStallion.exe file, and
# [rawfile-filepath] is the filepath to a single Thermo .raw file
```
```{r Extract spectral features for each PSM from the raw data, echo=FALSE}
## Prepare parallel processing
cores <- detectCores(logical = TRUE)
cl <- makeCluster(cores - 1)
registerDoParallel(cl=cl)
## Apply function to extract spectral features (noise values, reporter ion intensities, PPF, etc.) from raw the raw file data. It takes some time to go through every single PSM, as the program extracts the spectral information of corresponding MS2 scans and parent MS1 scans. Tasks are split on multiple cores to process multiple raw files in parallel.
t1 <- Sys.time()
spectral_features <- extract_spectral_features_in_parallel(rawfilefolder_filepath = rawfilefolder_filepath, # (see functions_IM.R)
msms = df_msms,
scan_number = "Scan.number",
rawfile = "Raw.file",
charge = "Charge",
reporter_ions = reporter_ions,
mass_error_tolerance = 0.005)
stopCluster(cl)
t2 <- Sys.time()
writeLines("Duration of spectral feature extraction: ")
print(t2 - t1)
## Check if objects can be merged
if (!all(spectral_features$key_msms == df_msms$unique_id)){
writeLines(warning("The two objects differ in order"))
}
## Merge spectral features to PSM table (df_msms)
df_msms <- cbind(df_msms, spectral_features[,-which(names(spectral_features) == "key_msms")])
## Create new column that merges PSM-wise information of raw file name and compensation voltage
df_msms[,"Raw.file__CV"] <- paste0(df_msms$Raw.file, "__", df_msms$compensationVoltage)
## Compare PPF with exisitng purity metrics (if present)
if ("PIF" %in% names(df_msms)) {plot(x=df_msms$PIF, y=df_msms$PPF, cex=0.1, xlab="PIF",ylab="PPF")} # PIF in MaxQuant output
if ("Purity" %in% names(df_msms)) {plot(x=df_msms$Purity, y=df_msms$PPF, cex=0.1, xlab="Purity",ylab="PPF")} # Purity in FragPipe Output
## Visualization of some of the calculated variables
barplot(table(df_msms$parent_MS1), main="MS1-occurence of precursor peptide ion", xlab="MS1", border="grey")
hist(df_msms$PPF, main="Distribution of Precursor Purity Fraction", xlab="Precursor Purity Fraction (PPF)", border="grey", col="grey")
ggplot(df_msms) +
geom_line(aes(x=Retention.time, y=noiseValue, col=Raw.file__CV)) +
ylab("Noise value at precursor m/z") + xlab("Retention time") +
theme(legend.position = "bottom")
## Save session image after MS1 feature extraction. Can be loaded at a later stage via load().
save.image(paste0("session_including_MS1_features_",Sys.Date(),".RData"))
```
```{r Optional: Load session backup after feature extraction code block, echo=FALSE}
# load("session_including_MS1_features_2023-02-09.RData")
```
```{r Correct for isotopic impurities via specified impurity matrix, echo=FALSE}
## Infer reporter ion column names (Note: pattern = "reporters_" will retrieve reporter ion intensities extracted in the step above. You can also use the reporter ion columns from your search engine, but they have to be untransformed and normalized by injection time via division!)
names_reporterCol <- grep(names(df_msms), pattern="reporters_", value=TRUE)
## Perform impurity correction if impurity matrix was specified
if (!is.null(impuritymatrix)){
# Invert impurity matrix to be used to correct for isotopic impurities
writeLines("Impurity Matrix:")
print(impuritymatrix)
inverted_transposed_impuritymatrix <- solve(t(impuritymatrix)) # using linear algebra, this inverted matrix is later used to calculate the corrected intensities
# Calculate impurity correction:
m_uncorrected <- as.matrix(df_msms[,names_reporterCol])
m_uncorrected[is.na(m_uncorrected)] <- 0
rownames(m_uncorrected) <- df_msms$unique_id
m_corrected <- m_uncorrected
for (i in 1:nrow(m_uncorrected)) {
# extract row_i
row_i <- m_uncorrected[i,]
row_i
# calculate corrected intensities, replace any negative values with 0
row_i_corrected <- inverted_transposed_impuritymatrix %*% row_i
row_i_corrected <- pmax(row_i_corrected,0)
# store in m_corrected
m_corrected[i,] <- row_i_corrected
}
# Finally, replace values that were originally 0 again with 0 (i.e. NA). Else they would mimic real observed intensities, which they are not!
m_corrected[m_uncorrected == 0] <- 0
# Plot before and after correction:
par(mfrow=c(1,2))
barplot(colSums(m_uncorrected, na.rm = TRUE), las=2, main="colSums before impurity correction", cex.names = 0.7, border="grey")
barplot(colSums(m_corrected,na.rm=TRUE), las=2, main="colSums after impurity correction", cex.names = 0.7, border="#20217E", col="#20217E")
# Replace uncorrected intensities in df_msms with corrected intensities:
df_msms[,names_reporterCol] <- m_corrected
}
```
```{r Optional: Kick out reporter ion intensity columns of empty channels, echo=FALSE}
## Check if parameters specify the existence of any empty reporter ion columns
if (!is.null(groups) && !is.null(groups_empty)){
# Extract metadata and reporter ion data
df_reporter <- df_msms[,names_reporterCol]
df_meta <- df_msms[,!names(df_msms) %in% names_reporterCol]
# Subselect only non empty channels
df_reporter_nonEmpty <- df_reporter[,!groups %in% groups_empty]
# Reconstitute full dataframe
df_msms <- cbind(df_meta, df_reporter_nonEmpty)
# Print column names of reporter ion columns before and after filtering
writeLines("Reporter ion column names before filtering:")
print(names(df_reporter))
writeLines("Reporter ion column names after filtering:")
print(names(df_reporter_nonEmpty))
# redefine the objects names_reporterCol (needed downstream) and groups vector
names_reporterCol <- names(df_reporter_nonEmpty)
groups <- groups[!groups %in% groups_empty]
}
```
```{r Calculate number of TMT-labels per precursor peptide, echo=FALSE}
## Calculate the number of TMT-labels for each PSM precursor peptide ion if TMT labels were specified as fixed in the database search (i.e. TMT-labels are not shown in the Modified.Sequence column).
if (TMT_search_setting == "fixed"){
mod_sequence <- df_msms$Modified.sequence
number_K <- gregexpr(text= mod_sequence, pattern="K[^)]") # excluding ) because of patterns like "K (Acetyl K)" for example.
number_K <- sapply(number_K, FUN= ">", 0)
number_K <- sapply(number_K, FUN=sum)
numberAc <- gregexpr(text = mod_sequence, pattern = acetylation_pattern)
numberAc <- sapply(numberAc, FUN= ">", 0)
numberAc <- sapply(numberAc , FUN=sum)
number_labels <- number_K + 1 - numberAc # + 1 because of N-terminal amino group
df_msms$number_labels <- number_labels
}
## Calculate the number of TMT-labels for each PSM precursor peptide ion if TMT labels were specified as variable in the database search (i.e. TMT-labels are not shown in the Modified.Sequence column).
if (TMT_search_setting == "variable"){
mod_sequence <- df_msms$Modified.sequence
number_labels <- gregexpr(text = mod_sequence, pattern = label_pattern)
number_labels <- sapply(number_labels, FUN=">", 0)
number_labels <- sapply(number_labels, FUN=sum)
df_msms$number_labels <- number_labels
}
## Check for yourself in the first 30 rows if calculations are correct:
head(data.frame(Modified.Sequence = df_msms$Modified.sequence, Number_Labels= df_msms$number_labels), n=30)
## Plot overview of the number of TMT-labels per precursor peptide:
barplot(table(df_msms$number_labels), main="number of TMT labels per peptides", ylab="frequency", border="grey", xlab="number")
## Filter away PSMs with 0 TMT labels
df_msms <- df_msms[!df_msms$number_labels == 0,]
```
```{r Calculate measurement run-specific peptide densities, echo=FALSE}
## Add column precursorIntensity to the dataframe (signal of precursor + isotopes in the isolation window)
df_msms$precursorIntensity <- df_msms$TIW*df_msms$PPF
## Calculate empirical peptide density estimated
peptide_density <- calculate_peptide_density(msms=df_msms, # (see functions_IM.R)
rawfile = "Raw.file__CV",
retentionTime = "Retention.time",
precursorMZ = "precursorMz",
modifiedSequence = "Modified.sequence",
charge = "Charge",
precursorIntensity = "precursorIntensity")
## Add calculated peptide densities to the PSM-table (df_msms)
df_msms$peptideDensity <- peptide_density$peptideDensity
## Save session image after density estimation. Can be loaded at a later stage via load().
save.image(paste0("session_including_density_",Sys.Date(),".RData"))
```
```{r Generate dependent variable Y of multiple linear regression model (note: Y equals total reporter ion signal), echo=FALSE}
## Extract reporter ion intensity matrix for modelling. Filter out rows (PSMs) with 0 total reporter ion signal.
m_reporter <- as.matrix(df_msms[,names_reporterCol])
df_msms <- df_msms[!rowSums(m_reporter, na.rm = TRUE) == 0,]
## Generate dependent model variable Y as PSM-wise total reporter ion signal across all channels.
m_reporter <- as.matrix(df_msms[,names_reporterCol])
df_msms$Y <- rowSums(m_reporter, na.rm=TRUE)
```
```{r Optional: Filter out PSMs based on missing/valid values per group, echo=FALSE}
## Note: This section filters PSMs based on a minimum number threshold of valid values (i.e. non-NA values) in at least one group, if a group vector was specified.
if (!is.null(groups)){
# print "groups" and "minimum_valid_values" parameters
writeLines("groups:")
print(groups)
writeLines("\nthreshold of minimum valid values threshold in at least one group")
print(minimum_valid_threshold)
# print dimensions before filtering
writeLines("\nDimension of PSM-table before filtering:")
print(dim(df_msms))
# for each PSM, calculate the maximum number of values across all groups
m_reporter <- as.matrix(df_msms[,names_reporterCol])
m_reporter[m_reporter == 0] <- NA
v_max_valid_perGroup <- numeric(nrow(df_msms))
for (i in 1:nrow(df_msms)){
m_i <- m_reporter[i,]
v_max_valid_perGroup[i] <- max(tapply(!is.na(m_i), INDEX = groups, FUN=sum))
}
# visualize distribution of maximum number of valid values per group (across all groups)
barplot(table(factor(v_max_valid_perGroup)), col="grey", border="grey", xlab="max # of valid values per group before filtering", ylab="frequency", main = "distribution before filtering")
# perform filtering based on valid values
bool_keep <- v_max_valid_perGroup >= minimum_valid_threshold
df_msms <- df_msms[bool_keep,]
# print dimensions after filtering
writeLines("\nDimension of PSM-table after filtering:")
print(dim(df_msms))
}
```
```{r Generate explanatory variables (i.e. regressors Xi), warning=FALSE, echo=FALSE}
## 1) precursor: This variable describes the estimated intensity stemming from precursor peptide ions (including +1/-1 isotopes) in the total "peptide ion current" (PIC) of the MS2-scan.
df_msms$precursor <- df_msms$PIC * df_msms$PPF
writeLines("summary precursor:")
summary(df_msms$precursor)
## 2) nonprecursor: This variable describes the estimated intensity stemming from visible interfering non-precursor peptide ions in the total peptide ion current of the MS2-scan.
df_msms$nonprecursor <- df_msms$PIC* (1 - df_msms$PPF)
writeLines("\nsummary nonprecusor:")
summary(df_msms$nonprecursor)
## 3) noiseEstimate: This variable describes the estimated intensity stemming from interfering non-percursor peptide ions that are NOT visible in the isolation window range (in the MS1-scan), and are thus assumed to be equally invisible after fragmentation in the MS2 scan as they fall below the noise threshold in both spectra.
df_msms$noiseEstimate <- df_msms$peptideDensity * df_msms$noiseValue
writeLines("\nsummary noiseEstimate:")
summary(df_msms$noiseEstimate)
## 4) rawfileID: This categorial variable describes the raw-file identity of the PSM.
df_msms$rawfileID <- factor(df_msms$Raw.file__CV)
writeLines("\nsummaryr rawfileID:")
summary(df_msms$rawfileID)
## 5) factorRawfileCharge: The charge state that the mass spectrometer assumed for the precursor peptide (which sometimes differs from the true charge state of the precursor peptide). NCE depends on this variable.
rawfileCharge <- df_msms$rawfileCharge
rawfileCharge[rawfileCharge > 3] <- 3
rawfileCharge[rawfileCharge < 3] <- 2
df_msms$factorRawfileCharge <- factor(rawfileCharge)
writeLines("\nsummary factorRawfileCharge:")
summary(df_msms$factorRawfileCharge)
## 6) pepClass: This categorical variable describes specific characteristics of a peptide that influence its fragmentation (and thus Y in the Model). Some other variables need to be calculated in advance.
# factorCharge_2, factorCharge_3 & factorCharge_4: These binary categorical variable describe the charge state of the precursor peptide. Charge states above 4 will be reclassified as 4 to ensure a sufficient number of observations in each category.These variables are needed for 9).
charge <- df_msms$Charge
df_msms$factorCharge_2 <- factor(ifelse(charge <= 2, yes="Charge2.yes", no="Charge2.no"))
df_msms$factorCharge_3 <- factor(ifelse(charge == 3, yes="Charge3.yes", no="Charge3.no"))
df_msms$factorCharge_4 <- factor(ifelse(charge >= 4, yes="Charge4.yes", no="Charge4.no"))
writeLines("\nsummary factorCharge:")
summary(df_msms$factorCharge_2)
summary(df_msms$factorCharge_3)
summary(df_msms$factorCharge_4)
# factorLabels_greater1 & factorLabels_greater2: These binary categorical variables describe the number of isobaric label molecules on the precursor peptide. These variables are needed for 9)
df_msms$factorLabels_1 <- factor(ifelse(df_msms$number_labels == 1, yes="Labels1.yes", no="Labels1.no"))
df_msms$factorLabels_2 <- factor(ifelse(df_msms$number_labels == 2, yes="Labels2.yes", no="Labels2.no"))
df_msms$factorLabels_3 <- factor(ifelse(df_msms$number_labels >= 3, yes="Labels3.yes", no="Labels3.no"))
writeLines("\nsummary factorLabels:")
summary(df_msms$factorLabels_1)
summary(df_msms$factorLabels_2)
summary(df_msms$factorLabels_3)
# contains_PTM: This binary categorical variable describes whether precursor peptide contains specified PTM.
df_msms$factor_PTM <- ifelse(grepl(df_msms$Modified.sequence, pattern=ptm_pattern), yes="yes", no="no")
# The following variables describe absence or presence of certain precursor peptide characteristics (amino acids and extra charge).
numberR <- df_msms$Sequence %>%
gregexpr(text=., pattern="R") %>%
sapply(., FUN=">", 0) %>%
sapply(., FUN=sum)
df_msms$seqChar_R <- ifelse(numberR > 0, yes="R.yes", no= "R.no")
numberK <- df_msms$Sequence %>%
gregexpr(text=., pattern="K") %>%
sapply(., FUN=">", 0) %>%
sapply(., FUN=sum)
df_msms$seqChar_K <- ifelse(numberK > 0, yes="K.yes", no= "K.no")
numberH <- df_msms$Sequence %>%
gregexpr(text=., pattern="H") %>%
sapply(., FUN=">", 0) %>%
sapply(., FUN=sum)
df_msms$seqChar_H <- ifelse(numberH > 0, yes="H.yes", no= "H.no")
numberD <- df_msms$Sequence %>%
gregexpr(text=., pattern="D") %>%
sapply(., FUN=">", 0) %>%
sapply(., FUN=sum)
df_msms$seqChar_D <- ifelse(numberD > 0, yes="D.yes", no= "D.no")
numberE <- df_msms$Sequence %>%
gregexpr(text=., pattern="E") %>%
sapply(., FUN=">", 0) %>%
sapply(., FUN=sum)
df_msms$seqChar_E <- ifelse(numberE > 0, yes="E.yes", no= "E.no")
df_msms$factorExtra <- ifelse(df_msms$Charge > df_msms$number_labels + numberR + numberH, yes="Extra.yes", no="Extra.no")
# calculation of variable pepClass.
pepClass <- rep("", times=nrow(df_msms))
for (i in unique(df_msms$rawfileID)){
print(paste0("##### ", i , " #####"))
df_msms_i <- df_msms %>% filter(rawfileID == i)
pepClass_i <- determine_pepChar_classes(msms = df_msms_i, # (see functions_IM.R)
Y_var = "Y",
X_var = c("factorCharge_2",
"factorCharge_3",
"factorCharge_4",
"factorLabels_1",
"factorLabels_2",
"factorLabels_3",
"seqChar_D",
"seqChar_E",
"seqChar_R",
"seqChar_K",
"seqChar_H",
"factorExtra",
"factor_PTM"),
min_number = 100)
pepClass[df_msms$rawfileID == i] <- pepClass_i
}
df_msms$pepClass <- pepClass
# plot the classification results for the first raw file
df_msms_first <- df_msms %>% filter(rawfileID == unique(df_msms$rawfileID)[1])
n_pepClass <- length(unique(df_msms_first$pepClass))
brewerpal_pepClass <- colorRampPalette(c("#F65A3D", "#FD0606","#B04433","#91760B", "#BAA857", "#A9DEE3", "#1A99B8", "#2171b5","#8C78A1","#0C0250"))
col_pepClass <- brewerpal_pepClass(n_pepClass)
ggplot(df_msms_first) + # barplot illustrating PSMs per class
geom_bar(aes(y=pepClass,fill=pepClass)) +
scale_fill_manual(values = col_pepClass)+
theme_bw() +
theme(legend.position = "none")
ggplot(df_msms_first) + # scatterplot: Y vs TIW
geom_point(aes(x=TIW, y=Y, col=pepClass), cex=0.8, alpha=0.5) +
scale_color_manual(values = col_pepClass) +
guides(colour = guide_legend(override.aes = list(size=3))) +
ylab("Total reporter ion signal (Y)") +
xlab("Total intensity in isolation window (TIW)") +
theme_bw() +
xlim(0, quantile(df_msms_first$TIW, 0.975)) +
ylim(0, quantile(df_msms_first$Y, 0.975)) +
theme(legend.position = "none")
ggplot(df_msms_first) + # scatterplot: Y vs PIC
geom_point(aes(x=PIC, y=Y, col=pepClass), cex=0.8, alpha=0.5) +
scale_color_manual(values = col_pepClass) +
guides(colour = guide_legend(override.aes = list(size=3))) +
ylab("Total reporter ion signal (Y)") +
xlab("Total peptide ion currect (PIC)") +
theme_bw() +
xlim(0, quantile(df_msms_first$PIC, 0.975)) +
ylim(0, quantile(df_msms_first$Y, 0.975))+
theme(legend.position = "none")
ggplot(df_msms_first) + # scatterplot: Y vs TIW (log2-transformed)
geom_point(aes(x=log2(TIW), y=log2(Y), col=pepClass), cex=0.8, alpha=0.5) +
scale_color_manual(values = col_pepClass) +
guides(colour = guide_legend(override.aes = list(size=3))) +
ylab("log2 Total reporter ion signal (Y)") +
xlab("log2 Total intensity in isolation window (Y)")+
theme_bw() +
theme(legend.position = "none")
ggplot(df_msms_first) + # scatterplot: Y vs PIC (log2-transformed)
geom_point(aes(x=log2(PIC), y=log2(Y), col=pepClass), cex=0.8, alpha=0.5) +
scale_color_manual(values = col_pepClass) +
guides(colour = guide_legend(override.aes = list(size=3))) +
ylab("log2 Total reporter ion signal (Y)") +
xlab("log2 Total peptide ion current (PIC)")+
theme_bw() +
theme(legend.position = "none")
```
```{r Optional: Visualize Y/PIC ratios for all unique determined peptide classes in firt raw file, echo=FALSE}
## Calculate signal to noise; and Y/PIC ratio
df_msms_first$SnN <- df_msms_first$precursor/df_msms_first$noiseValue
df_msms_first$Y_to_PIC <- df_msms_first$Y / df_msms_first$PIC
df_msms_first$log2__Y_to_PIC <- log2(df_msms_first$Y / df_msms_first$PIC)
## Create dataframe slightly modified for plotting (by replacing extreme values)
df_msms_plot <- df_msms_first
df_msms_plot <- df_msms_plot %>% filter(SnN > quantile(SnN, probs=0.6),PPF > 0.6)
df_msms_plot$Y_to_PIC[df_msms_plot$Y_to_PIC > quantile(df_msms_plot$Y_to_PIC, probs=0.95)] <- quantile(df_msms_plot$Y_to_PIC, probs=0.95)
df_msms_plot$Y_to_PIC[df_msms_plot$Y_to_PIC < quantile(df_msms_plot$Y_to_PIC, probs=0.05)] <- quantile(df_msms_plot$Y_to_PIC, probs=0.05)
## Plot calculated ratios for all determined empirical peptide characteristics.
ggplot(data=df_msms_plot) +
geom_point(aes(x=Retention.time, y=precursorMz, col=log2(Y_to_PIC))) +
scale_color_gradientn(colors=pals::parula(20), name="log2 Y/PIC") +
facet_wrap(~pepClass) +
ggtitle("Y / PIC")
```
```{r Robust linear regression modelling of total reporter ion signal, warning=FALSE, echo=FALSE}
## Calculate appropriate plot margins
y_min <- log2(min(df_msms$Y)) - 0.5
y_max <- log2(max(df_msms$Y)) + 0.5
## 1) Estimate submodel: Y ~ 0 + precursor
if (length(unique(df_msms$rawfileID)) > 1){
X <- model.matrix(data = df_msms,
object = ~ 0 + precursor:rawfileID)
bool_keep <- apply(X, MARGIN = 2, FUN=function(x){length(unique(x)) > 1})
X <- X[,bool_keep]
} else {
X <- model.matrix(data = df_msms,
object = ~ 0 + precursor)
}
interference_model <- rlm(y= df_msms$Y,
x = X,
psi = "psi.bisquare",
maxit = 10000)
main_plot <- "Y ~ 0 + precursor "
df_msms$fitted.values <- interference_model$fitted.values
df_msms_first <- df_msms %>% filter(rawfileID == unique(df_msms$rawfileID)[1])
gg1 <- df_msms_first %>%
ggplot(data=.)+
coord_cartesian(xlim=c(y_min,y_max),ylim=c(y_min,y_max)) +
geom_abline(intercept = 0, slope=1, col="black", cex=0.5, linetype=2) +
geom_point(aes(y=log2(Y),x=log2(fitted.values),col=as.factor(pepClass)), cex=1, alpha=0.5) +
scale_color_manual(values = col_pepClass) +
ylab("log2 Total reporter ion signal") + xlab("log2 fitted values") +
guides(colour = guide_legend(override.aes = list(size=3))) +
ggtitle(main_plot) +
theme_classic() +
theme(legend.position="none")
paste0("correlation log2(Y) vs log2(fitted) of model 1: ",
round(cor(log2(df_msms_first$Y), log2(df_msms_first$fitted.values), use="pairwise.complete.obs"), digits=2))
## 2) Estimate submodel: Y ~ 0 + precursor:pepClass
if (length(unique(df_msms$rawfileID)) > 1){
X <- model.matrix(data = df_msms,
object = ~ 0 + precursor:pepClass:rawfileID)
bool_keep <- apply(X, MARGIN = 2, FUN=function(x){length(unique(x)) > 1})
X <- X[,bool_keep]
} else {
X <- model.matrix(data = df_msms,
object = ~ 0 +
precursor:pepClass)
}
interference_model <- rlm(y= df_msms$Y,
x = X,
psi = "psi.bisquare",
maxit = 10000)
main_plot <- "Y ~ 0 + precursor:pepClass "
df_msms$fitted.values <- interference_model$fitted.values
df_msms_first <- df_msms %>% filter(rawfileID == unique(df_msms$rawfileID)[1])
gg2 <- df_msms %>% filter(rawfileID == unique(df_msms$rawfileID)[1]) %>%
ggplot(data=.)+
coord_cartesian(xlim=c(y_min,y_max),ylim=c(y_min,y_max)) +
geom_abline(intercept = 0, slope=1, col="black", cex=0.5, linetype=2) +
geom_point(aes(y=log2(Y),x=log2(fitted.values),col=as.factor(pepClass)), cex=1, alpha=0.5) +
scale_color_manual(values = col_pepClass) +
ylab("log2 Total reporter ion signal") + xlab("log2 fitted values") +
guides(colour = guide_legend(override.aes = list(size=3))) +
ggtitle(main_plot) +
theme_classic() +
theme(legend.position="none")
paste0("correlation log2(Y) vs log2(fitted) of model 2: ",
round(cor(log2(df_msms_first$Y), log2(df_msms_first$fitted.values), use="pairwise.complete.obs"), digits=2))
## 3) Estimate submodel: Y ~ 0 + precursor:pepClass + nonprecursor:rawfileCharge
if (length(unique(df_msms$rawfileID)) > 1){
X <- model.matrix(data = df_msms,
object = ~ 0 + precursor:pepClass:rawfileID + nonprecursor:rawfileCharge:rawfileID)
bool_keep <- apply(X, MARGIN = 2, FUN=function(x){length(unique(x)) > 1})
X <- X[,bool_keep]
} else {
X <- model.matrix(data = df_msms,
object = ~ 0 + precursor:pepClass+ nonprecursor:rawfileCharge)
}
interference_model <- rlm(y= df_msms$Y,
x = X,
psi = "psi.bisquare",
maxit = 10000)
main_plot <- "Y ~ 0 + precursor:pepClass + nonprecursor:rawfileCharge "
df_msms$fitted.values <- interference_model$fitted.values
df_msms_first <- df_msms %>% filter(rawfileID == unique(df_msms$rawfileID)[1])
gg3 <- df_msms_first %>%
ggplot(data=.)+
coord_cartesian(xlim=c(y_min,y_max),ylim=c(y_min,y_max)) +
geom_abline(intercept = 0, slope=1, col="black", cex=0.5, linetype=2) +
geom_point(aes(y=log2(Y),x=log2(fitted.values),col=as.factor(pepClass)), cex=1, alpha=0.5) +
scale_color_manual(values = col_pepClass) +
ylab("log2 Total reporter ion signal") + xlab("log2 fitted values") +
guides(colour = guide_legend(override.aes = list(size=3))) +
ggtitle(main_plot) +
theme_classic() +
theme(legend.position="none")
paste0("correlation log2(Y) vs log2(fitted) of model 3: ",
round(cor(log2(df_msms_first$Y), log2(df_msms_first$fitted.values), use="pairwise.complete.obs"), digits=2))
## 4) Estimate full model: Y ~ 0 + precursor:pepClass + nonprecursor:rawfileCharge + noiseEstimate
if (length(unique(df_msms$rawfileID)) > 1){
X <- model.matrix(data = df_msms,
object = ~ 0 + precursor:pepClass:rawfileID + nonprecursor:rawfileCharge:rawfileID + noiseEstimate:rawfileID)
bool_keep <- apply(X, MARGIN = 2, FUN=function(x){length(unique(x)) > 1})
X <- X[,bool_keep]
} else {
X <- model.matrix(data = df_msms,
object = ~ 0 + precursor:pepClass + nonprecursor:rawfileCharge + noiseEstimate)
}
interference_model <- rlm(y= df_msms$Y,
x = X,
psi = "psi.bisquare",
maxit = 10000)
main_plot <- "Y ~ 0 + precursor:pepClass + nonprecursor:rawfileCharge + noiseEstimate "
df_msms$fitted.values <- interference_model$fitted.values
df_msms_first <- df_msms %>% filter(rawfileID == unique(df_msms$rawfileID)[1])
gg4 <- df_msms_first %>%
ggplot(data=.)+
coord_cartesian(xlim=c(y_min,y_max),ylim=c(y_min,y_max)) +
geom_abline(intercept = 0, slope=1, col="black", cex=0.5, linetype=2) +
geom_point(aes(y=log2(Y),x=log2(fitted.values),col=as.factor(pepClass)), cex=1, alpha=0.5) +
scale_color_manual(values = col_pepClass) +
ylab("log2 Total reporter ion signal") + xlab("log2 fitted values") +
guides(colour = guide_legend(override.aes = list(size=3))) +
ggtitle(main_plot) +
theme_classic() +
theme(legend.position="none")
paste0("correlation log2(Y) vs log2(fitted values) of model 4: ",
round(cor(log2(df_msms_first$Y), log2(df_msms_first$fitted.values), use="pairwise.complete.obs"), digits=2))
## Plot prediction results
gg1
gg2
gg3
gg4
```
```{r Calculation of model-based Estimated Interference Level (EIL) values for all PSMs, echo=FALSE}
## Extract X-matrix. Differentiate between just one raw file vs multiple raw files
if (length(unique(df_msms$rawfileID)) > 1){
X <- model.matrix(data = df_msms,
object = ~ 0 +
precursor:pepClass:rawfileID + nonprecursor:rawfileCharge:rawfileID + noiseEstimate:rawfileID )
bool_keep <- apply(X, MARGIN = 2, FUN=function(x){length(unique(x)) > 1})
X <- X[,bool_keep]
} else {
X <- model.matrix(data = df_msms,
object = ~ 0 +
precursor:pepClass + nonprecursor:rawfileCharge + noiseEstimate )
}
## Calculate estimates of precursor peptide-derived reporter ion signal (precursor estimate)
precursor_bool <- grepl(names(interference_model$coefficients), pattern="^precursor")
precursor_coefficients <- interference_model$coefficients[precursor_bool]
precursor_estimate <- X[,precursor_bool] %*% precursor_coefficients
## Calculate estimates of non-precursor peptide-derived reporter ion signal (non-precursor estimate)
nonprecursor_bool <- grepl(names(interference_model$coefficients), pattern="nonprecursor")
nonprecursor_coefficients <- interference_model$coefficients[nonprecursor_bool]
nonprecursor_estimate <- X[,nonprecursor_bool] %*% as.matrix(nonprecursor_coefficients)
## Calculate estimates of noise-derived reporter ion signal (noise estimate)
noise_bool <- grepl(names(interference_model$coefficients), pattern="noiseEstimate")
noise_coefficients <- interference_model$coefficients[noise_bool]
noise_estimate <- X[,noise_bool] %*% as.matrix(noise_coefficients)
## Plot decomposition of total estimated reporter ion signal based on model estimates:
m_signal_estimates <- data.frame(precursor_estimate = precursor_estimate,
nonprecursor_estimate = nonprecursor_estimate,
noise_estimate = noise_estimate)
barplot(colSums(m_signal_estimates, na.rm=TRUE), main="Model-based decomposition of total reporter ion intensity", cex.names = 0.8, col="grey", border="grey", xlab="Partition", ylab="Predicted Intensity")
## Calculate Estimated Interference Level (EIL), and add it to dataframe. Compare Estimated Interference Level (EIL) with 1- Isolation Window Purity (1-PPF)
df_msms$EIL <- pmax(pmin(1 - precursor_estimate/(precursor_estimate + nonprecursor_estimate + noise_estimate),1),0)
ylim_hist <- max(table(cut(df_msms$PPF, breaks = 50)))
par(mfrow=c(1,2))
hist(1 - df_msms$PPF, breaks=50, xlab="Isolation Window Impurity (1 - PPF)", main="", xlim=c(0,1), col="grey", border="grey", ylim=c(0,ylim_hist))
hist(df_msms$EIL, breaks=50, xlab="Estimated Interference Level (EIL)", main="", xlim=c(0,1), col="#20217E", border="#20217E", ylim=c(0,ylim_hist))
```
```{r Conduct between-sample normalization, then replace unnormalized reporter ion columns with normalized intensities, warning=FALSE}
## Display intensities before normalization
barplot(colSums(df_msms[,names_reporterCol], na.rm=TRUE), main="ColSums before normalization", las=2, cex.names=0.7, border="grey")
boxplot(log2(df_msms[,names_reporterCol]), las=2, main="log2 Intensities before normalization")
## Normalize reporter ion intensities
if (norm_strategy == "loess"){
m_norm <- as.data.frame(loess_norm(m = as.matrix(df_msms[,names_reporterCol])))
}
if (norm_strategy == "DESeq"){
m_norm <- as.data.frame(DESeq_norm(m = as.matrix(df_msms[,names_reporterCol])))
}
colnames(m_norm) <- paste0(colnames(m_norm), "_norm")
## Display intensities after normalization
barplot(colSums(m_norm, na.rm=TRUE), main="ColSums after normalization", las=2, cex.names=0.7, col="#20217E", border="#20217E")
boxplot(log2(m_norm), las=2, main="log2 Intensities after normalization", col="#20217E")
## Kick old reporter ion columns, add new normalized ones (getting rid of empty reporter ion channels in the processcolumns in the process)
df_msms <- df_msms[,!names(df_msms) %in% names_reporterCol]
df_msms <- cbind(df_msms,m_norm)
## Update parameter "names_reporterCol"
names_reporterCol <- colnames(m_norm)
```
```{r Calculate interference-corrected normalized reporter ion intensities based on EIL values by arithmetic subtraction of interference}
## Perform interference correction
m_corrected <- interference_correction(reporterI_matrix = as.matrix(df_msms[,names_reporterCol]), # (see functions_IM.R)
EIL = df_msms$EIL,
max_interference = 0.8,
min_intensity = df_msms$minIntensity_MS2) %>% as.data.frame()
## Add interference-corrected intensities to the data frame. Also store names of interference-corrected reporter intensity columns (needed for downstream aggregation of PSMs to peptides and proteins).
df_msms <- cbind(df_msms, m_corrected)
names_reporterCorr <- names(m_corrected)
```
```{r Export PSM results table}
## Export the PSM table df_msms as "PSM.txt"
if (!file.exists("Results")){
dir.create("Results")
}
write.table(df_msms, file = paste0(getwd(),"/Results/modified_PSM.txt"), sep = "\t", col.names = TRUE, row.names=FALSE, quote=FALSE)
## Report the export
writeLines("The result table can be found in:")
print(paste0(getwd(),"/Results/modified_PSM.txt"))
## Save session after EIL estimation and interference correction.Can be loaded at a later stage via load().
save.image(paste0("session_including_EILs_",Sys.Date(),".RData"))
```
This concludes the standard section of the script. In the following, PSM-wise reporter ion intensities and EIL values are additionally aggregated to peptides and proteins by the respective columns contained in the PSM table. Intensities are aggregated by summation; metrics like EIL and PPF values are aggregated by calculating weighted averages, with weights corresponding to the relative contribution to the aggregated feature's reporter ion intensity.
However, it may be of interest to perform aggregation of interference-corrected reporter ion intensities and EIL-values from PSM level to higher feature level (e.g. peptides, proteins) as defined by other tables in search engine output (i.e. "proteinGroups.txt", "Phospho (STY)Sites.txt"). In MaxQuant, this is made possible by cross-referencing columns (e.g. "MS/MS IDs" column) within the tables of higher aggregation level. I created two scripts to perform this aggregation - one for the aggregation of PSMs to MaxQuant proteins listed in the "proteinGroups.txt" table; another for aggregation of PSMs to MaxQuant sites listed in site tables (i.e. "Phospho (STY)Sites.txt", "Acetyl (K)Sites.txt"). You can find the two scripts (named "Aggregate_PSMs_to_Proteins.Rmd" and "Aggregate_PSMs_to_Sites.Rmd", respectively) on GitHub: https://github.com/moritzmadern/SiteToProteinNormalization_in_MultiplexProteomics.
```{r Optional: Aggregate to peptide level (by Sequence column) and export result as table, warning = FALSE}
## Extract loop variable
unique_seq <- unique(df_msms$Sequence)
## Initialize output
list_peptideTable <- vector(mode="list", length=length(unique_seq))
names(list_peptideTable) <- unique_seq
## Extract relevant information per feature and aggregate
for (i in unique_seq){
bool_i <- df_msms$Sequence == i
df_i <- df_msms[bool_i,]
dim(df_i)
## calculate nrPSMs
nrPSMs_i <- nrow(df_i)
## extract protein name
protein_i <- df_i$Proteins[1]
## extract reporter ion intensities and aggregate them by summation
reporterIons_i <- df_i[,names_reporterCol]
SummedreporterIons_i <- matrix(colSums(reporterIons_i, na.rm=TRUE),nrow=1,
dimnames = list(NULL,names_reporterCol))
## calculate mean intensity for each PSM of the peptide i, and total mean reporterIon intensity
meanIntensity_i <- rowSums(reporterIons_i, na.rm=TRUE)
SummedmeanIntensity_i <- sum(reporterIons_i, na.rm=TRUE)
## aggregate PFF and EIL weighted by meanIntensity of each respective PSM
weights_i <- meanIntensity_i/SummedmeanIntensity_i
EIL_i <- sum(df_i$EIL*weights_i)
PPF_i <- sum(df_i$PPF*weights_i)
## extract corrected reporter ion intensities and aggregate them by summation
reporterIonscorr_i <- df_i[,names_reporterCorr]
SummedreporterIonscorr_i <- matrix(colSums(reporterIonscorr_i, na.rm=TRUE),nrow=1,
dimnames=list(NULL, names_reporterCorr))
## store result in list
list_peptideTable[[i]] <- cbind(data.frame(Proteins=protein_i,
Sequence=i,
PPF=PPF_i,
EIL=EIL_i,
nrPSMs = nrPSMs_i),
as.data.frame(SummedreporterIons_i),
as.data.frame(SummedreporterIonscorr_i))
}
df_peptide <- do.call(what="rbind", args=list_peptideTable)
## Export peptide table as "Peptides.txt"
if (!file.exists("Results")){
dir.create("Results")
}
write.table(df_peptide, file = paste0(getwd(),"/Results/Peptides.txt"), sep = "\t", col.names = TRUE, row.names=FALSE, quote=FALSE)