MaxQuant converter fixes for fractionated data, see #10
MatthewThe committed Aug 16, 2019
1 parent a0cf969 commit 05ab1ee
Showing 8 changed files with 137 additions and 61 deletions.
32 changes: 28 additions & 4 deletions README.rst
@@ -1,6 +1,12 @@
Triqler: TRansparent Identification-Quantification-Linked Error Rates
=====================================================================

Method description / Citation
-----------------------------

The, M. & Käll, L. (2019). Integrated identification and quantification error probabilities for shotgun proteomics. *Molecular & Cellular Proteomics, 18* (3), 561-570.


Requirements
------------

@@ -11,6 +17,7 @@ Packages needed:
- numpy 1.12+
- scipy 0.17+


Installation via ``pip``
************************

@@ -33,9 +40,12 @@ Usage
::

usage: python -m triqler [-h] [--out_file OUT] [--fold_change_eval F]
[--decoy_pattern P] [--min_samples N] [--num_threads N]
[--ttest]
IN_FILE
[--decoy_pattern P] [--min_samples N] [--num_threads N]
[--ttest] [--write_spectrum_quants]
[--write_protein_posteriors P_OUT]
[--write_group_posteriors G_OUT]
[--write_fold_change_posteriors F_OUT]
IN_FILE

positional arguments:
IN_FILE List of PSMs with abundances (not log transformed!)
@@ -54,9 +64,23 @@ Usage
quantified in. (default: 2)
--num_threads N Number of threads, by default this is equal to the
number of CPU cores available on the device. (default:
auto detect)
8)
--ttest Use t-test for evaluating differential expression
instead of posterior probabilities. (default: False)
--write_spectrum_quants
Write quantifications for consensus spectra. Only
works if consensus spectrum indices are given in the input.
(default: False)
--write_protein_posteriors P_OUT
Write raw data of protein posteriors to the specified
file in TSV format. (default: )
--write_group_posteriors G_OUT
Write raw data of treatment group posteriors to the
specified file in TSV format. (default: )
--write_fold_change_posteriors F_OUT
Write raw data of fold change posteriors to the
specified file in TSV format. (default: )
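
For example, a hypothetical invocation that writes all three posterior
output files (the file names below are placeholders, not from this commit)::

    python -m triqler --write_protein_posteriors protein_posteriors.tsv \
                      --write_group_posteriors group_posteriors.tsv \
                      --write_fold_change_posteriors fold_change_posteriors.tsv \
                      input_psms.tsv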


Example
-------
1 change: 1 addition & 0 deletions run_test.sh
@@ -1,5 +1,6 @@
#!/bin/bash

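# OMP_NUM_THREADS caps the number of OpenMP/BLAS worker threads (numpy
# honors it); the value 4 here is presumably chosen to keep the test run
# lightweight and consistent across machines.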
export OMP_NUM_THREADS=4
# to replace the reference results execute the following line:
# rename -f 's/iPRG2016./iPRG2016_ref./g' example/iPRG2016.p*; mv example/iPRG2016.tsv.pqr.tsv example/iPRG2016_ref.tsv.pqr.tsv

65 changes: 39 additions & 26 deletions triqler/convert/maxquant.py
@@ -5,15 +5,19 @@

from __future__ import print_function

import sys
import collections

import numpy as np

from .. import parsers
from ..triqler import __version__, __copyright__

from . import normalize_intensities as normalize

def main():
print('Triqler.convert.maxquant version %s\n%s' % (__version__, __copyright__))
print('Issued command:', "maxquant.py " + " ".join(map(str, sys.argv[1:])))
args, params = parseArgs()

convertMqToTriqler(args.file_list_file, args.in_file, args.out_file, params)
@@ -41,24 +45,23 @@ def parseArgs():
help='Skip retention-time based intensity normalization.',
action='store_true')

apars.add_argument('--skip_link_pep',
help='Skips the linkPEP column from match-between-runs output.',
action='store_true')
#apars.add_argument('--skip_link_pep',
# help='Skips the linkPEP column from match-between-runs output.',
# action='store_true')

# ------------------------------------------------
args = apars.parse_args()

params = dict()
params['simpleOutputFormat'] = args.skip_link_pep
params['simpleOutputFormat'] = True
params['skipNormalization'] = args.skip_normalization
params['plotScatter'] = False

return args, params

def convertMqToTriqler(fileListFile, mqEvidenceFile, triqlerInputFile, params):
fileList, groups, groupLabels = parsers.parseFileList(fileListFile)

fileNameConditionPairs = [[x.split("/")[-1], parsers.getGroupLabel(idx, groups, groupLabels)] for idx, x in enumerate(fileList)]
fileInfoList = parsers.parseFileList(fileListFile)
fileList, _, sampleList, fractionList = zip(*fileInfoList)

writer = parsers.getTsvWriter(triqlerInputFile)
if params['simpleOutputFormat']:
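
A minimal sketch of the new file-list handling above, with made-up entries
(the exact tuple contents come from parsers.parseFileList, which is not part
of this diff; the fraction field is assumed to default to -1 when unassigned):

    fileInfoList = [
        ("runA_frac1.raw", "condition1", "sampleA", 1),
        ("runA_frac2.raw", "condition1", "sampleA", 2),
    ]
    fileList, _, sampleList, fractionList = zip(*fileInfoList)
    # fileList     -> ('runA_frac1.raw', 'runA_frac2.raw')
    # sampleList   -> ('sampleA', 'sampleA')
    # fractionList -> (1, 2)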
@@ -79,52 +82,62 @@ def convertMqToTriqler(fileListFile, mqEvidenceFile, triqlerInputFile, params):
postErrCol = headers.index('PEP')
rtCol = headers.index('Retention time')

fractionCol = headers.index('Fraction') if 'Fraction' in headers else -1
experimentCol = headers.index('Experiment')

print("Parsing MaxQuant evidence.txt file")
peptideToFeatureMap = collections.defaultdict(list)
for row in reader:
for lineIdx, row in enumerate(reader):
if lineIdx % 500000 == 0:
print(" Reading line", lineIdx)

proteins = row[proteinCol].split(";")

linkPEP = 0.0
key = (row[peptCol], row[chargeCol])

fileIdx = fileList.index(row[fileCol])
run, condition = fileNameConditionPairs[fileIdx]
run, condition, sample, fraction = fileInfoList[fileIdx]
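# Fallback for fractionated data: if the file list does not assign a
# fraction (fraction == -1) but evidence.txt has a 'Fraction' column, take
# sample and fraction from MaxQuant's 'Experiment' and 'Fraction' columns.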
if fraction == -1 and fractionCol != -1:
sample, fraction = row[experimentCol], row[fractionCol]

if key in peptideToFeatureMap:
featureClusterIdx = peptideToFeatureMap[key][0][0].featureClusterId
else:
featureClusterIdx = len(peptideToFeatureMap)

if row[intensityCol] == "":
if row[intensityCol] == "" or float(row[scoreCol]) <= 0:
continue
#print(row[scoreCol], row[intensityCol])
triqlerRow = parsers.TriqlerInputRow(run, condition, row[chargeCol], row[idCol], linkPEP, featureClusterIdx, float(row[scoreCol]), float(row[intensityCol]), row[peptCol], proteins)
peptideToFeatureMap[key].append((triqlerRow, float(row[rtCol])))

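# Log-transform MaxQuant's search engine score before storing it as
# search_score; rows with score <= 0 were skipped above, so np.log is safe.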
triqlerRow = parsers.TriqlerInputRow(sample, condition, row[chargeCol], row[idCol], linkPEP, featureClusterIdx, np.log(float(row[scoreCol])), float(row[intensityCol]), row[peptCol], proteins)
peptideToFeatureMap[key].append((triqlerRow, float(row[rtCol]), fraction))

if not params['skipNormalization']:
print("Applying retention-time dependent intensity normalization")
minRunsObservedIn = len(fileNameConditionPairs) / 3 + 2
factorPairs = normalize.getIntensityFactorPairs(peptideToFeatureMap.values(), sortKey = lambda x : (x[0].run, -1*x[0].intensity, x[1]), minRunsObservedIn = minRunsObservedIn)

if params['plotScatter']:
normalize.plotFactorScatter(factorPairs)

rTimeFactorArrays = normalize.getFactorArrays(factorPairs)

minRunsObservedIn = len(set(sampleList)) // 3 + 1
rTimeArrays, factorArrays = dict(), dict()
for key in rTimeFactorArrays:
rTimeArrays[key], factorArrays[key] = zip(*rTimeFactorArrays[key])
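# Normalization factors are now computed separately for each fraction,
# presumably because each fraction is acquired as its own LC-MS run with
# its own intensity bias.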
for fraction in sorted(list(set(fractionList))):
factorPairs = normalize.getIntensityFactorPairs(peptideToFeatureMap.values(), sortKey = lambda x : (x[2], x[0].run, -1*x[0].intensity, x[1]), minRunsObservedIn = minRunsObservedIn, fraction = fraction)
print("Fraction:", fraction, "#runs:", len(factorPairs))
if params['plotScatter']:
normalize.plotFactorScatter(factorPairs)

rTimeFactorArrays = normalize.getFactorArrays(factorPairs)

rTimeArrays[fraction], factorArrays[fraction] = dict(), dict()
for key in rTimeFactorArrays:
rTimeArrays[fraction][key], factorArrays[fraction][key] = zip(*rTimeFactorArrays[key])
else:
print("Skipping retention-time dependent intensity normalization")

for featureClusterIdx, featureCluster in enumerate(peptideToFeatureMap.values()):
if featureClusterIdx % 10000 == 0:
if featureClusterIdx % 50000 == 0:
print("Processing feature group", featureClusterIdx + 1)
newRows = selectBestScorePerRun(featureCluster)

for (row, rTime) in newRows:
for (row, rTime, fraction) in newRows:
if not params['skipNormalization']:
newIntensity = normalize.getNormalizedIntensity(rTimeArrays[row.run], factorArrays[row.run], rTime, row.intensity)
newIntensity = normalize.getNormalizedIntensity(rTimeArrays[fraction][row.run], factorArrays[fraction][row.run], rTime, row.intensity)
row = row._replace(intensity = newIntensity)

if params['simpleOutputFormat']:
32 changes: 19 additions & 13 deletions triqler/convert/normalize_intensities.py
@@ -11,7 +11,7 @@

def normalizeIntensitiesRtimeBased(clusterQuantExtraFile, clusterQuantExtraNormalizedFile, minRunsObservedIn, plotScatter = False, plotRunningAverage = False):
featureGroups = parsers.parseFeatureClustersFile(clusterQuantExtraFile)
factorPairs = getIntensityFactorPairs(featureGroups, sortKey = lambda x : (x.fileName, -1.0 * x.intensity, x.rTime), minRunsObservedIn = minRunsObservedIn)
factorPairs = getIntensityFactorPairs(featureGroups, sortKey = lambda x : (1, x.fileName, -1.0 * x.intensity, x.rTime), minRunsObservedIn = minRunsObservedIn)

if plotScatter:
plotFactorScatter(factorPairs)
@@ -23,16 +23,17 @@ def normalizeIntensitiesRtimeBased(clusterQuantExtraFile, clusterQuantExtraNorma

normalizeIntensitiesWithFactorArrays(clusterQuantExtraFile, rTimeFactorArrays, clusterQuantExtraNormalizedFile)

def getIntensityFactorPairs(featureGroups, sortKey, minRunsObservedIn):
def getIntensityFactorPairs(featureGroups, sortKey, minRunsObservedIn, fraction = 1):
factorPairs = defaultdict(list)
for i, featureGroup in enumerate(featureGroups):
localFactorPairs = dict()
sortedFeatures = sorted(featureGroup, key = sortKey)
for row in sortedFeatures:
fileName, intensity, rTime = sortKey(row)
rowFraction, fileName, intensity, rTime = sortKey(row)
intensity *= -1.0
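# sortKey returns the intensity negated so that ascending sort puts the
# most intense feature first per run; undo the sign flip here.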
if not np.isnan(intensity) and fileName not in localFactorPairs: # and row.charge == 2:
if fraction == rowFraction and not np.isnan(intensity) and fileName not in localFactorPairs:
localFactorPairs[fileName] = (np.log2(intensity), rTime)

if len(localFactorPairs) >= minRunsObservedIn:
keys = sorted(localFactorPairs.keys())
masterKey = keys[0]
@@ -41,24 +42,29 @@ def getIntensityFactorPairs(featureGroups, sortKey, minRunsObservedIn):
for key in keys[1:]:
factori = factorPairs[masterKey][-1][1] - (localFactorPairs[masterKey][0] - localFactorPairs[key][0])
factorPairs[key].append((localFactorPairs[key][1], factori))
if (i+1) % 10000 == 0:
print("Processing cluster", i+1)
#if (i+1) % 100000 == 0:
# print("Processing cluster", i+1)
return factorPairs

# returns running averages of factors
def getFactorArrays(factorPairs, N = 2000):
factorArrays = defaultdict(list)
for i, key in enumerate(sorted(factorPairs.keys())):
print("Calculating factor array", i+1)
print("Calculating normalization factors for run", i+1, "using", len(factorPairs[key]), "precursors")
factorPairs[key] = sorted(factorPairs[key], key = lambda x : x[0])
rTimes = [x[0] for x in factorPairs[key]]
factors = [x[1] for x in factorPairs[key]]
factorArrays[key] = zip(rTimes[int(N/2):int(-N/2)], runningMean(factors, N))
runningMeans = runningMean(factors, N)
factorArrays[key] = zip(rTimes, runningMeans)
return factorArrays

def runningMean(x, N):
cumsum = np.cumsum(np.insert(x, 0, 0))
return (cumsum[N:] - cumsum[:-N]) / N
if len(x) <= N:
return np.array([np.mean(x)]*len(x))
else:
cumsum = np.cumsum(np.insert(x, 0, 0))
rm = (cumsum[N:] - cumsum[:-N]) / N
return np.concatenate(([rm[0]]*(N//2), rm, [rm[-1]]*(N//2)))
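
A standalone sketch of the padded running mean with toy inputs (note that
for even N the padded output has len(x) + 1 entries; the zip against the
retention-time list in getFactorArrays above silently drops the surplus):

    import numpy as np

    def runningMean(x, N):
        if len(x) <= N:
            # short input: fall back to the global mean at every position
            return np.array([np.mean(x)] * len(x))
        cumsum = np.cumsum(np.insert(x, 0, 0))
        rm = (cumsum[N:] - cumsum[:-N]) / N
        # pad both edges with the first/last window mean
        return np.concatenate(([rm[0]] * (N // 2), rm, [rm[-1]] * (N // 2)))

    print(runningMean([1.0, 5.0, 3.0], 4))  # [3. 3. 3.] -- global mean
    print(runningMean(list(range(10)), 4))  # 11 values: [1.5 1.5 1.5 2.5 ... 7.5 7.5 7.5]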

def normalizeIntensitiesWithFactorArrays(clusterQuantExtraFile, rTimeFactorArrays, clusterQuantExtraNormalizedFile):
rTimeArrays, factorArrays = dict(), dict()
@@ -73,15 +79,15 @@ def normalizeIntensitiesWithFactorArrays(clusterQuantExtraFile, rTimeFactorArray
outRow[4] = getNormalizedIntensity(rTimeArrays[row.fileName], factorArrays[row.fileName], row.rTime, row.intensity)
writer.writerow(outRow)
writer.writerow([])
if (i+1) % 10000 == 0:
if (i+1) % 50000 == 0:
print("Writing cluster", i+1)

def getNormalizedIntensity(rTimeArray, factorArray, rTime, intensity):
rTimeIndex = min([bisect.bisect_left(rTimeArray, rTime), len(rTimeArray) - 1])
return intensity / (2 ** factorArray[rTimeIndex])
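
A toy illustration of the factor lookup (made-up numbers): bisect_left picks
the first retention time at or above the query, clamped to the last index,
and the intensity is divided by 2 raised to that log2 factor:

    rTimeArray = [10.0, 20.0, 30.0]  # sorted retention times for one run
    factorArray = [0.5, 1.0, -0.25]  # log2 normalization factors at those times

    # rTime 19.5 maps to index 1 (first retention time >= 19.5), factor 1.0:
    print(getNormalizedIntensity(rTimeArray, factorArray, 19.5, 1e6))  # 500000.0
    # queries past the end are clamped to the last factor (-0.25):
    print(getNormalizedIntensity(rTimeArray, factorArray, 99.0, 1e6))  # ~1189207.1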

def plotFactorScatter(factorPairs):
import scatter
from . import scatter
import matplotlib.pyplot as plt
scatter.prepareSubplots()
for i, key in enumerate(sorted(factorPairs.keys())):
@@ -98,7 +104,7 @@ def plotFactorScatter(factorPairs):
plt.show()

def plotFactorRunningAverage(factorArrays):
import scatter
from . import scatter
import matplotlib.pyplot as plt
scatter.prepareSubplots()
for i, key in enumerate(sorted(factorArrays.keys())):
15 changes: 7 additions & 8 deletions triqler/convert/quandenser.py
@@ -77,14 +77,13 @@ def parseArgs():
return args, params

def convertQuandenserToTriqler(fileListFile, clusterQuantFile, psmsOutputFiles, peptQuantRowFile, params):
fileList, groups, groupLabels = parsers.parseFileList(fileListFile)
fileNameConditionPairs = [[x.split("/")[-1], parsers.getGroupLabel(idx, groups, groupLabels)] for idx, x in enumerate(fileList)]
fileInfoList = parsers.parseFileList(fileListFile)

if not params['skipNormalization']:
clusterQuantFileNormalized = clusterQuantFile.replace(".tsv", ".normalized.tsv")
if not os.path.isfile(clusterQuantFileNormalized):
print("Applying retention-time dependent intensity normalization")
minRunsObservedIn = len(fileNameConditionPairs) / 3 + 2
minRunsObservedIn = len(fileInfoList) // 3 + 1
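# A feature group must be observed in at least this many runs to
# contribute retention-time normalization factors.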
normalize.normalizeIntensitiesRtimeBased(clusterQuantFile, clusterQuantFileNormalized, minRunsObservedIn, plotScatter = params['plotScatter'])
else:
print("Reusing previously generated normalized feature group file:", clusterQuantFileNormalized, ". Remove this file to redo normalization")
@@ -94,7 +93,7 @@ def convertQuandenserToTriqler(fileListFile, clusterQuantFile, psmsOutputFiles,

specToPeptideMap = parsePsmsPoutFiles(psmsOutputFiles)

printTriqlerInputFile(fileNameConditionPairs, clusterQuantFile, peptQuantRowFile, specToPeptideMap, params)
printTriqlerInputFile(fileInfoList, clusterQuantFile, peptQuantRowFile, specToPeptideMap, params)

def parsePsmsPoutFiles(psmsOutputFiles):
specToPeptideMap = collections.defaultdict(list)
@@ -110,7 +109,7 @@ def parsePeptideLinkPEP(peptLinkPEP):
spectrumIdx, linkPEP = peptLinkPEP.split(";")
return int(spectrumIdx), float(linkPEP)

def printTriqlerInputFile(fileNameConditionPairs, clusterQuantFile, quantRowFile, specToPeptideMap, params):
def printTriqlerInputFile(fileInfoList, clusterQuantFile, quantRowFile, specToPeptideMap, params):
print("Parsing cluster quant file")

writer = parsers.getTsvWriter(quantRowFile)
@@ -122,7 +121,7 @@ def printTriqlerInputFile(fileNameConditionPairs, clusterQuantFile, quantRowFile
featureClusterRows = list()
spectrumToFeatureMatch = dict() # stores the best peptideQuantRow per (peptide, spectrumIdx)-pair
for featureClusterIdx, featureCluster in enumerate(parsers.parseFeatureClustersFile(clusterQuantFile)):
if featureClusterIdx % 10000 == 0:
if featureClusterIdx % 50000 == 0:
print("Processing feature group", featureClusterIdx + 1)

rows = list()
@@ -133,8 +132,8 @@ def printTriqlerInputFile(fileNameConditionPairs, clusterQuantFile, quantRowFile
peptide, identPEP, proteins, searchScore, charge = specToPeptideMap(spectrumIdx)
if pc.intensity > 0.0 and linkPEP < 1.0 and (params["retainUnidentified"] or peptide != "NA"):
# run condition charge spectrumId linkPEP featureClusterId search_score intensity peptide proteins
run, condition = fileNameConditionPairs[fileIdx]
row = parsers.TriqlerInputRow(run, condition, charge, spectrumIdx, linkPEP, featureClusterIdx, searchScore, pc.intensity, peptide, proteins)
run, condition, sample, fraction = fileInfoList[fileIdx]
row = parsers.TriqlerInputRow(sample, condition, charge, spectrumIdx, linkPEP, featureClusterIdx, searchScore, pc.intensity, peptide, proteins)
rows.append(row)

newRows = list()
13 changes: 10 additions & 3 deletions triqler/hyperparameters.py
@@ -85,12 +85,19 @@ def fitLogitNormal(observedValues, params, plot):
bins = bins[:-1]
popt, _ = curve_fit(funcLogitNormal, bins, vals, p0 = (m, s, m - s, s))

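# popt holds (muDetect, sigmaDetect, muXIC, sigmaXIC); sanity-check the
# fitted detection parameters against the fitted XIC distribution.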
resetXICHyperparameters = False
if popt[0] < popt[2] - 5*popt[3] or popt[0] > popt[2] + 5*popt[3]:
print(" Warning: muDetect outside of expected region [", popt[2] - 5*popt[3] , ",", popt[2] + 5*popt[3], ":", popt[0], " setting to default value of muXIC - 1.0 =", popt[2] - 1.0)
popt[0] = popt[2] - 1.0
print(" Warning: muDetect outside of expected region [", popt[2] - 5*popt[3] , ",", popt[2] + 5*popt[3], "]:", popt[0], ".")
resetXICHyperparameters = True

if popt[1] < 0.1 or popt[1] > 2.0:
print(" Warning: sigmaDetect outside of expected region [0.1,2.0]:" , popt[1], " setting to default value of 0.3")
print(" Warning: sigmaDetect outside of expected region [0.1,2.0]:" , popt[1], ".")
resetXICHyperparameters = True

if resetXICHyperparameters:
print(" Resetting mu/sigmaDetect hyperparameters to default values of muDetect = muXIC - 1.0 and sigmaDetect = 0.3")
popt[1] = 0.3
popt[0] = popt[2] - 1.0

#print(" params[\"muDetectInit\"], params[\"sigmaDetectInit\"] = %f, %f" % (popt[0], popt[1]))
print(" params[\"muDetect\"], params[\"sigmaDetect\"] = %f, %f" % (popt[0], popt[1]))
(diff for the remaining 2 changed files not shown)
