MaxQuant converter fixes for fractionated data, see #10
MatthewThe committed Aug 16, 2019
1 parent a0cf969 commit 05ab1ee
Showing 8 changed files with 137 additions and 61 deletions.
32 changes: 28 additions & 4 deletions README.rst
@@ -1,6 +1,12 @@
Triqler: TRansparent Identification-Quantification-Linked Error Rates
=====================================================================

Method description / Citation
-----------------------------

The, M. & Käll, L. (2019). Integrated identification and quantification error probabilities for shotgun proteomics. *Molecular & Cellular Proteomics, 18* (3), 561-570.


Requirements
------------

@@ -11,6 +17,7 @@ Packages needed:
- numpy 1.12+
- scipy 0.17+


Installation via ``pip``
************************

@@ -33,9 +40,12 @@ Usage
::

usage: python -m triqler [-h] [--out_file OUT] [--fold_change_eval F]
[--decoy_pattern P] [--min_samples N] [--num_threads N]
[--ttest]
IN_FILE
[--decoy_pattern P] [--min_samples N] [--num_threads N]
[--ttest] [--write_spectrum_quants]
[--write_protein_posteriors P_OUT]
[--write_group_posteriors G_OUT]
[--write_fold_change_posteriors F_OUT]
IN_FILE

positional arguments:
IN_FILE List of PSMs with abundances (not log transformed!)
@@ -54,9 +64,23 @@ Usage
quantified in. (default: 2)
--num_threads N Number of threads, by default this is equal to the
number of CPU cores available on the device. (default:
auto detect)
8)
--ttest Use t-test for evaluating differential expression
instead of posterior probabilities. (default: False)
--write_spectrum_quants
Write quantifications for consensus spectra. Only
works if consensus spectrum indices are given in the input.
(default: False)
--write_protein_posteriors P_OUT
Write raw data of protein posteriors to the specified
file in TSV format. (default: )
--write_group_posteriors G_OUT
Write raw data of treatment group posteriors to the
specified file in TSV format. (default: )
--write_fold_change_posteriors F_OUT
Write raw data of fold change posteriors to the
specified file in TSV format. (default: )
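
For example, a hypothetical invocation that writes all three posterior
output files (the file names below are placeholders, not from this commit)::

    python -m triqler --write_protein_posteriors protein_posteriors.tsv \
                      --write_group_posteriors group_posteriors.tsv \
                      --write_fold_change_posteriors fold_change_posteriors.tsv \
                      input_psms.tsv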


Example
-------
1 change: 1 addition & 0 deletions run_test.sh
@@ -1,5 +1,6 @@
#!/bin/bash

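# OMP_NUM_THREADS caps the number of OpenMP/BLAS worker threads (numpy
# honors it); the value 4 here is presumably chosen to keep the test run
# lightweight and consistent across machines.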
export OMP_NUM_THREADS=4
# to replace the reference results execute the following line:
# rename -f 's/iPRG2016./iPRG2016_ref./g' example/iPRG2016.p*; mv example/iPRG2016.tsv.pqr.tsv example/iPRG2016_ref.tsv.pqr.tsv

65 changes: 39 additions & 26 deletions triqler/convert/maxquant.py
@@ -5,15 +5,19 @@

from __future__ import print_function

import sys
import collections

import numpy as np

from .. import parsers
from ..triqler import __version__, __copyright__

from . import normalize_intensities as normalize

def main():
print('Triqler.convert.maxquant version %s\n%s' % (__version__, __copyright__))
print('Issued command:', "maxquant.py " + " ".join(map(str, sys.argv[1:])))
args, params = parseArgs()

convertMqToTriqler(args.file_list_file, args.in_file, args.out_file, params)
@@ -41,24 +45,23 @@ def parseArgs():
help='Skip retention-time based intensity normalization.',
action='store_true')

apars.add_argument('--skip_link_pep',
help='Skips the linkPEP column from match-between-runs output.',
action='store_true')
#apars.add_argument('--skip_link_pep',
# help='Skips the linkPEP column from match-between-runs output.',
# action='store_true')

# ------------------------------------------------
args = apars.parse_args()

params = dict()
params['simpleOutputFormat'] = args.skip_link_pep
params['simpleOutputFormat'] = True
params['skipNormalization'] = args.skip_normalization
params['plotScatter'] = False

return args, params

def convertMqToTriqler(fileListFile, mqEvidenceFile, triqlerInputFile, params):
fileList, groups, groupLabels = parsers.parseFileList(fileListFile)

fileNameConditionPairs = [[x.split("/")[-1], parsers.getGroupLabel(idx, groups, groupLabels)] for idx, x in enumerate(fileList)]
fileInfoList = parsers.parseFileList(fileListFile)
fileList, _, sampleList, fractionList = zip(*fileInfoList)

writer = parsers.getTsvWriter(triqlerInputFile)
if params['simpleOutputFormat']:
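
A minimal sketch of the new file-list handling above, with made-up entries
(the exact tuple contents come from parsers.parseFileList, which is not part
of this diff; the fraction field is assumed to default to -1 when unassigned):

    fileInfoList = [
        ("runA_frac1.raw", "condition1", "sampleA", 1),
        ("runA_frac2.raw", "condition1", "sampleA", 2),
    ]
    fileList, _, sampleList, fractionList = zip(*fileInfoList)
    # fileList     -> ('runA_frac1.raw', 'runA_frac2.raw')
    # sampleList   -> ('sampleA', 'sampleA')
    # fractionList -> (1, 2)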
@@ -79,52 +82,62 @@ def convertMqToTriqler(fileListFile, mqEvidenceFile, triqlerInputFile, params):
postErrCol = headers.index('PEP')
rtCol = headers.index('Retention time')

fractionCol = headers.index('Fraction') if 'Fraction' in headers else -1
experimentCol = headers.index('Experiment')

print("Parsing MaxQuant evidence.txt file")
peptideToFeatureMap = collections.defaultdict(list)
for row in reader:
for lineIdx, row in enumerate(reader):
if lineIdx % 500000 == 0:
print(" Reading line", lineIdx)

proteins = row[proteinCol].split(";")

linkPEP = 0.0
key = (row[peptCol], row[chargeCol])

fileIdx = fileList.index(row[fileCol])
run, condition = fileNameConditionPairs[fileIdx]
run, condition, sample, fraction = fileInfoList[fileIdx]
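# Fallback for fractionated data: if the file list does not assign a
# fraction (fraction == -1) but evidence.txt has a 'Fraction' column, take
# sample and fraction from MaxQuant's 'Experiment' and 'Fraction' columns.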
if fraction == -1 and fractionCol != -1:
sample, fraction = row[experimentCol], row[fractionCol]

if key in peptideToFeatureMap:
featureClusterIdx = peptideToFeatureMap[key][0][0].featureClusterId
else:
featureClusterIdx = len(peptideToFeatureMap)

if row[intensityCol] == "":
if row[intensityCol] == "" or float(row[scoreCol]) <= 0:
continue
#print(row[scoreCol], row[intensityCol])
triqlerRow = parsers.TriqlerInputRow(run, condition, row[chargeCol], row[idCol], linkPEP, featureClusterIdx, float(row[scoreCol]), float(row[intensityCol]), row[peptCol], proteins)
peptideToFeatureMap[key].append((triqlerRow, float(row[rtCol])))

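# Log-transform MaxQuant's search engine score before storing it as
# search_score; rows with score <= 0 were skipped above, so np.log is safe.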
triqlerRow = parsers.TriqlerInputRow(sample, condition, row[chargeCol], row[idCol], linkPEP, featureClusterIdx, np.log(float(row[scoreCol])), float(row[intensityCol]), row[peptCol], proteins)
peptideToFeatureMap[key].append((triqlerRow, float(row[rtCol]), fraction))

if not params['skipNormalization']:
print("Applying retention-time dependent intensity normalization")
minRunsObservedIn = len(fileNameConditionPairs) / 3 + 2
factorPairs = normalize.getIntensityFactorPairs(peptideToFeatureMap.values(), sortKey = lambda x : (x[0].run, -1*x[0].intensity, x[1]), minRunsObservedIn = minRunsObservedIn)

if params['plotScatter']:
normalize.plotFactorScatter(factorPairs)

rTimeFactorArrays = normalize.getFactorArrays(factorPairs)

minRunsObservedIn = len(set(sampleList)) // 3 + 1
rTimeArrays, factorArrays = dict(), dict()
for key in rTimeFactorArrays:
rTimeArrays[key], factorArrays[key] = zip(*rTimeFactorArrays[key])
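# Normalization factors are now computed separately for each fraction,
# presumably because each fraction is acquired as its own LC-MS run with
# its own intensity bias.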
for fraction in sorted(list(set(fractionList))):
factorPairs = normalize.getIntensityFactorPairs(peptideToFeatureMap.values(), sortKey = lambda x : (x[2], x[0].run, -1*x[0].intensity, x[1]), minRunsObservedIn = minRunsObservedIn, fraction = fraction)
print("Fraction:", fraction, "#runs:", len(factorPairs))
if params['plotScatter']:
normalize.plotFactorScatter(factorPairs)

rTimeFactorArrays = normalize.getFactorArrays(factorPairs)

rTimeArrays[fraction], factorArrays[fraction] = dict(), dict()
for key in rTimeFactorArrays:
rTimeArrays[fraction][key], factorArrays[fraction][key] = zip(*rTimeFactorArrays[key])
else:
print("Skipping retention-time dependent intensity normalization")

for featureClusterIdx, featureCluster in enumerate(peptideToFeatureMap.values()):
if featureClusterIdx % 10000 == 0:
if featureClusterIdx % 50000 == 0:
print("Processing feature group", featureClusterIdx + 1)
newRows = selectBestScorePerRun(featureCluster)

for (row, rTime) in newRows:
for (row, rTime, fraction) in newRows:
if not params['skipNormalization']:
newIntensity = normalize.getNormalizedIntensity(rTimeArrays[row.run], factorArrays[row.run], rTime, row.intensity)
newIntensity = normalize.getNormalizedIntensity(rTimeArrays[fraction][row.run], factorArrays[fraction][row.run], rTime, row.intensity)
row = row._replace(intensity = newIntensity)

if params['simpleOutputFormat']:
32 changes: 19 additions & 13 deletions triqler/convert/normalize_intensities.py
@@ -11,7 +11,7 @@

def normalizeIntensitiesRtimeBased(clusterQuantExtraFile, clusterQuantExtraNormalizedFile, minRunsObservedIn, plotScatter = False, plotRunningAverage = False):
featureGroups = parsers.parseFeatureClustersFile(clusterQuantExtraFile)
factorPairs = getIntensityFactorPairs(featureGroups, sortKey = lambda x : (x.fileName, -1.0 * x.intensity, x.rTime), minRunsObservedIn = minRunsObservedIn)
factorPairs = getIntensityFactorPairs(featureGroups, sortKey = lambda x : (1, x.fileName, -1.0 * x.intensity, x.rTime), minRunsObservedIn = minRunsObservedIn)

if plotScatter:
plotFactorScatter(factorPairs)
@@ -23,16 +23,17 @@ def normalizeIntensitiesRtimeBased(clusterQuantExtraFile, clusterQuantExtraNorma

normalizeIntensitiesWithFactorArrays(clusterQuantExtraFile, rTimeFactorArrays, clusterQuantExtraNormalizedFile)

def getIntensityFactorPairs(featureGroups, sortKey, minRunsObservedIn):
def getIntensityFactorPairs(featureGroups, sortKey, minRunsObservedIn, fraction = 1):
factorPairs = defaultdict(list)
for i, featureGroup in enumerate(featureGroups):
localFactorPairs = dict()
sortedFeatures = sorted(featureGroup, key = sortKey)
for row in sortedFeatures:
fileName, intensity, rTime = sortKey(row)
rowFraction, fileName, intensity, rTime = sortKey(row)
intensity *= -1.0
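# sortKey returns the intensity negated so that ascending sort puts the
# most intense feature first per run; undo the sign flip here.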
if not np.isnan(intensity) and fileName not in localFactorPairs: # and row.charge == 2:
if fraction == rowFraction and not np.isnan(intensity) and fileName not in localFactorPairs:
localFactorPairs[fileName] = (np.log2(intensity), rTime)

if len(localFactorPairs) >= minRunsObservedIn:
keys = sorted(localFactorPairs.keys())
masterKey = keys[0]
@@ -41,24 +42,29 @@ def getIntensityFactorPairs(featureGroups, sortKey, minRunsObservedIn):
for key in keys[1:]:
factori = factorPairs[masterKey][-1][1] - (localFactorPairs[masterKey][0] - localFactorPairs[key][0])
factorPairs[key].append((localFactorPairs[key][1], factori))
if (i+1) % 10000 == 0:
print("Processing cluster", i+1)
#if (i+1) % 100000 == 0:
# print("Processing cluster", i+1)
return factorPairs

# returns running averages of factors
def getFactorArrays(factorPairs, N = 2000):
factorArrays = defaultdict(list)
for i, key in enumerate(sorted(factorPairs.keys())):
print("Calculating factor array", i+1)
print("Calculating normalization factors for run", i+1, "using", len(factorPairs[key]), "precursors")
factorPairs[key] = sorted(factorPairs[key], key = lambda x : x[0])
rTimes = [x[0] for x in factorPairs[key]]
factors = [x[1] for x in factorPairs[key]]
factorArrays[key] = zip(rTimes[int(N/2):int(-N/2)], runningMean(factors, N))
runningMeans = runningMean(factors, N)
factorArrays[key] = zip(rTimes, runningMeans)
return factorArrays

def runningMean(x, N):
cumsum = np.cumsum(np.insert(x, 0, 0))
return (cumsum[N:] - cumsum[:-N]) / N
if len(x) <= N:
return np.array([np.mean(x)]*len(x))
else:
cumsum = np.cumsum(np.insert(x, 0, 0))
rm = (cumsum[N:] - cumsum[:-N]) / N
return np.concatenate(([rm[0]]*(N//2), rm, [rm[-1]]*(N//2)))
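
A standalone sketch of the padded running mean with toy inputs (note that
for even N the padded output has len(x) + 1 entries; the zip against the
retention-time list in getFactorArrays above silently drops the surplus):

    import numpy as np

    def runningMean(x, N):
        if len(x) <= N:
            # short input: fall back to the global mean at every position
            return np.array([np.mean(x)] * len(x))
        cumsum = np.cumsum(np.insert(x, 0, 0))
        rm = (cumsum[N:] - cumsum[:-N]) / N
        # pad both edges with the first/last window mean
        return np.concatenate(([rm[0]] * (N // 2), rm, [rm[-1]] * (N // 2)))

    print(runningMean([1.0, 5.0, 3.0], 4))  # [3. 3. 3.] -- global mean
    print(runningMean(list(range(10)), 4))  # 11 values: [1.5 1.5 1.5 2.5 ... 7.5 7.5 7.5]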

def normalizeIntensitiesWithFactorArrays(clusterQuantExtraFile, rTimeFactorArrays, clusterQuantExtraNormalizedFile):
rTimeArrays, factorArrays = dict(), dict()
@@ -73,15 +79,15 @@ def normalizeIntensitiesWithFactorArrays(clusterQuantExtraFile, rTimeFactorArray
outRow[4] = getNormalizedIntensity(rTimeArrays[row.fileName], factorArrays[row.fileName], row.rTime, row.intensity)
writer.writerow(outRow)
writer.writerow([])
if (i+1) % 10000 == 0:
if (i+1) % 50000 == 0:
print("Writing cluster", i+1)

def getNormalizedIntensity(rTimeArray, factorArray, rTime, intensity):
rTimeIndex = min([bisect.bisect_left(rTimeArray, rTime), len(rTimeArray) - 1])
return intensity / (2 ** factorArray[rTimeIndex])
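
A toy illustration of the factor lookup (made-up numbers): bisect_left picks
the first retention time at or above the query, clamped to the last index,
and the intensity is divided by 2 raised to that log2 factor:

    rTimeArray = [10.0, 20.0, 30.0]  # sorted retention times for one run
    factorArray = [0.5, 1.0, -0.25]  # log2 normalization factors at those times

    # rTime 19.5 maps to index 1 (first retention time >= 19.5), factor 1.0:
    print(getNormalizedIntensity(rTimeArray, factorArray, 19.5, 1e6))  # 500000.0
    # queries past the end are clamped to the last factor (-0.25):
    print(getNormalizedIntensity(rTimeArray, factorArray, 99.0, 1e6))  # ~1189207.1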

def plotFactorScatter(factorPairs):
import scatter
from . import scatter
import matplotlib.pyplot as plt
scatter.prepareSubplots()
for i, key in enumerate(sorted(factorPairs.keys())):
@@ -98,7 +104,7 @@ def plotFactorScatter(factorPairs):
plt.show()

def plotFactorRunningAverage(factorArrays):
import scatter
from . import scatter
import matplotlib.pyplot as plt
scatter.prepareSubplots()
for i, key in enumerate(sorted(factorArrays.keys())):
15 changes: 7 additions & 8 deletions triqler/convert/quandenser.py
@@ -77,14 +77,13 @@ def parseArgs():
return args, params

def convertQuandenserToTriqler(fileListFile, clusterQuantFile, psmsOutputFiles, peptQuantRowFile, params):
fileList, groups, groupLabels = parsers.parseFileList(fileListFile)
fileNameConditionPairs = [[x.split("/")[-1], parsers.getGroupLabel(idx, groups, groupLabels)] for idx, x in enumerate(fileList)]
fileInfoList = parsers.parseFileList(fileListFile)

if not params['skipNormalization']:
clusterQuantFileNormalized = clusterQuantFile.replace(".tsv", ".normalized.tsv")
if not os.path.isfile(clusterQuantFileNormalized):
print("Applying retention-time dependent intensity normalization")
minRunsObservedIn = len(fileNameConditionPairs) / 3 + 2
minRunsObservedIn = len(fileInfoList) // 3 + 1
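# A feature group must be observed in at least this many runs to
# contribute retention-time normalization factors.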
normalize.normalizeIntensitiesRtimeBased(clusterQuantFile, clusterQuantFileNormalized, minRunsObservedIn, plotScatter = params['plotScatter'])
else:
print("Reusing previously generated normalized feature group file:", clusterQuantFileNormalized, ". Remove this file to redo normalization")
@@ -94,7 +93,7 @@ def convertQuandenserToTriqler(fileListFile, clusterQuantFile, psmsOutputFiles,

specToPeptideMap = parsePsmsPoutFiles(psmsOutputFiles)

printTriqlerInputFile(fileNameConditionPairs, clusterQuantFile, peptQuantRowFile, specToPeptideMap, params)
printTriqlerInputFile(fileInfoList, clusterQuantFile, peptQuantRowFile, specToPeptideMap, params)

def parsePsmsPoutFiles(psmsOutputFiles):
specToPeptideMap = collections.defaultdict(list)
@@ -110,7 +109,7 @@ def parsePeptideLinkPEP(peptLinkPEP):
spectrumIdx, linkPEP = peptLinkPEP.split(";")
return int(spectrumIdx), float(linkPEP)

def printTriqlerInputFile(fileNameConditionPairs, clusterQuantFile, quantRowFile, specToPeptideMap, params):
def printTriqlerInputFile(fileInfoList, clusterQuantFile, quantRowFile, specToPeptideMap, params):
print("Parsing cluster quant file")

writer = parsers.getTsvWriter(quantRowFile)
@@ -122,7 +121,7 @@ def printTriqlerInputFile(fileNameConditionPairs, clusterQuantFile, quantRowFile
featureClusterRows = list()
spectrumToFeatureMatch = dict() # stores the best peptideQuantRow per (peptide, spectrumIdx)-pair
for featureClusterIdx, featureCluster in enumerate(parsers.parseFeatureClustersFile(clusterQuantFile)):
if featureClusterIdx % 10000 == 0:
if featureClusterIdx % 50000 == 0:
print("Processing feature group", featureClusterIdx + 1)

rows = list()
@@ -133,8 +132,8 @@ def printTriqlerInputFile(fileNameConditionPairs, clusterQuantFile, quantRowFile
peptide, identPEP, proteins, searchScore, charge = specToPeptideMap(spectrumIdx)
if pc.intensity > 0.0 and linkPEP < 1.0 and (params["retainUnidentified"] or peptide != "NA"):
# run condition charge spectrumId linkPEP featureClusterId search_score intensity peptide proteins
run, condition = fileNameConditionPairs[fileIdx]
row = parsers.TriqlerInputRow(run, condition, charge, spectrumIdx, linkPEP, featureClusterIdx, searchScore, pc.intensity, peptide, proteins)
run, condition, sample, fraction = fileInfoList[fileIdx]
row = parsers.TriqlerInputRow(sample, condition, charge, spectrumIdx, linkPEP, featureClusterIdx, searchScore, pc.intensity, peptide, proteins)
rows.append(row)

newRows = list()
13 changes: 10 additions & 3 deletions triqler/hyperparameters.py
@@ -85,12 +85,19 @@ def fitLogitNormal(observedValues, params, plot):
bins = bins[:-1]
popt, _ = curve_fit(funcLogitNormal, bins, vals, p0 = (m, s, m - s, s))

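# popt holds (muDetect, sigmaDetect, muXIC, sigmaXIC); sanity-check the
# fitted detection parameters against the fitted XIC distribution.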
resetXICHyperparameters = False
if popt[0] < popt[2] - 5*popt[3] or popt[0] > popt[2] + 5*popt[3]:
print(" Warning: muDetect outside of expected region [", popt[2] - 5*popt[3] , ",", popt[2] + 5*popt[3], ":", popt[0], " setting to default value of muXIC - 1.0 =", popt[2] - 1.0)
popt[0] = popt[2] - 1.0
print(" Warning: muDetect outside of expected region [", popt[2] - 5*popt[3] , ",", popt[2] + 5*popt[3], "]:", popt[0], ".")
resetXICHyperparameters = True

if popt[1] < 0.1 or popt[1] > 2.0:
print(" Warning: sigmaDetect outside of expected region [0.1,2.0]:" , popt[1], " setting to default value of 0.3")
print(" Warning: sigmaDetect outside of expected region [0.1,2.0]:" , popt[1], ".")
resetXICHyperparameters = True

if resetXICHyperparameters:
print(" Resetting mu/sigmaDetect hyperparameters to default values of muDetect = muXIC - 1.0 and sigmaDetect = 0.3")
popt[1] = 0.3
popt[0] = popt[2] - 1.0

#print(" params[\"muDetectInit\"], params[\"sigmaDetectInit\"] = %f, %f" % (popt[0], popt[1]))
print(" params[\"muDetect\"], params[\"sigmaDetect\"] = %f, %f" % (popt[0], popt[1]))
(diff for the remaining 2 changed files not shown)
