From 5de47ec9883940d983c53a49b589f0f4de2bdf3e Mon Sep 17 00:00:00 2001
From: Eric Zhou
Date: Wed, 2 Aug 2023 15:27:03 -0700
Subject: [PATCH 1/3] #634 scoring overhaul

---
 __test__/unittest/score.test.js       | 117 +++++++++++++++++++++++++
 src/config.js                         |   9 ++
 src/results_assembly/query_results.js |  21 +++--
 src/results_assembly/score.js         | 119 +++++++++++++++++---------
 4 files changed, 220 insertions(+), 46 deletions(-)
 create mode 100644 __test__/unittest/score.test.js

diff --git a/__test__/unittest/score.test.js b/__test__/unittest/score.test.js
new file mode 100644
index 00000000..c4bf12b3
--- /dev/null
+++ b/__test__/unittest/score.test.js
@@ -0,0 +1,117 @@
+const { calculateScore, exportForTesting } = require('../../src/results_assembly/score');
+const { record_weight, text_mined_record_weight, ngd_weight, LENGTH_PENALTY, scaled_sigmoid } = exportForTesting;
+
+describe('Test score function', () => {
+  const ngdPairs = {
+    'C0678941-C0267841': 0.5,
+    'C4548369-C0678941': 0.6,
+    'C4548369-C0267841': 0.7
+  };
+
+  const sampleComboSimple = [
+    {
+      inputQNodeID: 'nB',
+      outputQNodeID: 'nC',
+      inputPrimaryCuries: new Set(['UMLS:C0678941']),
+      outputPrimaryCuries: new Set(['MONDO:0006633']),
+      inputUMLS: new Set(['C0678941']),
+      outputUMLS: new Set(['C0267841']),
+      isTextMined: [ true ],
+      qEdgeID: 'eB',
+      recordHashes: new Set(['a'])
+    },
+    {
+      inputQNodeID: 'nA',
+      outputQNodeID: 'nB',
+      inputPrimaryCuries: new Set(['PUBCHEM.COMPOUND:77843966']),
+      outputPrimaryCuries: new Set(['UMLS:C0678941']),
+      inputUMLS: new Set(['C4548369']),
+      outputUMLS: new Set(['C0678941']),
+      isTextMined: [ true ],
+      qEdgeID: 'eA',
+      recordHashes: new Set(['b'])
+    }
+  ];
+
+  const sampleComboComplex = [
+    {
+      inputQNodeID: 'nB',
+      outputQNodeID: 'nC',
+      inputPrimaryCuries: new Set(['UMLS:C0678941']),
+      outputPrimaryCuries: new Set(['MONDO:0006633']),
+      inputUMLS: new Set(['C0678941']),
+      outputUMLS: new Set(['C0267841']),
+      isTextMined: [ true, false, true ],
+      qEdgeID: 'eB',
+      recordHashes: new Set(['a', 'b', 'c'])
+    },
+    {
+      inputQNodeID: 'nA',
+      outputQNodeID: 'nB',
+      inputPrimaryCuries: new Set(['PUBCHEM.COMPOUND:77843966']),
+      outputPrimaryCuries: new Set(['UMLS:C0678941']),
+      inputUMLS: new Set(['C4548369']),
+      outputUMLS: new Set(['C0678941']),
+      isTextMined: [ true, true, true ],
+      qEdgeID: 'eA',
+      recordHashes: new Set(['b', 'c', 'd'])
+    },
+    {
+      inputQNodeID: 'nA',
+      outputQNodeID: 'nC',
+      inputPrimaryCuries: new Set(['PUBCHEM.COMPOUND:77843966']),
+      outputPrimaryCuries: new Set(['MONDO:0006633']),
+      inputUMLS: new Set(['C4548369']),
+      outputUMLS: new Set(['C0267841']),
+      isTextMined: [ false, false ],
+      qEdgeID: 'eC',
+      recordHashes: new Set(['c', 'd'])
+    }
+  ];
+
+  test('Test calculateScore function - simple case w/ ngd', () => {
+    const eAScore = text_mined_record_weight + ngd_weight * (1 / ngdPairs['C4548369-C0678941']);
+    const eBScore = text_mined_record_weight + ngd_weight * (1 / ngdPairs['C0678941-C0267841']);
+    const expected_score = scaled_sigmoid((eBScore + eAScore) / Math.pow(2, LENGTH_PENALTY));
+
+    const res = calculateScore(sampleComboSimple, ngdPairs);
+    expect(res.score).toBe(expected_score);
+    expect(res.scoredByNGD).toBeTruthy();
+  });
+
+  test('Test calculateScore function - simple case w/o ngd', () => {
+    const eAScore = text_mined_record_weight;
+    const eBScore = text_mined_record_weight;
+    const expected_score = scaled_sigmoid((eBScore + eAScore) / Math.pow(2, LENGTH_PENALTY));
+
+    const res = calculateScore(sampleComboSimple, {});
+    expect(res.score).toBe(expected_score);
+    expect(res.scoredByNGD).toBeFalsy();
+  });
+
+  test('Test calculateScore function - complex case w/ ngd', () => {
+    const eAScore = 2 * text_mined_record_weight + 1 * record_weight + ngd_weight * (1 / ngdPairs['C4548369-C0678941']);
+    const eBScore = 3 * text_mined_record_weight + 0 * record_weight + ngd_weight * (1 / ngdPairs['C0678941-C0267841']);
+    const eCScore = 0 * text_mined_record_weight + 2 * record_weight + ngd_weight * (1 / ngdPairs['C4548369-C0267841']);
+
+    const expected_score = scaled_sigmoid((eBScore + eAScore) / Math.pow(2, LENGTH_PENALTY) + eCScore / Math.pow(1, LENGTH_PENALTY));
+
+    const res = calculateScore(sampleComboComplex, ngdPairs);
+    expect(res.score).toBe(expected_score);
+    expect(res.scoredByNGD).toBeTruthy();
+  });
+
+  test('Test calculateScore function - complex case w/o ngd', () => {
+    const eAScore = 2 * text_mined_record_weight + 1 * record_weight;
+    const eBScore = 3 * text_mined_record_weight + 0 * record_weight;
+    const eCScore = 0 * text_mined_record_weight + 2 * record_weight;
+
+    const expected_score = scaled_sigmoid((eBScore + eAScore) / Math.pow(2, LENGTH_PENALTY) + eCScore / Math.pow(1, LENGTH_PENALTY));
+
+    const res = calculateScore(sampleComboComplex, {});
+    expect(res.score).toBe(expected_score);
+    expect(res.scoredByNGD).toBeFalsy();
+  });
+});
+
+
diff --git a/src/config.js b/src/config.js
index 871d9b47..e16365c5 100644
--- a/src/config.js
+++ b/src/config.js
@@ -60,3 +60,12 @@ exports.EDGE_ATTRIBUTES_USED_IN_RECORD_HASH = [
   "biolink:log_odds_ratio",
   "biolink:total_sample_size",
 ];
+
+// based on https://github.com/biolink/biolink-model/blob/master/infores_catalog.yaml
+exports.text_mining_api_infores = [
+  'infores:biothings-semmeddb',
+  'infores:scibite',
+  'infores:semmeddb',
+  'infores:text-mining-provider-cooccurrence',
+  'infores:text-mining-provider-targeted'
+];
diff --git a/src/results_assembly/query_results.js b/src/results_assembly/query_results.js
index 910cb697..40aff8b9 100644
--- a/src/results_assembly/query_results.js
+++ b/src/results_assembly/query_results.js
@@ -4,6 +4,7 @@ const LogEntry = require('../log_entry');
 const { getScores, calculateScore } = require('./score');
 const { Record } = require('@biothings-explorer/api-response-transform');
 const { enrichTrapiResultsWithPfocrFigures } = require('./pfocr');
+const config = require('../config');
 /**
  * @type { Record }
  */
@@ -171,8 +172,11 @@ module.exports = class TrapiResultsAssembler {
           outputQNodeID: record.object.qNodeID,
           inputPrimaryCurie: record.subject.curie,
           outputPrimaryCurie: record.object.curie,
-          inputUMLS: record.subject.UMLS, //add umls for scoring
-          outputUMLS: record.object.UMLS, //add umls for scoring
+          // info for scoring
+          inputUMLS: record.subject.UMLS,
+          outputUMLS: record.object.UMLS,
+          isTextMined: config.text_mining_api_infores.includes(record.apiInforesCurie),
+          // end info for scoring
           qEdgeID: qEdgeID,
           recordHash: record.recordHash,
         });
@@ -361,18 +365,23 @@ module.exports = class TrapiResultsAssembler {
       const consolidatedSolutionRecord = {
         inputQNodeID: solutionRecord_0.inputQNodeID,
         outputQNodeID: solutionRecord_0.outputQNodeID,
-        inputUMLS: solutionRecord_0.inputUMLS,
-        outputUMLS: solutionRecord_0.outputUMLS,
         inputPrimaryCuries: new Set(),
         outputPrimaryCuries: new Set(),
+        inputUMLS: new Set(),
+        outputUMLS: new Set(),
+        isTextMined: [],
         qEdgeID: solutionRecord_0.qEdgeID,
         recordHashes: new Set(),
       };
       solutionRecords.forEach(
-        ({ inputQNodeID, outputQNodeID, inputPrimaryCurie, outputPrimaryCurie, qEdgeID, recordHash }) => {
-          //debug(`inputQNodeID: ${inputQNodeID}, inputPrimaryCurie: ${inputPrimaryCurie}, outputQNodeID ${outputQNodeID}, outputPrimaryCurie: ${outputPrimaryCurie}`)
+        ({ inputQNodeID, outputQNodeID, inputPrimaryCurie, outputPrimaryCurie, inputUMLS, outputUMLS, isTextMined, qEdgeID, recordHash }) => {
           consolidatedSolutionRecord.inputPrimaryCuries.add(inputPrimaryCurie);
           consolidatedSolutionRecord.outputPrimaryCuries.add(outputPrimaryCurie);
+          consolidatedSolutionRecord.inputUMLS.add(...inputUMLS);
+          consolidatedSolutionRecord.outputUMLS.add(...outputUMLS);
+          if (!consolidatedSolutionRecord.recordHashes.has(recordHash)) {
+            consolidatedSolutionRecord.isTextMined.push(isTextMined);
+          }
           consolidatedSolutionRecord.recordHashes.add(recordHash);
         },
       );
diff --git a/src/results_assembly/score.js b/src/results_assembly/score.js
index 62be7041..cf26abed 100644
--- a/src/results_assembly/score.js
+++ b/src/results_assembly/score.js
@@ -2,8 +2,15 @@ const debug = require('debug')('bte:biothings-explorer-trapi:Score');
 const axios = require('axios');
 const _ = require('lodash');
 
-const tuning_param = 1.1;
+const tuning_param = 2.0;
+
+const record_weight = 1.0;
+const text_mined_record_weight = 0.5;
+const ngd_weight = 0.25;
+const LENGTH_PENALTY = 2.0;
+
+// create lookup table for ngd scores in the format: {inputUMLS-outputUMLS: ngd}
 
 async function query(queryPairs) {
   const url = 'https://biothings.ncats.io/semmeddb/query/ngd';
   const batchSize = 1000;
@@ -21,12 +28,13 @@ async function query(queryPairs) {
     //convert res array into single object with all curies
     let res = await Promise.all(axios_queries);
     res = res.map((r) => r.data.filter((combo) => Number.isFinite(combo.ngd))).flat(); // get numerical scores and flatten array
-    return res;
+    return res.reduce((acc, cur) => ({...acc, [`${cur.umls[0]}-${cur.umls[1]}`]: cur.ngd}), {});
   } catch (err) {
     debug('Failed to query for scores: ', err);
   }
 }
 
+// retrieve all ngd scores at once
 async function getScores(recordsByQEdgeID) {
   let pairs = {};

@@ -62,28 +70,9 @@ async function getScores(recordsByQEdgeID) {
   let results = await query(queries);
   debug('Combos no UMLS ID: ', combosWithoutIDs);
-  return results || []; // in case results is undefined, avoid TypeErrors
+  return results || {}; // in case results is undefined, avoid TypeErrors
 }
 
-// //multiply the inverses of the ngds together to get the total score for a combo
-// function calculateScore(comboInfo, scoreCombos) {
-//   let score = 1;
-
-//   Object.keys(comboInfo).forEach((edgeKey) => {
-//     let multiplier = 0;
-
-//     for (const combo of scoreCombos) {
-//       if (comboInfo[edgeKey].inputUMLS?.includes(combo.umls[0]) && comboInfo[edgeKey].outputUMLS?.includes(combo.umls[1])) {
-//         multiplier = Math.max(1/combo.ngd, multiplier);
-//       }
-//     }
-
-//     score *= multiplier;
-//   })
-
-//   return score;
-// }
-
 // sigmoid function scaled from 0 to 1
 function scaled_sigmoid(input) {
   const tuned_input = Math.max(input, 0) / tuning_param;
@@ -91,31 +80,81 @@ function scaled_sigmoid(input) {
   return sigmoid * 2 - 1;
 }
 
-function reverse_scaled_sigmoid(score) {
-  const unscaled_sigmoid = (score + 1) / 2;
-  const tuned_input = -Math.log(1 / unscaled_sigmoid - 1);
-  return tuned_input * tuning_param;
-}
-
-//addition of scores
 function calculateScore(comboInfo, scoreCombos) {
-  let score = 0.1;
+  const sum = array => array.reduce((a, b) => a + b, 0);
+  const average = array => array.length ? sum(array) / array.length : 0;
+
+  let score = 0;
   let scoredByNGD = false;
-  Object.keys(comboInfo).forEach((edgeKey) => {
-    score += 0.05 * comboInfo[edgeKey].recordHashes.size;
-    for (const combo of scoreCombos) {
-      if (
-        comboInfo[edgeKey].inputUMLS?.includes(combo.umls[0]) &&
-        comboInfo[edgeKey].outputUMLS?.includes(combo.umls[1])
-      ) {
-        score += 1 / combo.ngd;
+  let edgeScores = {};
+  let nodeDegrees = {};
+  let edgesStartingFromNode = {};
+  for (const [idx, edge] of comboInfo.entries()) {
+    // keep track of indegrees and outdegrees to find start and end nodes later
+    if (nodeDegrees.hasOwnProperty(edge.inputQNodeID)) {
+      nodeDegrees[edge.inputQNodeID].out += 1;
+    } else {
+      nodeDegrees[edge.inputQNodeID] = { in: 0, out: 1 };
+    }
+
+    if (nodeDegrees.hasOwnProperty(edge.outputQNodeID)) {
+      nodeDegrees[edge.outputQNodeID].in += 1;
+    } else {
+      nodeDegrees[edge.outputQNodeID] = { in: 1, out: 0 };
+    }
+
+    // track edge connections to find paths
+    if (edgesStartingFromNode.hasOwnProperty(edge.inputQNodeID)) {
+      edgesStartingFromNode[edge.inputQNodeID].push(idx);
+    } else {
+      edgesStartingFromNode[edge.inputQNodeID] = [idx];
+    }
+
+    let record_scores = edge.isTextMined.reduce((acc, val) => (
+      acc + (val ? text_mined_record_weight : record_weight)
+    ), 0);
+
+    // compute ngd score for node pair
+    pairs = [];
+    edge.inputUMLS.forEach((inputUMLS) => {
+      edge.outputUMLS.forEach((outputUMLS) => {
+        pairs.push(`${inputUMLS}-${outputUMLS}`);
+      });
+    });
+    ngd_scores = [];
+    pairs.forEach((pair) => {
+      if (scoreCombos.hasOwnProperty(pair)) {
+        ngd = scoreCombos[pair];
+        ngd_scores.push(1 / ngd);
         scoredByNGD = true;
       }
-    }
-  });
+    });
+
+    edgeScores[idx] = ngd_weight * average(ngd_scores) + record_scores;
+  }
+  //bfs to find paths
+  let startNode = Object.keys(nodeDegrees).find(node => nodeDegrees[node].in === 0);
+  let endNode = Object.keys(nodeDegrees).find(node => nodeDegrees[node].out === 0);
+
+  let queue = [[startNode, 0, 0]];
+
+  while (queue.length > 0) {
+    let node, path_score, path_length;
+    [node, path_score, path_length] = queue.shift();
+    if (node === endNode) {
+      score += path_score / Math.pow(path_length, LENGTH_PENALTY);
+    } else if (edgesStartingFromNode.hasOwnProperty(node)) {
+      for (let edgeIdx of edgesStartingFromNode[node]) {
+        queue.push([comboInfo[edgeIdx].outputQNodeID, path_score + edgeScores[edgeIdx], path_length + 1]);
+      }
+    }
+  }
   return { score: scaled_sigmoid(score), scoredByNGD };
 }
 
 module.exports.getScores = getScores;
 module.exports.calculateScore = calculateScore;
+module.exports.exportForTesting = {
+  record_weight, text_mined_record_weight, ngd_weight, LENGTH_PENALTY, scaled_sigmoid
+};

From ff671fcc7760c54d4c225dbc6a7c6fb11347bd71 Mon Sep 17 00:00:00 2001
From: Eric Zhou
Date: Wed, 2 Aug 2023 15:53:10 -0700
Subject: [PATCH 2/3] fix scoring logs

---
 src/inferred_mode/inferred_mode.js    | 19 -------------------
 src/results_assembly/query_results.js |  4 ++--
 2 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/src/inferred_mode/inferred_mode.js b/src/inferred_mode/inferred_mode.js
index 875d22a0..0f6139b9 100644
--- a/src/inferred_mode/inferred_mode.js
+++ b/src/inferred_mode/inferred_mode.js
@@ -519,25 +519,6 @@ module.exports = class InferredQueryHandler {
       this.parent
         .getSummaryLog(combinedResponse, combinedResponse.logs, resultQueries)
         .forEach((log) => combinedResponse.logs.push(log));
-      let scoredResults = 0;
-      let unscoredResults = 0;
-      combinedResponse.message.results.forEach((result) => {
-        const scoreFromEdges = Object.values(result.analyses[0].edge_bindings).reduce((count, qEdge_bindings) => {
-          return count + qEdge_bindings.length;
-        }, 0);
-        if (result.analyses[0].score > scoreFromEdges) {
-          scoredResults += 1;
-        } else {
-          unscoredResults += 1;
-        }
-      });
-      combinedResponse.logs.push(
-        new LogEntry(
-          'INFO',
-          null,
-          `Scoring Summary: (${scoredResults}) scored / (${unscoredResults}) unscored`,
-        ).getLog(),
-      );
     }
 
     combinedResponse.logs = combinedResponse.logs.map((log) => log.toJSON());
diff --git a/src/results_assembly/query_results.js b/src/results_assembly/query_results.js
index 40aff8b9..b8d17826 100644
--- a/src/results_assembly/query_results.js
+++ b/src/results_assembly/query_results.js
@@ -452,12 +452,12 @@ module.exports = class TrapiResultsAssembler {
       debug('Error enriching with PFOCR figures: ', err);
       this.logs.push(new LogEntry('DEBUG', null, 'Error enriching with PFOCR figures: ', err).getLog());
     }
-    debug(`Successfully scored ${resultsWithScore} results, couldn't score ${resultsWithoutScore} results.`);
+    debug(`Scored ${resultsWithScore} results with NGD score, scored ${resultsWithoutScore} results without NGD.`);
     this.logs.push(
       new LogEntry(
         'DEBUG',
         null,
-        `Successfully scored ${resultsWithScore} results, couldn't score ${resultsWithoutScore} results.`,
+        `Scored ${resultsWithScore} results with NGD score, scored ${resultsWithoutScore} results without NGD.`,
         {
           type: 'scoring',
           scored: resultsWithScore,

From f3ca0aad9d7d6b02f540e42756a7b950d5580cd3 Mon Sep 17 00:00:00 2001
From: Eric Zhou
Date: Thu, 3 Aug 2023 15:12:28 -0700
Subject: [PATCH 3/3] add scores in inferred mode instead of taking max

---
 src/inferred_mode/inferred_mode.js | 4 ++--
 src/results_assembly/score.js      | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/inferred_mode/inferred_mode.js b/src/inferred_mode/inferred_mode.js
index 0f6139b9..454794f6 100644
--- a/src/inferred_mode/inferred_mode.js
+++ b/src/inferred_mode/inferred_mode.js
@@ -4,7 +4,7 @@ const utils = require('../utils');
 const async = require('async');
 const biolink = require('../biolink');
 const { getTemplates } = require('./template_lookup');
-const { addNormalizedScores } = require('../results_assembly/score');
+const { scaled_sigmoid, inverse_scaled_sigmoid } = require('../results_assembly/score');
 
 module.exports = class InferredQueryHandler {
   constructor(parent, TRAPIQueryHandler, queryGraph, logs, options, path, predicatePath, includeReasoner) {
@@ -304,7 +304,7 @@ module.exports = class InferredQueryHandler {
         const resScore = translatedResult.analyses[0].score;
         if (typeof combinedResponse.message.results[resultID].analyses[0].score !== 'undefined') {
           combinedResponse.message.results[resultID].analyses[0].score = resScore
-            ? Math.max(combinedResponse.message.results[resultID].analyses[0].score, resScore)
+            ? scaled_sigmoid(inverse_scaled_sigmoid(combinedResponse.message.results[resultID].analyses[0].score) + inverse_scaled_sigmoid(resScore))
             : combinedResponse.message.results[resultID].analyses[0].score;
         } else {
           combinedResponse.message.results[resultID].analyses[0].score = resScore;
diff --git a/src/results_assembly/score.js b/src/results_assembly/score.js
index cf26abed..d3f8780c 100644
--- a/src/results_assembly/score.js
+++ b/src/results_assembly/score.js
@@ -80,6 +80,10 @@ function scaled_sigmoid(input) {
   return sigmoid * 2 - 1;
 }
 
+function inverse_scaled_sigmoid(input) {
+  return -tuning_param * Math.log(2 / (input + 1) - 1);
+}
+
 function calculateScore(comboInfo, scoreCombos) {
   const sum = array => array.reduce((a, b) => a + b, 0);
   const average = array => array.length ? sum(array) / array.length : 0;
@@ -155,6 +159,8 @@ function calculateScore(comboInfo, scoreCombos) {
 
 module.exports.getScores = getScores;
 module.exports.calculateScore = calculateScore;
+module.exports.scaled_sigmoid = scaled_sigmoid;
+module.exports.inverse_scaled_sigmoid = inverse_scaled_sigmoid;
 module.exports.exportForTesting = {
   record_weight, text_mined_record_weight, ngd_weight, LENGTH_PENALTY, scaled_sigmoid
 };
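Note: patch 3 merges per-template result scores by adding them in pre-sigmoid space rather than taking the maximum. A minimal standalone sketch of that behavior, using the scaled_sigmoid / inverse_scaled_sigmoid definitions and tuning_param from score.js above; the example scores 0.3 and 0.4 are arbitrary and not taken from the patches:

// same shape as scaled_sigmoid in src/results_assembly/score.js: maps [0, Inf) onto [0, 1)
const tuning_param = 2.0;
const scaled_sigmoid = (input) => 2 / (1 + Math.exp(-Math.max(input, 0) / tuning_param)) - 1;

// inverse of the above for inputs >= 0, as added in patch 3
const inverse_scaled_sigmoid = (score) => -tuning_param * Math.log(2 / (score + 1) - 1);

// two results for the same binding, scored 0.3 and 0.4 by separate template queries:
// un-squash both scores, add them, and re-apply the sigmoid, so corroborating evidence
// pushes the combined score above either input while keeping it below 1
const combined = scaled_sigmoid(inverse_scaled_sigmoid(0.3) + inverse_scaled_sigmoid(0.4));
console.log(combined.toFixed(3)); // ~0.625, vs. 0.4 under the previous Math.max() approach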