From 5de47ec9883940d983c53a49b589f0f4de2bdf3e Mon Sep 17 00:00:00 2001
From: Eric Zhou
Date: Wed, 2 Aug 2023 15:27:03 -0700
Subject: [PATCH 1/3] #634 scoring overhaul

---
 __test__/unittest/score.test.js       | 117 +++++++++++++++++++++++++
 src/config.js                         |   9 ++
 src/results_assembly/query_results.js |  21 +++--
 src/results_assembly/score.js         | 119 +++++++++++++++++---------
 4 files changed, 220 insertions(+), 46 deletions(-)
 create mode 100644 __test__/unittest/score.test.js

diff --git a/__test__/unittest/score.test.js b/__test__/unittest/score.test.js
new file mode 100644
index 00000000..c4bf12b3
--- /dev/null
+++ b/__test__/unittest/score.test.js
@@ -0,0 +1,117 @@
+const { calculateScore, exportForTesting } = require('../../src/results_assembly/score');
+const { record_weight, text_mined_record_weight, ngd_weight, LENGTH_PENALTY, scaled_sigmoid } = exportForTesting;
+
+describe('Test score function', () => {
+  const ngdPairs = {
+    'C0678941-C0267841': 0.5,
+    'C4548369-C0678941': 0.6,
+    'C4548369-C0267841': 0.7
+  };
+
+  const sampleComboSimple = [
+    {
+      inputQNodeID: 'nB',
+      outputQNodeID: 'nC',
+      inputPrimaryCuries: new Set(['UMLS:C0678941']),
+      outputPrimaryCuries: new Set(['MONDO:0006633']),
+      inputUMLS: new Set(['C0678941']),
+      outputUMLS: new Set(['C0267841']),
+      isTextMined: [ true ],
+      qEdgeID: 'eB',
+      recordHashes: new Set(['a'])
+    },
+    {
+      inputQNodeID: 'nA',
+      outputQNodeID: 'nB',
+      inputPrimaryCuries: new Set(['PUBCHEM.COMPOUND:77843966']),
+      outputPrimaryCuries: new Set(['UMLS:C0678941']),
+      inputUMLS: new Set(['C4548369']),
+      outputUMLS: new Set(['C0678941']),
+      isTextMined: [ true ],
+      qEdgeID: 'eA',
+      recordHashes: new Set(['b'])
+    }
+  ];
+
+  const sampleComboComplex = [
+    {
+      inputQNodeID: 'nB',
+      outputQNodeID: 'nC',
+      inputPrimaryCuries: new Set(['UMLS:C0678941']),
+      outputPrimaryCuries: new Set(['MONDO:0006633']),
+      inputUMLS: new Set(['C0678941']),
+      outputUMLS: new Set(['C0267841']),
+      isTextMined: [ true, false, true ],
+      qEdgeID: 'eB',
+      recordHashes: new Set(['a', 'b', 'c'])
+    },
+    {
+      inputQNodeID: 'nA',
+      outputQNodeID: 'nB',
+      inputPrimaryCuries: new Set(['PUBCHEM.COMPOUND:77843966']),
+      outputPrimaryCuries: new Set(['UMLS:C0678941']),
+      inputUMLS: new Set(['C4548369']),
+      outputUMLS: new Set(['C0678941']),
+      isTextMined: [ true, true, true ],
+      qEdgeID: 'eA',
+      recordHashes: new Set(['b', 'c', 'd'])
+    },
+    {
+      inputQNodeID: 'nA',
+      outputQNodeID: 'nC',
+      inputPrimaryCuries: new Set(['PUBCHEM.COMPOUND:77843966']),
+      outputPrimaryCuries: new Set(['MONDO:0006633']),
+      inputUMLS: new Set(['C4548369']),
+      outputUMLS: new Set(['C0267841']),
+      isTextMined: [ false, false ],
+      qEdgeID: 'eC',
+      recordHashes: new Set(['c', 'd'])
+    }
+  ];
+
+  test('Test calculateScore function - simple case w/ ngd', () => {
+    const eAScore = text_mined_record_weight + ngd_weight * (1 / ngdPairs['C4548369-C0678941']);
+    const eBScore = text_mined_record_weight + ngd_weight * (1 / ngdPairs['C0678941-C0267841']);
+    const expected_score = scaled_sigmoid((eBScore + eAScore) / Math.pow(2, LENGTH_PENALTY));
+
+    const res = calculateScore(sampleComboSimple, ngdPairs);
+    expect(res.score).toBe(expected_score);
+    expect(res.scoredByNGD).toBeTruthy();
+  });
+
+  test('Test calculateScore function - simple case w/o ngd', () => {
+    const eAScore = text_mined_record_weight;
+    const eBScore = text_mined_record_weight;
+    const expected_score = scaled_sigmoid((eBScore + eAScore) / Math.pow(2, LENGTH_PENALTY));
+
+    const res = calculateScore(sampleComboSimple, {});
+    expect(res.score).toBe(expected_score);
+    expect(res.scoredByNGD).toBeFalsy();
+  });
+
+  test('Test calculateScore function - complex case w/ ngd', () => {
+    const eAScore = 2 * text_mined_record_weight + 1 * record_weight + ngd_weight * (1 / ngdPairs['C4548369-C0678941']);
+    const eBScore = 3 * text_mined_record_weight + 0 * record_weight + ngd_weight * (1 / ngdPairs['C0678941-C0267841']);
+    const eCScore = 0 * text_mined_record_weight + 2 * record_weight + ngd_weight * (1 / ngdPairs['C4548369-C0267841']);
+
+    const expected_score = scaled_sigmoid((eBScore + eAScore) / Math.pow(2, LENGTH_PENALTY) + eCScore / Math.pow(1, LENGTH_PENALTY));
+
+    const res = calculateScore(sampleComboComplex, ngdPairs);
+    expect(res.score).toBe(expected_score);
+    expect(res.scoredByNGD).toBeTruthy();
+  });
+
+  test('Test calculateScore function - complex case w/o ngd', () => {
+    const eAScore = 2 * text_mined_record_weight + 1 * record_weight;
+    const eBScore = 3 * text_mined_record_weight + 0 * record_weight;
+    const eCScore = 0 * text_mined_record_weight + 2 * record_weight;
+
+    const expected_score = scaled_sigmoid((eBScore + eAScore) / Math.pow(2, LENGTH_PENALTY) + eCScore / Math.pow(1, LENGTH_PENALTY));
+
+    const res = calculateScore(sampleComboComplex, {});
+    expect(res.score).toBe(expected_score);
+    expect(res.scoredByNGD).toBeFalsy();
+  });
+});
+
+
diff --git a/src/config.js b/src/config.js
index 871d9b47..e16365c5 100644
--- a/src/config.js
+++ b/src/config.js
@@ -60,3 +60,12 @@ exports.EDGE_ATTRIBUTES_USED_IN_RECORD_HASH = [
   "biolink:log_odds_ratio",
   "biolink:total_sample_size",
 ];
+
+// based on https://github.com/biolink/biolink-model/blob/master/infores_catalog.yaml
+exports.text_mining_api_infores = [
+  'infores:biothings-semmeddb',
+  'infores:scibite',
+  'infores:semmeddb',
+  'infores:text-mining-provider-cooccurrence',
+  'infores:text-mining-provider-targeted'
+];
diff --git a/src/results_assembly/query_results.js b/src/results_assembly/query_results.js
index 910cb697..40aff8b9 100644
--- a/src/results_assembly/query_results.js
+++ b/src/results_assembly/query_results.js
@@ -4,6 +4,7 @@ const LogEntry = require('../log_entry');
 const { getScores, calculateScore } = require('./score');
 const { Record } = require('@biothings-explorer/api-response-transform');
 const { enrichTrapiResultsWithPfocrFigures } = require('./pfocr');
+const config = require('../config');
 /**
  * @type { Record }
  */
@@ -171,8 +172,11 @@ module.exports = class TrapiResultsAssembler {
           outputQNodeID: record.object.qNodeID,
           inputPrimaryCurie: record.subject.curie,
           outputPrimaryCurie: record.object.curie,
-          inputUMLS: record.subject.UMLS, //add umls for scoring
-          outputUMLS: record.object.UMLS, //add umls for scoring
+          // info for scoring
+          inputUMLS: record.subject.UMLS,
+          outputUMLS: record.object.UMLS,
+          isTextMined: config.text_mining_api_infores.includes(record.apiInforesCurie),
+          // end info for scoring
           qEdgeID: qEdgeID,
           recordHash: record.recordHash,
         });
@@ -361,18 +365,23 @@ module.exports = class TrapiResultsAssembler {
       const consolidatedSolutionRecord = {
         inputQNodeID: solutionRecord_0.inputQNodeID,
         outputQNodeID: solutionRecord_0.outputQNodeID,
-        inputUMLS: solutionRecord_0.inputUMLS,
-        outputUMLS: solutionRecord_0.outputUMLS,
         inputPrimaryCuries: new Set(),
         outputPrimaryCuries: new Set(),
+        inputUMLS: new Set(),
+        outputUMLS: new Set(),
+        isTextMined: [],
         qEdgeID: solutionRecord_0.qEdgeID,
         recordHashes: new Set(),
       };
       solutionRecords.forEach(
-        ({ inputQNodeID, outputQNodeID, inputPrimaryCurie, outputPrimaryCurie, qEdgeID, recordHash }) => {
-          //debug(`inputQNodeID: ${inputQNodeID}, inputPrimaryCurie: ${inputPrimaryCurie}, outputQNodeID ${outputQNodeID}, outputPrimaryCurie: ${outputPrimaryCurie}`)
+        ({ inputQNodeID, outputQNodeID, inputPrimaryCurie, outputPrimaryCurie, inputUMLS, outputUMLS, isTextMined, qEdgeID, recordHash }) => {
           consolidatedSolutionRecord.inputPrimaryCuries.add(inputPrimaryCurie);
           consolidatedSolutionRecord.outputPrimaryCuries.add(outputPrimaryCurie);
+          consolidatedSolutionRecord.inputUMLS.add(...inputUMLS);
+          consolidatedSolutionRecord.outputUMLS.add(...outputUMLS);
+          if (!consolidatedSolutionRecord.recordHashes.has(recordHash)) {
+            consolidatedSolutionRecord.isTextMined.push(isTextMined);
+          }
           consolidatedSolutionRecord.recordHashes.add(recordHash);
         },
       );
diff --git a/src/results_assembly/score.js b/src/results_assembly/score.js
index 62be7041..cf26abed 100644
--- a/src/results_assembly/score.js
+++ b/src/results_assembly/score.js
@@ -2,8 +2,15 @@ const debug = require('debug')('bte:biothings-explorer-trapi:Score');
 const axios = require('axios');
 const _ = require('lodash');
 
-const tuning_param = 1.1;
+const tuning_param = 2.0;
+
+const record_weight = 1.0;
+const text_mined_record_weight = 0.5;
+const ngd_weight = 0.25;
+const LENGTH_PENALTY = 2.0;
+
+// create lookup table for ngd scores in the format: {inputUMLS-outputUMLS: ngd}
 
 async function query(queryPairs) {
   const url = 'https://biothings.ncats.io/semmeddb/query/ngd';
   const batchSize = 1000;
@@ -21,12 +28,13 @@ async function query(queryPairs) {
     //convert res array into single object with all curies
     let res = await Promise.all(axios_queries);
     res = res.map((r) => r.data.filter((combo) => Number.isFinite(combo.ngd))).flat(); // get numerical scores and flatten array
-    return res;
+    return res.reduce((acc, cur) => ({...acc, [`${cur.umls[0]}-${cur.umls[1]}`]: cur.ngd}), {});
   } catch (err) {
     debug('Failed to query for scores: ', err);
   }
 }
 
+// retrieve all ngd scores at once
 async function getScores(recordsByQEdgeID) {
   let pairs = {};

@@ -62,28 +70,9 @@ async function getScores(recordsByQEdgeID) {
   let results = await query(queries);
   debug('Combos no UMLS ID: ', combosWithoutIDs);
-  return results || []; // in case results is undefined, avoid TypeErrors
+  return results || {}; // in case results is undefined, avoid TypeErrors
 }
 
-// //multiply the inverses of the ngds together to get the total score for a combo
-// function calculateScore(comboInfo, scoreCombos) {
-//   let score = 1;
-
-//   Object.keys(comboInfo).forEach((edgeKey) => {
-//     let multiplier = 0;
-
-//     for (const combo of scoreCombos) {
-//       if (comboInfo[edgeKey].inputUMLS?.includes(combo.umls[0]) && comboInfo[edgeKey].outputUMLS?.includes(combo.umls[1])) {
-//         multiplier = Math.max(1/combo.ngd, multiplier);
-//       }
-//     }
-
-//     score *= multiplier;
-//   })
-
-//   return score;
-// }
-
 // sigmoid function scaled from 0 to 1
 function scaled_sigmoid(input) {
   const tuned_input = Math.max(input, 0) / tuning_param;
@@ -91,31 +80,81 @@ function scaled_sigmoid(input) {
   return sigmoid * 2 - 1;
 }
 
-function reverse_scaled_sigmoid(score) {
-  const unscaled_sigmoid = (score + 1) / 2;
-  const tuned_input = -Math.log(1 / unscaled_sigmoid - 1);
-  return tuned_input * tuning_param;
-}
-
-//addition of scores
 function calculateScore(comboInfo, scoreCombos) {
-  let score = 0.1;
+  const sum = array => array.reduce((a, b) => a + b, 0);
+  const average = array => array.length ? sum(array) / array.length : 0;
+
+  let score = 0;
   let scoredByNGD = false;
-  Object.keys(comboInfo).forEach((edgeKey) => {
-    score += 0.05 * comboInfo[edgeKey].recordHashes.size;
-    for (const combo of scoreCombos) {
-      if (
-        comboInfo[edgeKey].inputUMLS?.includes(combo.umls[0]) &&
-        comboInfo[edgeKey].outputUMLS?.includes(combo.umls[1])
-      ) {
-        score += 1 / combo.ngd;
+  let edgeScores = {};
+  let nodeDegrees = {};
+  let edgesStartingFromNode = {};
+  for (const [idx, edge] of comboInfo.entries()) {
+    // keep track of indegrees and outdegrees to find start and end nodes later
+    if (nodeDegrees.hasOwnProperty(edge.inputQNodeID)) {
+      nodeDegrees[edge.inputQNodeID].out += 1;
+    } else {
+      nodeDegrees[edge.inputQNodeID] = { in: 0, out: 1 };
+    }
+
+    if (nodeDegrees.hasOwnProperty(edge.outputQNodeID)) {
+      nodeDegrees[edge.outputQNodeID].in += 1;
+    } else {
+      nodeDegrees[edge.outputQNodeID] = { in: 1, out: 0 };
+    }
+
+    // track edge connections to find paths
+    if (edgesStartingFromNode.hasOwnProperty(edge.inputQNodeID)) {
+      edgesStartingFromNode[edge.inputQNodeID].push(idx);
+    } else {
+      edgesStartingFromNode[edge.inputQNodeID] = [idx];
+    }
+
+    let record_scores = edge.isTextMined.reduce((acc, val) => (
+      acc + (val ? text_mined_record_weight : record_weight)
+    ), 0);
+
+    // compute ngd score for node pair
+    pairs = [];
+    edge.inputUMLS.forEach((inputUMLS) => {
+      edge.outputUMLS.forEach((outputUMLS) => {
+        pairs.push(`${inputUMLS}-${outputUMLS}`);
+      });
+    });
+    ngd_scores = [];
+    pairs.forEach((pair) => {
+      if (scoreCombos.hasOwnProperty(pair)) {
+        ngd = scoreCombos[pair];
+        ngd_scores.push(1 / ngd);
         scoredByNGD = true;
       }
-    }
-  });
+    });
+
+    edgeScores[idx] = ngd_weight * average(ngd_scores) + record_scores;
+  }
+  //bfs to find paths
+  let startNode = Object.keys(nodeDegrees).find(node => nodeDegrees[node].in === 0);
+  let endNode = Object.keys(nodeDegrees).find(node => nodeDegrees[node].out === 0);
+
+  let queue = [[startNode, 0, 0]];
+
+  while (queue.length > 0) {
+    let node, path_score, path_length;
+    [node, path_score, path_length] = queue.shift();
+    if (node === endNode) {
+      score += path_score / Math.pow(path_length, LENGTH_PENALTY);
+    } else if (edgesStartingFromNode.hasOwnProperty(node)) {
+      for (let edgeIdx of edgesStartingFromNode[node]) {
+        queue.push([comboInfo[edgeIdx].outputQNodeID, path_score + edgeScores[edgeIdx], path_length + 1]);
+      }
+    }
+  }
   return { score: scaled_sigmoid(score), scoredByNGD };
 }
 
 module.exports.getScores = getScores;
 module.exports.calculateScore = calculateScore;
+module.exports.exportForTesting = {
+  record_weight, text_mined_record_weight, ngd_weight, LENGTH_PENALTY, scaled_sigmoid
+};

From ff671fcc7760c54d4c225dbc6a7c6fb11347bd71 Mon Sep 17 00:00:00 2001
From: Eric Zhou
Date: Wed, 2 Aug 2023 15:53:10 -0700
Subject: [PATCH 2/3] fix scoring logs

---
 src/inferred_mode/inferred_mode.js    | 19 -------------------
 src/results_assembly/query_results.js |  4 ++--
 2 files changed, 2 insertions(+), 21 deletions(-)

diff --git a/src/inferred_mode/inferred_mode.js b/src/inferred_mode/inferred_mode.js
index 875d22a0..0f6139b9 100644
--- a/src/inferred_mode/inferred_mode.js
+++ b/src/inferred_mode/inferred_mode.js
@@ -519,25 +519,6 @@ module.exports = class InferredQueryHandler {
       this.parent
         .getSummaryLog(combinedResponse, combinedResponse.logs, resultQueries)
         .forEach((log) => combinedResponse.logs.push(log));
-      let scoredResults = 0;
-      let unscoredResults = 0;
-      combinedResponse.message.results.forEach((result) => {
-        const scoreFromEdges = Object.values(result.analyses[0].edge_bindings).reduce((count, qEdge_bindings) => {
-          return count + qEdge_bindings.length;
-        }, 0);
-        if (result.analyses[0].score > scoreFromEdges) {
-          scoredResults += 1;
-        } else {
-          unscoredResults += 1;
-        }
-      });
-      combinedResponse.logs.push(
-        new LogEntry(
-          'INFO',
-          null,
-          `Scoring Summary: (${scoredResults}) scored / (${unscoredResults}) unscored`,
-        ).getLog(),
-      );
     }
 
     combinedResponse.logs = combinedResponse.logs.map((log) => log.toJSON());
diff --git a/src/results_assembly/query_results.js b/src/results_assembly/query_results.js
index 40aff8b9..b8d17826 100644
--- a/src/results_assembly/query_results.js
+++ b/src/results_assembly/query_results.js
@@ -452,12 +452,12 @@ module.exports = class TrapiResultsAssembler {
       debug('Error enriching with PFOCR figures: ', err);
       this.logs.push(new LogEntry('DEBUG', null, 'Error enriching with PFOCR figures: ', err).getLog());
     }
-    debug(`Successfully scored ${resultsWithScore} results, couldn't score ${resultsWithoutScore} results.`);
+    debug(`Scored ${resultsWithScore} results with NGD score, scored ${resultsWithoutScore} results without NGD.`);
     this.logs.push(
       new LogEntry(
         'DEBUG',
         null,
-        `Successfully scored ${resultsWithScore} results, couldn't score ${resultsWithoutScore} results.`,
+        `Scored ${resultsWithScore} results with NGD score, scored ${resultsWithoutScore} results without NGD.`,
         {
           type: 'scoring',
           scored: resultsWithScore,

From f3ca0aad9d7d6b02f540e42756a7b950d5580cd3 Mon Sep 17 00:00:00 2001
From: Eric Zhou
Date: Thu, 3 Aug 2023 15:12:28 -0700
Subject: [PATCH 3/3] add scores in inferred mode instead of taking max

---
 src/inferred_mode/inferred_mode.js | 4 ++--
 src/results_assembly/score.js      | 6 ++++++
 2 files changed, 8 insertions(+), 2 deletions(-)

diff --git a/src/inferred_mode/inferred_mode.js b/src/inferred_mode/inferred_mode.js
index 0f6139b9..454794f6 100644
--- a/src/inferred_mode/inferred_mode.js
+++ b/src/inferred_mode/inferred_mode.js
@@ -4,7 +4,7 @@ const utils = require('../utils');
 const async = require('async');
 const biolink = require('../biolink');
 const { getTemplates } = require('./template_lookup');
-const { addNormalizedScores } = require('../results_assembly/score');
+const { scaled_sigmoid, inverse_scaled_sigmoid } = require('../results_assembly/score');
 
 module.exports = class InferredQueryHandler {
   constructor(parent, TRAPIQueryHandler, queryGraph, logs, options, path, predicatePath, includeReasoner) {
@@ -304,7 +304,7 @@ module.exports = class InferredQueryHandler {
         const resScore = translatedResult.analyses[0].score;
         if (typeof combinedResponse.message.results[resultID].analyses[0].score !== 'undefined') {
           combinedResponse.message.results[resultID].analyses[0].score = resScore
-            ? Math.max(combinedResponse.message.results[resultID].analyses[0].score, resScore)
+            ? scaled_sigmoid(inverse_scaled_sigmoid(combinedResponse.message.results[resultID].analyses[0].score) + inverse_scaled_sigmoid(resScore))
             : combinedResponse.message.results[resultID].analyses[0].score;
         } else {
           combinedResponse.message.results[resultID].analyses[0].score = resScore;
diff --git a/src/results_assembly/score.js b/src/results_assembly/score.js
index cf26abed..d3f8780c 100644
--- a/src/results_assembly/score.js
+++ b/src/results_assembly/score.js
@@ -80,6 +80,10 @@ function scaled_sigmoid(input) {
   return sigmoid * 2 - 1;
 }
 
+function inverse_scaled_sigmoid(input) {
+  return -tuning_param * Math.log(2 / (input + 1) - 1);
+}
+
 function calculateScore(comboInfo, scoreCombos) {
   const sum = array => array.reduce((a, b) => a + b, 0);
   const average = array => array.length ? sum(array) / array.length : 0;
@@ -155,6 +159,8 @@ function calculateScore(comboInfo, scoreCombos) {
 
 module.exports.getScores = getScores;
 module.exports.calculateScore = calculateScore;
+module.exports.scaled_sigmoid = scaled_sigmoid;
+module.exports.inverse_scaled_sigmoid = inverse_scaled_sigmoid;
 module.exports.exportForTesting = {
   record_weight, text_mined_record_weight, ngd_weight, LENGTH_PENALTY, scaled_sigmoid
 };
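Note: patch 3 merges per-template result scores by adding them in pre-sigmoid space rather than taking the maximum. A minimal standalone sketch of that behavior, using the scaled_sigmoid / inverse_scaled_sigmoid definitions and tuning_param from score.js above; the example scores 0.3 and 0.4 are arbitrary and not taken from the patches:

// same shape as scaled_sigmoid in src/results_assembly/score.js: maps [0, Inf) onto [0, 1)
const tuning_param = 2.0;
const scaled_sigmoid = (input) => 2 / (1 + Math.exp(-Math.max(input, 0) / tuning_param)) - 1;

// inverse of the above for inputs >= 0, as added in patch 3
const inverse_scaled_sigmoid = (score) => -tuning_param * Math.log(2 / (score + 1) - 1);

// two results for the same binding, scored 0.3 and 0.4 by separate template queries:
// un-squash both scores, add them, and re-apply the sigmoid, so corroborating evidence
// pushes the combined score above either input while keeping it below 1
const combined = scaled_sigmoid(inverse_scaled_sigmoid(0.3) + inverse_scaled_sigmoid(0.4));
console.log(combined.toFixed(3)); // ~0.625, vs. 0.4 under the previous Math.max() approach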