Skip to content

Commit

Permalink
Some refactoring, implemented base cell summary similarities and rui …
Browse files Browse the repository at this point in the history
…location distances. WIP.
  • Loading branch information
bherr2 committed Sep 5, 2023
1 parent bdba1b4 commit 2a289f8
Show file tree
Hide file tree
Showing 13 changed files with 785,616 additions and 45 deletions.
96 changes: 96 additions & 0 deletions data-processor/package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions data-processor/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
"fetch-sparql-endpoint": "^3.3.3",
"glob": "^10.3.0",
"jsonld": "^8.2.0",
"mathjs": "^11.10.1",
"papaparse": "^5.4.1",
"shelljs": "^0.8.5"
}
Expand Down
43 changes: 0 additions & 43 deletions data-processor/src/ctpop-db.js

This file was deleted.

21 changes: 21 additions & 0 deletions data-processor/src/gen-cell-summary-similarities.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
import { readFileSync, writeFileSync } from 'fs';
import { getAllCellSummarySimilarities } from './utils/cell-summary-similarity.js';

// Destination for the computed pairwise cell summary similarities
const OUTPUT = '../data/cell-summary-similarities.jsonld';

// Read each cell summary file and gather all of their '@graph' entries into one flat list
const allSummaries = [
  '../data/dataset-cell-summaries.jsonld',
  '../data/as-cell-summaries.jsonld',
  '../data/rui-location-cell-summaries.jsonld',
].flatMap((path) => JSON.parse(readFileSync(path))['@graph']);

// Exhaust the generator so every similarity record can be serialized at once
const results = Array.from(getAllCellSummarySimilarities(allSummaries));

// Write out the cell-summary-similarities.jsonld file: the contents of
// ccf-context.jsonld with the similarity records placed under '@graph'
const context = JSON.parse(readFileSync('ccf-context.jsonld'));
const jsonld = { ...context, '@graph': results };
writeFileSync(OUTPUT, JSON.stringify(jsonld, null, 2));
2 changes: 1 addition & 1 deletion data-processor/src/gen-registrations.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
// Requires Node v18+ (for fetch support)
import { readFileSync, writeFileSync } from 'fs';
import Papa from 'papaparse';
import { getHbmToUuidLookup } from './hubmap-uuid-lookup.js';
import { getHbmToUuidLookup } from './utils/hubmap-uuid-lookup.js';

const CSV_URL =
'https://docs.google.com/spreadsheets/d/1cwxztPg9sLq0ASjJ5bntivUk6dSKHsVyR1bE6bXvMkY/export?format=csv&gid=1529271254';
Expand Down
2 changes: 1 addition & 1 deletion data-processor/src/gen-reports.js
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@ import { readFileSync, writeFileSync } from 'fs';
import { globSync } from 'glob';
import { basename } from 'path';
import sh from 'shelljs';
import { selectCsvRemote } from './sparql.js';
import { selectCsvRemote } from './utils/sparql.js';

// SPARQL endpoint with ctpop data loaded
//const SPARQL_ENDPOINT = 'https://api.triplydb.com/datasets/bherr/ctpop/services/ctpop/sparql';
Expand Down
25 changes: 25 additions & 0 deletions data-processor/src/gen-rui-location-distances.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,25 @@
import { readFileSync, writeFileSync } from 'fs';
import { getAllSpatialEntityDistances } from './utils/spatial-entity-distance.js';

// Destination for the computed distances between rui locations
const OUTPUT = '../data/rui-location-distances.jsonld';
const donors = JSON.parse(readFileSync('../data/rui_locations.jsonld'))['@graph'];

// Collect the rui_location of every sample block across all donors
const ruiLocations = donors.flatMap((donor) => donor['samples'].map((block) => block.rui_location));

// Consume every distance record produced for the collected rui locations
const results = [];
for await (const distance of getAllSpatialEntityDistances(ruiLocations)) {
  results.push(distance);
}

// Write out the rui-location-distances.jsonld file: the contents of
// ccf-context.jsonld with the distance records placed under '@graph'
const context = JSON.parse(readFileSync('ccf-context.jsonld'));
const jsonld = { ...context, '@graph': results };
writeFileSync(OUTPUT, JSON.stringify(jsonld, null, 2));
61 changes: 61 additions & 0 deletions data-processor/src/utils/cell-summary-similarity.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
import { dot, norm } from 'mathjs';

/**
 * A function to return a cosine sim for two vectors
 *
 * A vector with zero magnitude has no direction, so the similarity is reported
 * as 0 instead of the NaN that dividing 0 by 0 would otherwise produce.
 *
 * @param {number[]} a
 * @param {number[]} b
 * @returns {number} cosine similarity between a and b, or 0 when either vector has zero magnitude
 */
function cosineSim(a, b) {
  const magnitude = norm(a) * norm(b);
  // Guard the division: a zero denominator would yield NaN
  if (magnitude === 0) {
    return 0;
  }
  return dot(a, b) / magnitude;
}

/**
 * Compare two cell-id keyed value maps using cosine similarity.
 *
 * Both maps are expanded into vectors over the union of their keys (keys of
 * cellsA first, followed by keys found only in cellsB); a key missing from a
 * map contributes 0 to that map's vector.
 *
 * @param {Object} cellsA values keyed by cell id
 * @param {Object} cellsB values keyed by cell id
 * @returns the cosineSim of the two vectors, or 0 when the maps share no key
 */
function getCellCountsSimilarity(cellsA, cellsB) {
  const idsA = Object.keys(cellsA);
  const idsB = Object.keys(cellsB);
  const seenInA = new Set(idsA);

  // Only compute cosine sim if there is at least one shared key
  if (!idsB.some((id) => seenInA.has(id))) {
    return 0;
  }

  const allIds = [...new Set([...idsA, ...idsB])];
  const toVector = (cells) => allIds.map((id) => cells[id] ?? 0);
  return cosineSim(toVector(cellsA), toVector(cellsB));
}

/**
 * Build a lookup from cell_id to percentage out of the rows in summary['summary'].
 * When the same cell_id appears in several rows, the last row wins.
 *
 * @param {Object} summary object whose 'summary' property is an array of rows
 * @returns {Object} percentages keyed by cell_id
 */
function getCellDistribution(summary) {
  const distribution = {};
  summary['summary'].forEach((row) => {
    distribution[row['cell_id']] = row['percentage'];
  });
  return distribution;
}

/**
 * Compute the similarity between two cell summaries by converting each one
 * with getCellDistribution and comparing the results with getCellCountsSimilarity.
 *
 * @param {Object} summaryA first cell summary
 * @param {Object} summaryB second cell summary
 * @returns the value returned by getCellCountsSimilarity for the two distributions
 */
export function getCellSummarySimilarity(summaryA, summaryB) {
  return getCellCountsSimilarity(getCellDistribution(summaryA), getCellDistribution(summaryB));
}

/**
 * Lazily yield a CellSummarySimilarity record for every unordered pair of
 * summaries whose similarity is strictly greater than minSimilarity.
 *
 * Pairs are visited in order: each summary is compared against every summary
 * that follows it in the array.
 *
 * @param {Object[]} summaries cell summaries; 'cell_source' is read from each
 * @param {number} [minSimilarity=0] exclusive lower bound a pair must exceed to be yielded
 * @yields {Object} record with '@type', cell_source_a, cell_source_b and similarity
 */
export function* getAllCellSummarySimilarities(summaries, minSimilarity = 0) {
  for (const [index, summaryA] of summaries.entries()) {
    for (const summaryB of summaries.slice(index + 1)) {
      const similarity = getCellSummarySimilarity(summaryA, summaryB);
      // Keep the `>` form: a NaN similarity must fail this test and be skipped
      if (similarity > minSimilarity) {
        yield {
          '@type': 'CellSummarySimilarity',
          cell_source_a: summaryA['cell_source'],
          cell_source_b: summaryB['cell_source'],
          similarity,
        };
      }
    }
  }
}
File renamed without changes.
File renamed without changes.
Loading

0 comments on commit 2a289f8

Please sign in to comment.