Skip to content

Commit

Permalink
refactor(api): Optimize pipeline, close #42
Browse files Browse the repository at this point in the history
  • Loading branch information
annelhote committed Nov 27, 2023
1 parent 027d626 commit e1620f4
Show file tree
Hide file tree
Showing 3 changed files with 85 additions and 73 deletions.
82 changes: 49 additions & 33 deletions server/src/routes/works.routes.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
import express from 'express';

import { groupByAffiliations } from '../utils/utils';
import { deduplicateWorks, getFosmWorks, getOpenAlexPublications } from '../utils/works';
import { deduplicateWorks, getFosmWorks, getOpenAlexPublications, groupByAffiliations } from '../utils/works';

const router = new express.Router();

Expand All @@ -21,43 +20,60 @@ router.route('/works')
getFosmWorks({ options: { ...options, filter: { field: 'genre', value: 'dataset' } }, index: process.env.VITE_FOSM_DATASETS_INDEX }),
]);
console.timeEnd(`0. Requests ${options.affiliations}`);
console.time(`1. Filter ${options.affiliations}`);
const data = {};
data.publications = {
results: [
...responses[0].results.filter((result) => result.genre_raw !== 'dataset'),
...responses[1].results,
],
};
data.datasets = {
results: [
...responses[0].results.filter((result) => result.genre_raw === 'dataset'),
...responses[2].results,
],
};
console.timeEnd(`1. Filter ${options.affiliations}`);
console.time(`1. Concat ${options.affiliations}`);
const works = [
...responses[0],
...responses[1],
...responses[2],
];
console.timeEnd(`1. Concat ${options.affiliations}`);
console.time(`2. Dedup ${options.affiliations}`);
// Deduplicate publications by ids
data.publications.results = deduplicateWorks(data.publications.results);
const deduplicatedWorks = deduplicateWorks(works);
console.timeEnd(`2. Dedup ${options.affiliations}`);
// Group by affiliations
console.time(`3. GroupBy ${options.affiliations}`);
const uniqueAffiliations = groupByAffiliations({ options, works: deduplicatedWorks });
console.timeEnd(`3. GroupBy ${options.affiliations}`);
// Sort between publications and datasets
console.time(`4. Sort ${options.affiliations}`);
const publications = [];
const datasets = [];
deduplicatedWorks.forEach((deduplicatedWork) => {
if (
(deduplicatedWork.datasource.includes('fosm') && deduplicatedWork.genre_raw !== 'dataset')
|| (deduplicatedWork.datasource.includes('openalex') && deduplicatedWork.type !== 'dataset')
) {
publications.push(deduplicatedWork);
} else if (
(deduplicatedWork.datasource.includes('fosm') && deduplicatedWork.genre_raw === 'dataset')
|| (deduplicatedWork.datasource.includes('openalex') && deduplicatedWork.type === 'dataset')
) {
datasets.push(deduplicatedWork);
} else {
console.log(`Work not sort : ${JSON.stringify(deduplicatedWork)}`);
}
});
console.timeEnd(`4. Sort ${options.affiliations}`);
// Compute distinct types & years for facet
console.time(`3. Facet ${options.affiliations}`);
data.publications.years = [...new Set(
data.publications.results.filter((publication) => !!publication?.year).map((publication) => Number(publication.year)),
console.time(`5. Facet ${options.affiliations}`);
const publicationsYears = [...new Set(
publications.filter((publication) => !!publication?.year).map((publication) => Number(publication.year)),
)].sort((a, b) => b - a);
data.datasets.years = [...new Set(
data.datasets.results.filter((dataset) => !!dataset?.year).map((dataset) => Number(dataset.year)),
const datasetsYears = [...new Set(
datasets.filter((dataset) => !!dataset?.year).map((dataset) => Number(dataset.year)),
)].sort((a, b) => b - a);
data.publications.types = [...new Set(data.publications.results.map((publication) => publication?.type))];
data.datasets.types = [...new Set(data.datasets.results.map((dataset) => dataset?.type))];
console.timeEnd(`3. Facet ${options.affiliations}`);
// Group by affiliations
console.time(`4. GroupBy ${options.affiliations}`);
data.affiliations = groupByAffiliations({ ...data, options });
console.timeEnd(`4. GroupBy ${options.affiliations}`);
console.time(`5. Serialization ${options.affiliations}`);
res.status(200).json(data);
console.timeEnd(`5. Serialization ${options.affiliations}`);
const publicationsTypes = [...new Set(publications.map((publication) => publication?.type))];
const datasetsTypes = [...new Set(datasets.map((dataset) => dataset?.type))];
console.timeEnd(`5. Facet ${options.affiliations}`);
// Build and serialize response
console.time(`6. Serialization ${options.affiliations}`);
res.status(200).json({
affiliations: uniqueAffiliations,
datasets: { results: datasets, types: datasetsTypes, years: datasetsYears },
publications: { results: publications, types: publicationsTypes, years: publicationsYears },
});
console.timeEnd(`6. Serialization ${options.affiliations}`);
}
} catch (err) {
console.error(err);
Expand Down
34 changes: 2 additions & 32 deletions server/src/utils/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -39,39 +39,9 @@ const normalizedName = (name) => name
.replace(/\s+/g, ' ')
.trim();

/**
 * Build the list of distinct affiliations mentioned by datasets and publications.
 *
 * Affiliations are considered identical when `normalizedName` returns the same
 * string for them; the first raw spelling encountered is kept as `name`.
 *
 * @param {object} params
 * @param {{ results: object[] }} params.datasets - Datasets; each result may carry `affiliations` (strings) and an `id`.
 * @param {object} params.options - Forwarded to `getRegexpFromOptions` to build the matching regexp.
 * @param {{ results: object[] }} params.publications - Publications, same shape as datasets.
 * @returns {object[]} One entry per distinct affiliation with:
 *   `id` (index as string), `matches` (number of distinct normalized regexp matches
 *   in the name), `name`, `nameHtml` (name with matches wrapped in `<b>`),
 *   `works` (unique work ids) and `worksNumber` (count of unique work ids).
 */
const groupByAffiliations = ({ datasets, options, publications }) => {
  const regexp = getRegexpFromOptions(options);
  // Null-prototype dictionary: keys come from external data, so a name such as
  // "constructor" must not resolve to an inherited Object.prototype member
  // (with a plain `{}` it did, and the subsequent `.works.push` threw).
  const affiliationsByName = Object.create(null);
  [...datasets.results, ...publications.results].forEach((work) => {
    (work?.affiliations ?? []).forEach((affiliation) => {
      const key = normalizedName(affiliation);
      if (!affiliationsByName[key]) {
        // Count distinct matches once normalized (e.g. case variants count once)
        const distinctMatches = new Set(
          (affiliation.match(regexp) ?? []).map((match) => normalizedName(match)),
        );
        affiliationsByName[key] = {
          matches: distinctMatches.size,
          name: affiliation,
          // NOTE(review): the raw name is inserted without HTML escaping — confirm consumers render it safely
          nameHtml: affiliation.replace(regexp, '<b>$&</b>'),
          // A Set keeps work ids unique as they are collected
          works: new Set(),
        };
      }
      affiliationsByName[key].works.add(work.id);
    });
  });

  return Object.values(affiliationsByName).map((affiliation, index) => {
    const uniqueWorks = [...affiliation.works];
    return {
      ...affiliation,
      id: index.toString(),
      works: uniqueWorks,
      worksNumber: uniqueWorks.length,
    };
  });
};

export {
cleanId,
groupByAffiliations,
getRegexpFromOptions,
normalizedName,
range,
};
42 changes: 34 additions & 8 deletions server/src/utils/works.js
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
import { cleanId, range } from './utils';
import { cleanId, getRegexpFromOptions, normalizedName, range } from './utils';

const VITE_OPENALEX_MAX_PAGE = Math.floor(process.env.VITE_OPENALEX_SIZE / process.env.VITE_OPENALEX_PER_PAGE);

Expand Down Expand Up @@ -114,10 +114,7 @@ const getFosmWorks = async ({ options }) => {
const years = range(startYear, endYear);
const promises = years.map((year) => getFosmWorksByYear({ options: { ...options, year } }));
const allResults = await Promise.all(promises);
return ({
datasource: 'fosm',
results: allResults.flat(),
});
return allResults.flat();
};

const getTypeFromOpenAlex = (type) => {
Expand Down Expand Up @@ -210,14 +207,43 @@ const getOpenAlexPublications = async ({ options }) => {
const years = range(startYear, endYear);
const promises = years.map((year) => getOpenAlexPublicationsByYear({ ...options, year }));
const allResults = await Promise.all(promises);
return ({
datasource: 'openalex',
results: allResults.flat(),
return allResults.flat();
};

/**
 * Build the list of distinct affiliations mentioned by a set of works.
 *
 * Affiliations are considered identical when `normalizedName` returns the same
 * string for them; the first raw spelling encountered is kept as `name`.
 *
 * @param {object} params
 * @param {object} params.options - Forwarded to `getRegexpFromOptions` to build the matching regexp.
 * @param {object[]} params.works - Works; each may carry `affiliations` (strings) and an `id`.
 * @returns {object[]} One entry per distinct affiliation with:
 *   `id` (index as string), `matches` (number of distinct normalized regexp matches
 *   in the name), `name`, `nameHtml` (name with matches wrapped in `<b>`),
 *   `works` (unique work ids) and `worksNumber` (count of unique work ids).
 */
const groupByAffiliations = ({ options, works }) => {
  const regexp = getRegexpFromOptions(options);
  // Null-prototype dictionary: keys come from external data, so a name such as
  // "constructor" must not resolve to an inherited Object.prototype member
  // (with a plain `{}` it did, and the subsequent `.works.push` threw).
  const affiliationsByName = Object.create(null);
  works.forEach((work) => {
    (work?.affiliations ?? []).forEach((affiliation) => {
      const key = normalizedName(affiliation);
      if (!affiliationsByName[key]) {
        // Count distinct matches once normalized (e.g. case variants count once)
        const distinctMatches = new Set(
          (affiliation.match(regexp) ?? []).map((match) => normalizedName(match)),
        );
        affiliationsByName[key] = {
          matches: distinctMatches.size,
          name: affiliation,
          // NOTE(review): the raw name is inserted without HTML escaping — confirm consumers render it safely
          nameHtml: affiliation.replace(regexp, '<b>$&</b>'),
          // A Set keeps work ids unique as they are collected
          works: new Set(),
        };
      }
      affiliationsByName[key].works.add(work.id);
    });
  });

  return Object.values(affiliationsByName).map((affiliation, index) => {
    const uniqueWorks = [...affiliation.works];
    return {
      ...affiliation,
      id: index.toString(),
      works: uniqueWorks,
      worksNumber: uniqueWorks.length,
    };
  });
};

export {
deduplicateWorks,
getFosmWorks,
getOpenAlexPublications,
groupByAffiliations,
};

0 comments on commit e1620f4

Please sign in to comment.