Skip to content

Commit

Permalink
feat(affiliations): Create a dedicated endpoint to improve affiliations
Browse files Browse the repository at this point in the history
The new endpoint returns affiliations only — no publications and no datasets are included in the response.
  • Loading branch information
annelhote committed Dec 24, 2024
1 parent 16df7d7 commit 0d030eb
Show file tree
Hide file tree
Showing 6 changed files with 225 additions and 8 deletions.
6 changes: 3 additions & 3 deletions client/src/pages/openalex-affiliations/results/index.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,7 @@ import getFlagEmoji from '../../../utils/flags';
import { getRorData, isRor } from '../../../utils/ror';
import { normalize, removeDiacritics } from '../../../utils/strings';
import { getTagColor } from '../../../utils/tags';
import { getWorks } from '../../../utils/works';
import { getAffiliations } from '../../../utils/works';
import ExportErrorsButton from '../components/export-errors-button';
import SendFeedbackButton from '../components/send-feedback-button';
import ListView from './list-view';
Expand Down Expand Up @@ -83,9 +83,9 @@ export default function Affiliations() {
];

const { data, error, isFetched, isFetching, refetch } = useQuery({
queryKey: ['works', JSON.stringify(options)],
queryKey: ['affiliations', JSON.stringify(options)],
// Search for works from affiliations for each affiliation strictly longer than 2 letters
queryFn: () => getWorks(
queryFn: () => getAffiliations(
{
...options,
affiliationStrings: options.affiliations
Expand Down
36 changes: 35 additions & 1 deletion client/src/utils/works.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,40 @@ const timeout = (time) => {
return controller;
};

/**
 * Fetch pre-computed affiliations from the dedicated `/affiliations` endpoint.
 *
 * @param {object} body - Search options (affiliationStrings, rors, years, ...)
 *   forwarded as-is to the API.
 * @param {Function} toast - Toast notifier used to surface "too many results" warnings.
 * @returns {Promise<{affiliations: Array, warnings: object}>} Decompressed
 *   affiliations plus the raw warnings object from the API.
 * @throws {Error} If the API responds with a non-2xx status.
 */
const getAffiliations = async (body, toast) => {
  const response = await fetch(`${VITE_API}/affiliations`, {
    body: JSON.stringify(body),
    headers: { 'Content-Type': 'application/json' },
    method: 'POST',
    // 1200 presumably seconds, i.e. 20 minutes — confirm against the timeout() helper
    signal: timeout(1200).signal,
  });
  if (!response.ok) {
    // Fixed message: previously said "for works" (copy/paste from getWorks)
    throw new Error('Oops... FOSM API request did not work for affiliations !');
  }
  const { affiliations, warnings } = await response.json();
  // The server sends gzipped, base64-encoded chunks; inflate them back into objects
  const resAffiliations = await unzipAll(affiliations);
  let warningMessage = '';
  if (warnings?.isMaxFosmReached) {
    warningMessage = warningMessage.concat(
      `More than ${warnings.maxFosmValue} publications found in French OSM, only the first ${warnings.maxFosmValue} were retrieved.\n`,
    );
  }
  if (warnings?.isMaxOpenalexReached) {
    warningMessage = warningMessage.concat(
      `More than ${warnings.maxOpenalexValue} publications found in OpenAlex, only the first ${warnings.maxOpenalexValue} were retrieved.\n`,
    );
  }
  if (warningMessage) {
    toast({
      description: warningMessage,
      id: 'tooManyPublications',
      title: 'Too Many publications found',
      toastType: 'error',
    });
  }
  return { affiliations: resAffiliations, warnings };
};

const getIdLink = (type, id) => {
let prefix = null;
switch (type) {
Expand Down Expand Up @@ -78,7 +112,6 @@ const getMentions = async (options) => {
};

const getWorks = async (body, toast) => {
// TODO: Replace by useQuery
const response = await fetch(`${VITE_API}/works`, {
body: JSON.stringify(body),
headers: { 'Content-Type': 'application/json' },
Expand Down Expand Up @@ -160,6 +193,7 @@ const renderButtonDataset = (selected, fn, label, icon) => (
);

export {
getAffiliations,
getIdLink,
getMentions,
getWorks,
Expand Down
2 changes: 2 additions & 0 deletions server/src/router.js
Original file line number Diff line number Diff line change
@@ -1,11 +1,13 @@
import express from 'express';

import affiliationsRouter from './routes/affiliations.routes';
import filesRouter from './routes/files.routes';
import mentionsRouter from './routes/mentions.routes';
import worksRouter from './routes/works.routes';

const router = new express.Router();

router.use(affiliationsRouter);
router.use(filesRouter);
router.use(mentionsRouter);
router.use(worksRouter);
Expand Down
182 changes: 182 additions & 0 deletions server/src/routes/affiliations.routes.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,182 @@
import crypto from 'crypto';
import express from 'express';

import { getInstitutionIdFromRor } from '../utils/openalex';
import { getCache, saveCache } from '../utils/s3';
import { chunkArray, countUniqueValues, range } from '../utils/utils';
import {
datasetsType,
deduplicateWorks,
getFosmWorks,
getOpenAlexPublications,
groupByAffiliations,
} from '../utils/works';

// Upper bound (exclusive) for the random component appended to each query id.
const SEED_MAX = 2048;
// Toggle S3-backed result caching (see getCache / saveCache below).
const USE_CACHE = true;

const router = new express.Router();

/**
 * Encode an ArrayBuffer as a base64 string.
 *
 * Replaces the previous byte-by-byte String.fromCharCode loop + btoa, which
 * was O(n^2) on string concatenation and used a legacy web shim in Node;
 * Buffer does the same conversion natively in O(n).
 *
 * @param {ArrayBuffer} buffer - Raw bytes to encode.
 * @returns {string} Base64-encoded representation of the buffer.
 */
const arrayBufferToBase64 = (buffer) => Buffer.from(buffer).toString('base64');

/**
 * Serialize a JS value to JSON, gzip it, and return the result base64-encoded.
 *
 * Simplified: reads the compressed stream straight into an ArrayBuffer via
 * Response.arrayBuffer() (no intermediate Blob), and base64-encodes with the
 * native Buffer API instead of a manual byte loop.
 *
 * @param {*} result - Any JSON-serializable value.
 * @returns {Promise<string>} Base64 string of the gzipped JSON.
 */
const compressData = async (result) => {
  // JSON -> ReadableStream
  const jsonStream = new Blob([JSON.stringify(result)], {
    type: 'application/json',
  }).stream();
  // Pipe through the web-standard gzip compressor
  const gzipped = jsonStream.pipeThrough(new CompressionStream('gzip'));
  // Drain the compressed stream into a single buffer
  const buffer = await new Response(gzipped).arrayBuffer();
  // ArrayBuffer -> base64
  return Buffer.from(buffer).toString('base64');
};

/**
 * Split `data` into 1000-item chunks and gzip+base64 each chunk in parallel.
 *
 * @param {Array} data - Items to serialize.
 * @returns {Promise<string[]>} One base64-encoded gzip blob per chunk.
 */
const chunkAndCompress = (data) => {
  const chunks = chunkArray({ array: data, perChunk: 1000 });
  return Promise.all(chunks.map((chunk) => compressData(chunk)));
};

/**
 * Retrieve, deduplicate, and group works by affiliation for the given search
 * options, with S3-backed caching of the compressed result.
 *
 * Pipeline: cache lookup -> FOSM + OpenAlex queries (chunked) -> dedup ->
 * group by affiliation -> gzip/base64 serialization -> cache save.
 *
 * @param {object} params
 * @param {object} params.options - Search options; NOTE: mutated in place
 *   (years and openAlexExclusions are added — see eslint-disable lines).
 * @param {boolean} [params.resetCache=false] - When true, ignore any cached
 *   result and recompute.
 * @returns {Promise<object>} { affiliations (compressed chunks),
 *   extractionDate, warnings }.
 */
const getWorks = async ({ options, resetCache = false }) => {
  // Cache key: SHA-1 of the options plus a type tag, so 'affiliations'
  // results never collide with the 'works' route's cache entries.
  const shasum = crypto.createHash('sha1');
  shasum.update(JSON.stringify({ ...options, type: 'affiliations' }));
  const searchId = shasum.digest('hex');
  const start = new Date();
  // Human-readable id used to correlate the numbered console.time logs below.
  const queryId = start
    .toISOString()
    .concat(' - ', Math.floor(Math.random() * SEED_MAX).toString());
  let cache = false;
  if (USE_CACHE) {
    console.time(
      `0. Query ${queryId} | Retrieve cache if exists ${options.affiliationStrings}`,
    );
    cache = await getCache({ searchId });
    console.timeEnd(
      `0. Query ${queryId} | Retrieve cache if exists ${options.affiliationStrings}`,
    );
  }
  // Serve the cached result unless the caller explicitly asked for a refresh.
  if (cache && !resetCache) {
    const extractionDate = new Date(cache.extractionDate);
    console.log(
      `0. Query ${queryId} | Returning cached data from ${extractionDate}`,
    );
    return cache;
  }
  console.time(`1. Query ${queryId} | Requests ${options.affiliationStrings}`);
  // eslint-disable-next-line no-param-reassign
  options.years = range(options.startYear, options.endYear);
  // Resolve RoR ids to OpenAlex institution ids for exclusion filters.
  // eslint-disable-next-line no-param-reassign
  options.openAlexExclusions = await Promise.all(
    options.rorExclusions.map((ror) => getInstitutionIdFromRor(ror)),
  );
  // queries[0] is always the FOSM request — the warning checks below rely
  // on this ordering (responses[0] = FOSM, responses[1] = first OpenAlex).
  const queries = [];
  queries.push(getFosmWorks({ options }));
  const affiliationStringsChunks = chunkArray({
    array: options.affiliationStrings,
  });
  const rorsChunks = chunkArray({ array: options.rors });
  // Separate RoRs from Affiliations strings to query OpenAlex
  affiliationStringsChunks.forEach((affiliationStrings) => {
    queries.push(
      getOpenAlexPublications({
        options: { ...options, affiliationStrings, rors: [] },
      }),
    );
  });
  rorsChunks.forEach((rors) => {
    queries.push(
      getOpenAlexPublications({
        options: { ...options, rors, affiliationStrings: [] },
      }),
    );
  });
  const responses = await Promise.all(queries);
  // Flag truncated result sets so the client can warn the user.
  const warnings = {};
  const MAX_FOSM = Number(process.env.ES_MAX_SIZE);
  if (MAX_FOSM > 0 && responses.length > 0 && responses[0].length >= MAX_FOSM) {
    warnings.isMaxFosmReached = true;
    warnings.maxFosmValue = MAX_FOSM;
  }
  const MAX_OPENALEX = Number(process.env.OPENALEX_MAX_SIZE);
  if (
    MAX_OPENALEX > 0
    && responses.length > 1
    && responses[1].length >= MAX_OPENALEX
  ) {
    warnings.isMaxOpenalexReached = true;
    warnings.maxOpenalexValue = MAX_OPENALEX;
  }
  console.timeEnd(
    `1. Query ${queryId} | Requests ${options.affiliationStrings}`,
  );
  const works = responses.flat();
  console.time(`2. Query ${queryId} | Dedup ${options.affiliationStrings}`);
  // Deduplicate publications by ids
  const deduplicatedWorks = deduplicateWorks(works);
  console.timeEnd(`2. Query ${queryId} | Dedup ${options.affiliationStrings}`);
  // Compute distinct affiliations of works
  console.time(`3. Query ${queryId} | GroupBy ${options.affiliationStrings}`);
  const uniqueAffiliations = groupByAffiliations({
    options,
    works: deduplicatedWorks,
  });
  console.timeEnd(
    `3. Query ${queryId} | GroupBy ${options.affiliationStrings}`,
  );
  // Build and serialize response
  console.time(
    `4. Query ${queryId} | Serialization ${options.affiliationStrings}`,
  );
  const affiliations = await chunkAndCompress(uniqueAffiliations);
  console.log(
    'serialization',
    `${uniqueAffiliations.length} affiliations serialized`,
  );
  const result = {
    affiliations,
    extractionDate: Date.now(),
    warnings,
  };
  console.timeEnd(
    `4. Query ${queryId} | Serialization ${options.affiliationStrings}`,
  );
  console.time(
    `5. Query ${queryId} | Save cache ${options.affiliationStrings}`,
  );
  await saveCache({ queryId, result, searchId });
  console.timeEnd(
    `5. Query ${queryId} | Save cache ${options.affiliationStrings}`,
  );
  return result;
};

/**
 * POST /affiliations
 * Body: search options. Requires at least one affiliation string or RoR.
 * Responds 200 with the compressed result, 400 on missing inputs, 500 on error.
 */
router.route('/affiliations').post(async (req, res) => {
  try {
    const options = req?.body ?? {};
    // Fixed: the previous truthiness check (`!options?.affiliationStrings`)
    // let EMPTY arrays through ([] is truthy), contradicting the error
    // message below. Check lengths so both missing and empty inputs are
    // rejected.
    const hasAffiliationStrings = (options.affiliationStrings?.length ?? 0) > 0;
    const hasRors = (options.rors?.length ?? 0) > 0;
    if (!hasAffiliationStrings && !hasRors) {
      res.status(400).json({
        message: 'You must provide at least one affiliation string or RoR.',
      });
    } else {
      const compressedResult = await getWorks({ options });
      res.status(200).json(compressedResult);
    }
  } catch (err) {
    console.error(err);
    res.status(500).json({ message: 'Internal Server Error.' });
  }
});

export default router;
5 changes: 2 additions & 3 deletions server/src/routes/files.routes.js
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,8 @@ const router = new express.Router();
const key = process.env.OS_PASSWORD;
const projectName = process.env.OS_PROJECT_NAME;
const projectId = process.env.OS_TENANT_ID;
const tenantName = process.env.OS_TENANT_NAME;
const username = process.env.OS_USERNAME;
const user = `${tenantName}:${username}`;

const user = `${process.env.OS_TENANT_NAME}:${process.env.OS_USERNAME}`;

router.route('/download')
.get(async (_, res) => {
Expand Down
2 changes: 1 addition & 1 deletion server/src/routes/works.routes.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,7 @@ const chunkAndCompress = (data) => {

const getWorks = async ({ options, resetCache = false }) => {
const shasum = crypto.createHash('sha1');
shasum.update(JSON.stringify(options));
shasum.update(JSON.stringify({ ...options, type: 'works' }));
const searchId = shasum.digest('hex');
const start = new Date();
const queryId = start
Expand Down

0 comments on commit 0d030eb

Please sign in to comment.