From 029571dc222388b605d7690118fae84786546396 Mon Sep 17 00:00:00 2001 From: eric Date: Wed, 21 Feb 2024 11:25:37 +0100 Subject: [PATCH] search for rors in FOSM --- client/src/pages/affiliationsTab.jsx | 6 +++-- client/src/pages/datasetsTab.jsx | 2 +- client/src/pages/filters.jsx | 29 ++++++++++++----------- client/src/pages/index.jsx | 4 ++-- client/src/pages/publicationsTab.jsx | 2 +- client/src/utils/ror.jsx | 8 +++---- client/src/utils/templates.jsx | 2 +- server/src/routes/works.routes.js | 35 +++++++++++++++------------- server/src/utils/works.js | 13 +++++++---- 9 files changed, 56 insertions(+), 45 deletions(-) diff --git a/client/src/pages/affiliationsTab.jsx b/client/src/pages/affiliationsTab.jsx index 932a86ca..e671041f 100644 --- a/client/src/pages/affiliationsTab.jsx +++ b/client/src/pages/affiliationsTab.jsx @@ -16,7 +16,7 @@ import { normalizeName, renderButtons } from '../utils/works'; export default function AffiliationsTab({ affiliations, selectedAffiliations, setSelectedAffiliations, tagAffiliations }) { const [filteredAffiliations, setFilteredAffiliations] = useState([]); const [filteredAffiliationName, setFilteredAffiliationName] = useState(''); - const [filteredStatus, setFilteredStatus] = useState([status.tobedecided.id]); + const [filteredStatus, setFilteredStatus] = useState([status.tobedecided.id, status.validated.id, status.excluded.id]); const [timer, setTimer] = useState(); useEffect(() => { @@ -70,7 +70,8 @@ export default function AffiliationsTab({ affiliations, selectedAffiliations, se - + { /* + + */ } datasource.key)); - const [filteredStatus, setFilteredStatus] = useState([status.validated.id]); + const [filteredStatus, setFilteredStatus] = useState([status.tobedecided.id, status.validated.id, status.excluded.id]); const [filteredPublishers, setFilteredPublishers] = useState([]); const [filteredTypes, setFilteredTypes] = useState([]); const [filteredYears, setFilteredYears] = useState([]); diff --git a/client/src/pages/filters.jsx b/client/src/pages/filters.jsx index fd27e8a9..636f359d 100644 --- a/client/src/pages/filters.jsx +++ b/client/src/pages/filters.jsx @@ -9,13 +9,15 @@ import { import PropTypes from 'prop-types'; import { useEffect, useState } from 'react'; import { useSearchParams } from 'react-router-dom'; -import { getRorNames } from '../utils/ror'; +import { getRorData, isRor } from '../utils/ror'; import TagInput from '../components/tag-input'; const START_YEAR = 2010; const years = [...Array(new Date().getFullYear() - START_YEAR + 1).keys()].map((year) => (year + START_YEAR).toString()).map((year) => ({ label: year, value: year })); +const normalizeStr = (x) => x.replaceAll(',', ' ').replaceAll(' ', ' '); + export default function Filters({ sendQuery }) { const [searchParams, setSearchParams] = useSearchParams(); const [currentSearchParams, setCurrentSearchParams] = useState({}); @@ -42,33 +44,31 @@ export default function Filters({ sendQuery }) { startYear: searchParams.get('startYear'), }); const affiliations = searchParams.getAll('affiliations'); - const queries = affiliations.map((affiliation) => getRorNames(affiliation)); + const queries = affiliations.map((affiliation) => getRorData(affiliation)); const rorNames = await Promise.all(queries); const allTags = []; const knownTags = {}; affiliations.forEach((affiliation) => { - allTags.push({ label: affiliation, source: 'user' }); + if (isRor(affiliation)) { + allTags.push({ label: affiliation.replace('https://ror.org/', '').replace('ror.org/', ''), source: 'user', type: 'rorId' }); + } else { + allTags.push({ label: affiliation, source: 'user', type: 'affiliationString' }); + } knownTags[affiliation.toLowerCase()] = 1; }); rorNames.flat().forEach((rorElt) => { if (knownTags[rorElt.rorId.toLowerCase()] === undefined) { - allTags.push({ label: rorElt.rorId, source: 'rorId' }); + allTags.push({ label: rorElt.rorId, source: 'ror', type: 'rorId' }); knownTags[rorElt.rorId.toLowerCase()] = 1; } rorElt.names.forEach((rorName) => { if (knownTags[rorName.toLowerCase()] === undefined) { - allTags.push({ label: rorName, source: 'rorName' }); + const isDangerous = rorName.length < 4; + allTags.push({ label: rorName, source: 'ror', type: 'affiliationString', rorId: rorElt.rorId, isDangerous }); knownTags[rorName.toLowerCase()] = 1; } }); }); - // let allTags = [ - // ...affiliations.map((affiliation) => ({ label: affiliation, source: 'user' })), - // ...rorNames.flat().map((name) => ({ label: name, source: 'ror' })), - // ]; - // Remove duplicates - // allTags = [...new Map(allTags.reverse().map((v) => [v.label.toLowerCase(), v])).values()].reverse(); - console.log('allTags', allTags); setTags(allTags); } }; @@ -92,7 +92,10 @@ export default function Filters({ sendQuery }) { } setMessageType(''); setMessage(''); - sendQuery(currentSearchParams); + const queryParams = { datasets: currentSearchParams.datasets, startYear: currentSearchParams.startYear, endYear: currentSearchParams.endYear }; + queryParams.affiliationStrings = tags.filter((tag) => tag.type === 'affiliationString').map((tag) => normalizeStr(tag.label)); + queryParams.rors = tags.filter((tag) => tag.type === 'rorId').map((tag) => tag.label); + sendQuery(queryParams); }; return ( diff --git a/client/src/pages/index.jsx b/client/src/pages/index.jsx index 90dcf328..afe7ed92 100644 --- a/client/src/pages/index.jsx +++ b/client/src/pages/index.jsx @@ -59,7 +59,7 @@ export default function Home() { }; useEffect(() => { - const regexpTmp = new RegExp(`(${(options?.affiliations ?? []) + const regexpTmp = new RegExp(`(${(options?.affiliationStrings ?? []) .map((affiliationQuery) => affiliationQuery .replaceAll(/(a|à|á|â|ã|ä|å)/g, '(a|à|á|â|ã|ä|å)') .replaceAll(/(e|è|é|ê|ë)/g, '(e|è|é|ê|ë)') @@ -73,7 +73,7 @@ export default function Home() { .replaceAll(/œ/g, '(œ|oe)')) .join('|')})`, 'gi'); setRegexp(regexpTmp); - }, [options?.affiliations]); + }, [options?.affiliationStrings]); useEffect(() => { if (data) { diff --git a/client/src/pages/publicationsTab.jsx b/client/src/pages/publicationsTab.jsx index 0d2c0bfd..b84c5a90 100644 --- a/client/src/pages/publicationsTab.jsx +++ b/client/src/pages/publicationsTab.jsx @@ -18,7 +18,7 @@ export default function PublicationsTab({ publications, publishers, selectedPubl const [filteredDatasources, setFilteredDatasources] = useState(datasources.map((datasource) => datasource.key)); const [filteredPublications, setFilteredPublications] = useState([]); const [filteredPublishers, setFilteredPublishers] = useState([]); - const [filteredStatus, setFilteredStatus] = useState([status.tobedecided.id]); + const [filteredStatus, setFilteredStatus] = useState([status.tobedecided.id, status.validated.id, status.excluded.id]); const [filteredTypes, setFilteredTypes] = useState([]); const [filteredYears, setFilteredYears] = useState([]); const [timer, setTimer] = useState(); diff --git a/client/src/utils/ror.jsx b/client/src/utils/ror.jsx index 101e6658..62680bdd 100644 --- a/client/src/utils/ror.jsx +++ b/client/src/utils/ror.jsx @@ -3,7 +3,7 @@ const rorRegex = /^0[a-hj-km-np-tv-z|0-9]{6}[0-9]{2}$/; const isRor = (affiliation) => rorRegex.test(affiliation); -const getRorNames = async (affiliation) => { +const getRorData = async (affiliation) => { const affiliationId = affiliation.replace('https://ror.org/', '').replace('ror.org/', ''); if (!isRor(affiliationId)) return []; let response = await fetch(`https://api.ror.org/organizations/${affiliationId}`); @@ -12,14 +12,13 @@ const getRorNames = async (affiliation) => { let childrenRes = []; const childrenQueries = []; children.forEach((child) => { - childrenQueries.push(getRorNames(child.id)); + childrenQueries.push(getRorData(child.id)); }); if (childrenQueries.length > 0) { childrenRes = await Promise.all(childrenQueries); } const topLevel = [{ rorId: affiliationId, - children, names: [ response.name, ...response.acronyms, @@ -31,5 +30,6 @@ const getRorNames = async (affiliation) => { }; export { - getRorNames, + getRorData, + isRor, }; diff --git a/client/src/utils/templates.jsx b/client/src/utils/templates.jsx index 418fc499..29026fb3 100644 --- a/client/src/utils/templates.jsx +++ b/client/src/utils/templates.jsx @@ -70,7 +70,7 @@ const getAffiliationsHtmlField = (rowData, regexp) => { const getAffiliationsTooltipField = (rowData) => { let html = '
    '; - html += rowData.affiliations.map((affiliation, index) => `
  • ${affiliation}
  • `).join(''); + html += rowData.affiliations?.map((affiliation, index) => `
  • ${affiliation}
  • `).join(''); html += '
'; return html; }; diff --git a/server/src/routes/works.routes.js b/server/src/routes/works.routes.js index c3fdfd1e..f809a28e 100644 --- a/server/src/routes/works.routes.js +++ b/server/src/routes/works.routes.js @@ -10,39 +10,42 @@ router.route('/works') .get(async (req, res) => { try { const options = req?.query ?? {}; - if (!options?.affiliations) { + if (!options?.affiliationStrings) { res.status(400).json({ message: 'You must provide at least one affiliation.' }); } else { webSocketServer.broadcast(0); - console.time(`1. Requests ${options.affiliations}`); - options.affiliations = options.affiliations.split(','); + console.time(`1. Requests ${options}`); + options.affiliationStrings = options.affiliationStrings.split(','); + if (options?.rors) { + options.rors = options.rors.split(','); + } options.datasets = options.datasets === 'true'; options.years = range(options.startYear, options.endYear); const responses = await Promise.all([ getFosmWorks({ options }), getOpenAlexPublications({ options }), ]); - console.timeEnd(`1. Requests ${options.affiliations}`); + console.timeEnd(`1. Requests ${options}`); webSocketServer.broadcast(1); - console.time(`2. Concat ${options.affiliations}`); + console.time(`2. Concat ${options}`); const works = [ ...responses[0], ...responses[1], ]; - console.timeEnd(`2. Concat ${options.affiliations}`); + console.timeEnd(`2. Concat ${options}`); webSocketServer.broadcast(2); - console.time(`3. Dedup ${options.affiliations}`); + console.time(`3. Dedup ${options}`); // Deduplicate publications by ids const deduplicatedWorks = deduplicateWorks(works); - console.timeEnd(`3. Dedup ${options.affiliations}`); + console.timeEnd(`3. Dedup ${options}`); webSocketServer.broadcast(3); // Compute distinct affiliations of works - console.time(`4. GroupBy ${options.affiliations}`); + console.time(`4. GroupBy ${options}`); const uniqueAffiliations = groupByAffiliations({ options, works: deduplicatedWorks }); - console.timeEnd(`4. GroupBy ${options.affiliations}`); + console.timeEnd(`4. GroupBy ${options}`); webSocketServer.broadcast(4); // Sort between publications and datasets - console.time(`5. Sort works ${options.affiliations}`); + console.time(`5. Sort works ${options}`); const publications = []; let datasets = []; const deduplicatedWorksLength = deduplicatedWorks.length; @@ -58,20 +61,20 @@ router.route('/works') } } } - console.timeEnd(`5. Sort works ${options.affiliations}`); + console.timeEnd(`5. Sort works ${options}`); webSocketServer.broadcast(5); // Compute distinct types & years for facet - console.time(`6. Facet ${options.affiliations}`); + console.time(`6. Facet ${options}`); const publicationsYears = countUniqueValues({ data: publications, field: 'year' }); const datasetsYears = countUniqueValues({ data: datasets, field: 'year' }); const publicationsTypes = countUniqueValues({ data: publications, field: 'type' }); const datasetsTypes = countUniqueValues({ data: datasets, field: 'type' }); const publicationsPublishers = countUniqueValues({ data: publications, field: 'publisher' }); const datasetsPublishers = countUniqueValues({ data: datasets, field: 'publisher' }); - console.timeEnd(`6. Facet ${options.affiliations}`); + console.timeEnd(`6. Facet ${options}`); webSocketServer.broadcast(6); // Build and serialize response - console.time(`7. Serialization ${options.affiliations}`); + console.time(`7. Serialization ${options}`); res.status(200).json({ affiliations: uniqueAffiliations, datasets: { @@ -87,7 +90,7 @@ router.route('/works') years: publicationsYears, }, }); - console.timeEnd(`7. Serialization ${options.affiliations}`); + console.timeEnd(`7. Serialization ${options}`); webSocketServer.broadcast(7); } } catch (err) { diff --git a/server/src/utils/works.js b/server/src/utils/works.js index 65ff16c2..0117600d 100644 --- a/server/src/utils/works.js +++ b/server/src/utils/works.js @@ -29,9 +29,12 @@ const deduplicateWorks = (works) => { const getFosmQuery = (options, pit, searchAfter) => { const query = { size: process.env.FOSM_SIZE, query: { bool: { filter: [], must: [], must_not: [], should: [] } } }; const affiliationsFields = ['affiliations.name']; - options.affiliations.forEach((affiliation) => { + options.affiliationStrings.forEach((affiliation) => { query.query.bool.should.push({ multi_match: { fields: affiliationsFields, query: `"${affiliation}"`, operator: 'and' } }); }); + options.rors.forEach((ror) => { + query.query.bool.should.push({ match: { rors: ror } }); + }); query.query.bool.must.push({ range: { year: { gte: options.year, lte: options.year } } }); // Exclude files for Datacite query.query.bool.must_not.push({ terms: { genre: ['file', 'version', 'file_'] } }); @@ -85,7 +88,7 @@ const getFosmWorksByYear = async ({ results = [], options, pit, searchAfter }) = // eslint-disable-next-line no-param-reassign results = results.concat(hits.map((result) => ({ // Filter ids on unique values - affiliations: result._source.affiliations.map((affiliation) => affiliation.name).filter((affiliation) => !!affiliation), + affiliations: result._source.affiliations?.map((affiliation) => affiliation.name).filter((affiliation) => !!affiliation), allIds: Object.values((result?._source?.external_ids ?? []).reduce((acc, obj) => ({ ...acc, [obj.id_value]: obj }), {})), authors: (result._source?.authors ?? []).map((author) => author.full_name), datasource: ['fosm'], @@ -170,8 +173,8 @@ const getOpenAlexPublicationsByYear = (options, cursor = '*', previousResponse = let url = `https://api.openalex.org/works?per_page=${process.env.OPENALEX_PER_PAGE}`; url += '&filter=is_paratext:false'; url += `,publication_year:${Number(options.year)}-${Number(options?.year)}`; - if (options.affiliations.length) { - url += `,raw_affiliation_string.search:(${options.affiliations.map((aff) => `"${aff}"`).join(' OR ')})`; + if (options.affiliationStrings.length) { + url += `,raw_affiliation_string.search:(${options.affiliationStrings.map((aff) => `"${aff}"`).join(' OR ')})`; } if (options.datasets) { url += ',type:dataset'; @@ -221,7 +224,7 @@ const getOpenAlexPublications = async ({ options }) => { }; const groupByAffiliations = ({ options, works }) => { - const normalizedAffiliations = options.affiliations.map((affiliation) => removeDiacritics(affiliation)); + const normalizedAffiliations = options.affiliationStrings.map((affiliation) => removeDiacritics(affiliation)); // Compute distinct affiliations of works let allAffiliationsTmp = works.reduce((deduplicatedAffiliations, work) => { const { affiliations = [], id } = work;