Skip to content

Commit

Permalink
search for rors in FOSM
Browse files Browse the repository at this point in the history
  • Loading branch information
ericjeangirard committed Feb 21, 2024
1 parent 93cf73e commit 029571d
Show file tree
Hide file tree
Showing 9 changed files with 56 additions and 45 deletions.
6 changes: 4 additions & 2 deletions client/src/pages/affiliationsTab.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ import { normalizeName, renderButtons } from '../utils/works';
export default function AffiliationsTab({ affiliations, selectedAffiliations, setSelectedAffiliations, tagAffiliations }) {
const [filteredAffiliations, setFilteredAffiliations] = useState([]);
const [filteredAffiliationName, setFilteredAffiliationName] = useState('');
const [filteredStatus, setFilteredStatus] = useState([status.tobedecided.id]);
const [filteredStatus, setFilteredStatus] = useState([status.tobedecided.id, status.validated.id, status.excluded.id]);
const [timer, setTimer] = useState();

useEffect(() => {
Expand Down Expand Up @@ -70,7 +70,8 @@ export default function AffiliationsTab({ affiliations, selectedAffiliations, se
</Col>
</Row>
<Row gutters>
<Col n="2">
{ /*
<Col n="1">
<CheckboxGroup
hint="Filter affilitions on the decisions already made"
legend="Filter on decision status"
Expand All @@ -86,6 +87,7 @@ export default function AffiliationsTab({ affiliations, selectedAffiliations, se
))}
</CheckboxGroup>
</Col>
*/ }
<Col n="10">
<AffiliationsView
allAffiliations={filteredAffiliations}
Expand Down
2 changes: 1 addition & 1 deletion client/src/pages/datasetsTab.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ export default function DatasetsTab({ datasets, publishers, selectedDatasets, se
const [filteredAffiliationName, setFilteredAffiliationName] = useState('');
const [filteredDatasets, setFilteredDatasets] = useState([]);
const [filteredDatasources, setFilteredDatasources] = useState(datasources.map((datasource) => datasource.key));
const [filteredStatus, setFilteredStatus] = useState([status.validated.id]);
const [filteredStatus, setFilteredStatus] = useState([status.tobedecided.id, status.validated.id, status.excluded.id]);
const [filteredPublishers, setFilteredPublishers] = useState([]);
const [filteredTypes, setFilteredTypes] = useState([]);
const [filteredYears, setFilteredYears] = useState([]);
Expand Down
29 changes: 16 additions & 13 deletions client/src/pages/filters.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -9,13 +9,15 @@ import {
import PropTypes from 'prop-types';
import { useEffect, useState } from 'react';
import { useSearchParams } from 'react-router-dom';
import { getRorNames } from '../utils/ror';
import { getRorData, isRor } from '../utils/ror';

import TagInput from '../components/tag-input';

const START_YEAR = 2010;
const years = [...Array(new Date().getFullYear() - START_YEAR + 1).keys()].map((year) => (year + START_YEAR).toString()).map((year) => ({ label: year, value: year }));

/**
 * Normalize an affiliation label before it is used as a query value.
 *
 * Every comma is replaced by a space, then each run of consecutive spaces is
 * collapsed into a single space, so "Paris, France" becomes "Paris France"
 * rather than keeping the double space left behind by the removed comma.
 * NOTE(review): commas are presumably removed because the list of values is
 * later split on ',' — confirm against the consumer of this value.
 *
 * @param {string} x - Raw label.
 * @returns {string} The label without commas and without repeated spaces.
 */
const normalizeStr = (x) => x.replaceAll(',', ' ').replace(/ +/g, ' ');

export default function Filters({ sendQuery }) {
const [searchParams, setSearchParams] = useSearchParams();
const [currentSearchParams, setCurrentSearchParams] = useState({});
Expand All @@ -42,33 +44,31 @@ export default function Filters({ sendQuery }) {
startYear: searchParams.get('startYear'),
});
const affiliations = searchParams.getAll('affiliations');
const queries = affiliations.map((affiliation) => getRorNames(affiliation));
const queries = affiliations.map((affiliation) => getRorData(affiliation));
const rorNames = await Promise.all(queries);
const allTags = [];
const knownTags = {};
affiliations.forEach((affiliation) => {
allTags.push({ label: affiliation, source: 'user' });
if (isRor(affiliation)) {
allTags.push({ label: affiliation.replace('https://ror.org/', '').replace('ror.org/', ''), source: 'user', type: 'rorId' });
} else {
allTags.push({ label: affiliation, source: 'user', type: 'affiliationString' });
}
knownTags[affiliation.toLowerCase()] = 1;
});
rorNames.flat().forEach((rorElt) => {
if (knownTags[rorElt.rorId.toLowerCase()] === undefined) {
allTags.push({ label: rorElt.rorId, source: 'rorId' });
allTags.push({ label: rorElt.rorId, source: 'ror', type: 'rorId' });
knownTags[rorElt.rorId.toLowerCase()] = 1;
}
rorElt.names.forEach((rorName) => {
if (knownTags[rorName.toLowerCase()] === undefined) {
allTags.push({ label: rorName, source: 'rorName' });
const isDangerous = rorName.length < 4;
allTags.push({ label: rorName, source: 'ror', type: 'affiliationString', rorId: rorElt.rorId, isDangerous });
knownTags[rorName.toLowerCase()] = 1;
}
});
});
// let allTags = [
// ...affiliations.map((affiliation) => ({ label: affiliation, source: 'user' })),
// ...rorNames.flat().map((name) => ({ label: name, source: 'ror' })),
// ];
// Remove duplicates
// allTags = [...new Map(allTags.reverse().map((v) => [v.label.toLowerCase(), v])).values()].reverse();
console.log('allTags', allTags);
setTags(allTags);
}
};
Expand All @@ -92,7 +92,10 @@ export default function Filters({ sendQuery }) {
}
setMessageType('');
setMessage('');
sendQuery(currentSearchParams);
const queryParams = { datasets: currentSearchParams.datasets, startYear: currentSearchParams.startYear, endYear: currentSearchParams.endYear };
queryParams.affiliationStrings = tags.filter((tag) => tag.type === 'affiliationString').map((tag) => normalizeStr(tag.label));
queryParams.rors = tags.filter((tag) => tag.type === 'rorId').map((tag) => tag.label);
sendQuery(queryParams);
};

return (
Expand Down
4 changes: 2 additions & 2 deletions client/src/pages/index.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ export default function Home() {
};

useEffect(() => {
const regexpTmp = new RegExp(`(${(options?.affiliations ?? [])
const regexpTmp = new RegExp(`(${(options?.affiliationStrings ?? [])
.map((affiliationQuery) => affiliationQuery
.replaceAll(/(a|à|á|â|ã|ä|å)/g, '(a|à|á|â|ã|ä|å)')
.replaceAll(/(e|è|é|ê|ë)/g, '(e|è|é|ê|ë)')
Expand All @@ -73,7 +73,7 @@ export default function Home() {
.replaceAll(/œ/g, '(œ|oe)'))
.join('|')})`, 'gi');
setRegexp(regexpTmp);
}, [options?.affiliations]);
}, [options?.affiliationStrings]);

useEffect(() => {
if (data) {
Expand Down
2 changes: 1 addition & 1 deletion client/src/pages/publicationsTab.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ export default function PublicationsTab({ publications, publishers, selectedPubl
const [filteredDatasources, setFilteredDatasources] = useState(datasources.map((datasource) => datasource.key));
const [filteredPublications, setFilteredPublications] = useState([]);
const [filteredPublishers, setFilteredPublishers] = useState([]);
const [filteredStatus, setFilteredStatus] = useState([status.tobedecided.id]);
const [filteredStatus, setFilteredStatus] = useState([status.tobedecided.id, status.validated.id, status.excluded.id]);
const [filteredTypes, setFilteredTypes] = useState([]);
const [filteredYears, setFilteredYears] = useState([]);
const [timer, setTimer] = useState();
Expand Down
8 changes: 4 additions & 4 deletions client/src/utils/ror.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ const rorRegex = /^0[a-hj-km-np-tv-z|0-9]{6}[0-9]{2}$/;

/**
 * Tell whether a string matches the ROR identifier pattern held in `rorRegex`.
 *
 * @param {string} affiliation - Candidate identifier to test.
 * @returns {boolean} `true` when `rorRegex` matches the value, `false` otherwise.
 */
const isRor = (affiliation) => {
  const matchesRorPattern = rorRegex.test(affiliation);
  return matchesRorPattern;
};

const getRorNames = async (affiliation) => {
const getRorData = async (affiliation) => {
const affiliationId = affiliation.replace('https://ror.org/', '').replace('ror.org/', '');
if (!isRor(affiliationId)) return [];
let response = await fetch(`https://api.ror.org/organizations/${affiliationId}`);
Expand All @@ -12,14 +12,13 @@ const getRorNames = async (affiliation) => {
let childrenRes = [];
const childrenQueries = [];
children.forEach((child) => {
childrenQueries.push(getRorNames(child.id));
childrenQueries.push(getRorData(child.id));
});
if (childrenQueries.length > 0) {
childrenRes = await Promise.all(childrenQueries);
}
const topLevel = [{
rorId: affiliationId,
children,
names: [
response.name,
...response.acronyms,
Expand All @@ -31,5 +30,6 @@ const getRorNames = async (affiliation) => {
};

export {
getRorNames,
getRorData,
isRor,
};
2 changes: 1 addition & 1 deletion client/src/utils/templates.jsx
Original file line number Diff line number Diff line change
Expand Up @@ -70,7 +70,7 @@ const getAffiliationsHtmlField = (rowData, regexp) => {

/**
 * Build the HTML list displayed in the affiliations tooltip of a row.
 *
 * @param {object} rowData - Row being rendered. Reads `rowData.id` (used in the
 *   `key` attribute of each item) and the optional `rowData.affiliations` array.
 * @returns {string} A `<ul>` containing one `<li>` per affiliation, or an empty
 *   `<ul></ul>` when the row has no affiliations.
 */
const getAffiliationsTooltipField = (rowData) => {
  // Fall back to an empty array: `affiliations?.map(...).join('')` alone
  // short-circuits to `undefined` when the array is missing, and concatenating
  // that would render the literal text "undefined" inside the list.
  // NOTE(review): affiliation text is interpolated without HTML escaping —
  // confirm it is trusted or sanitized before being injected in the page.
  const items = (rowData.affiliations ?? [])
    .map((affiliation, index) => `<li key="tooltip-affiliation-${rowData.id}-${index}">${affiliation}</li>`)
    .join('');
  return `<ul>${items}</ul>`;
};
Expand Down
35 changes: 19 additions & 16 deletions server/src/routes/works.routes.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,39 +10,42 @@ router.route('/works')
.get(async (req, res) => {
try {
const options = req?.query ?? {};
if (!options?.affiliations) {
if (!options?.affiliationStrings) {
res.status(400).json({ message: 'You must provide at least one affiliation.' });
} else {
webSocketServer.broadcast(0);
console.time(`1. Requests ${options.affiliations}`);
options.affiliations = options.affiliations.split(',');
console.time(`1. Requests ${options}`);
options.affiliationStrings = options.affiliationStrings.split(',');
if (options?.rors) {
options.rors = options.rors.split(',');
}
options.datasets = options.datasets === 'true';
options.years = range(options.startYear, options.endYear);
const responses = await Promise.all([
getFosmWorks({ options }),
getOpenAlexPublications({ options }),
]);
console.timeEnd(`1. Requests ${options.affiliations}`);
console.timeEnd(`1. Requests ${options}`);
webSocketServer.broadcast(1);
console.time(`2. Concat ${options.affiliations}`);
console.time(`2. Concat ${options}`);
const works = [
...responses[0],
...responses[1],
];
console.timeEnd(`2. Concat ${options.affiliations}`);
console.timeEnd(`2. Concat ${options}`);
webSocketServer.broadcast(2);
console.time(`3. Dedup ${options.affiliations}`);
console.time(`3. Dedup ${options}`);
// Deduplicate publications by ids
const deduplicatedWorks = deduplicateWorks(works);
console.timeEnd(`3. Dedup ${options.affiliations}`);
console.timeEnd(`3. Dedup ${options}`);
webSocketServer.broadcast(3);
// Compute distinct affiliations of works
console.time(`4. GroupBy ${options.affiliations}`);
console.time(`4. GroupBy ${options}`);
const uniqueAffiliations = groupByAffiliations({ options, works: deduplicatedWorks });
console.timeEnd(`4. GroupBy ${options.affiliations}`);
console.timeEnd(`4. GroupBy ${options}`);
webSocketServer.broadcast(4);
// Sort between publications and datasets
console.time(`5. Sort works ${options.affiliations}`);
console.time(`5. Sort works ${options}`);
const publications = [];
let datasets = [];
const deduplicatedWorksLength = deduplicatedWorks.length;
Expand All @@ -58,20 +61,20 @@ router.route('/works')
}
}
}
console.timeEnd(`5. Sort works ${options.affiliations}`);
console.timeEnd(`5. Sort works ${options}`);
webSocketServer.broadcast(5);
// Compute distinct types & years for facet
console.time(`6. Facet ${options.affiliations}`);
console.time(`6. Facet ${options}`);
const publicationsYears = countUniqueValues({ data: publications, field: 'year' });
const datasetsYears = countUniqueValues({ data: datasets, field: 'year' });
const publicationsTypes = countUniqueValues({ data: publications, field: 'type' });
const datasetsTypes = countUniqueValues({ data: datasets, field: 'type' });
const publicationsPublishers = countUniqueValues({ data: publications, field: 'publisher' });
const datasetsPublishers = countUniqueValues({ data: datasets, field: 'publisher' });
console.timeEnd(`6. Facet ${options.affiliations}`);
console.timeEnd(`6. Facet ${options}`);
webSocketServer.broadcast(6);
// Build and serialize response
console.time(`7. Serialization ${options.affiliations}`);
console.time(`7. Serialization ${options}`);
res.status(200).json({
affiliations: uniqueAffiliations,
datasets: {
Expand All @@ -87,7 +90,7 @@ router.route('/works')
years: publicationsYears,
},
});
console.timeEnd(`7. Serialization ${options.affiliations}`);
console.timeEnd(`7. Serialization ${options}`);
webSocketServer.broadcast(7);
}
} catch (err) {
Expand Down
13 changes: 8 additions & 5 deletions server/src/utils/works.js
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,12 @@ const deduplicateWorks = (works) => {
const getFosmQuery = (options, pit, searchAfter) => {
const query = { size: process.env.FOSM_SIZE, query: { bool: { filter: [], must: [], must_not: [], should: [] } } };
const affiliationsFields = ['affiliations.name'];
options.affiliations.forEach((affiliation) => {
options.affiliationStrings.forEach((affiliation) => {
query.query.bool.should.push({ multi_match: { fields: affiliationsFields, query: `"${affiliation}"`, operator: 'and' } });
});
options.rors.forEach((ror) => {
query.query.bool.should.push({ match: { rors: ror } });
});
query.query.bool.must.push({ range: { year: { gte: options.year, lte: options.year } } });
// Exclude files for Datacite
query.query.bool.must_not.push({ terms: { genre: ['file', 'version', 'file_'] } });
Expand Down Expand Up @@ -85,7 +88,7 @@ const getFosmWorksByYear = async ({ results = [], options, pit, searchAfter }) =
// eslint-disable-next-line no-param-reassign
results = results.concat(hits.map((result) => ({
// Filter ids on unique values
affiliations: result._source.affiliations.map((affiliation) => affiliation.name).filter((affiliation) => !!affiliation),
affiliations: result._source.affiliations?.map((affiliation) => affiliation.name).filter((affiliation) => !!affiliation),
allIds: Object.values((result?._source?.external_ids ?? []).reduce((acc, obj) => ({ ...acc, [obj.id_value]: obj }), {})),
authors: (result._source?.authors ?? []).map((author) => author.full_name),
datasource: ['fosm'],
Expand Down Expand Up @@ -170,8 +173,8 @@ const getOpenAlexPublicationsByYear = (options, cursor = '*', previousResponse =
let url = `https://api.openalex.org/works?per_page=${process.env.OPENALEX_PER_PAGE}`;
url += '&filter=is_paratext:false';
url += `,publication_year:${Number(options.year)}-${Number(options?.year)}`;
if (options.affiliations.length) {
url += `,raw_affiliation_string.search:(${options.affiliations.map((aff) => `"${aff}"`).join(' OR ')})`;
if (options.affiliationStrings.length) {
url += `,raw_affiliation_string.search:(${options.affiliationStrings.map((aff) => `"${aff}"`).join(' OR ')})`;
}
if (options.datasets) {
url += ',type:dataset';
Expand Down Expand Up @@ -221,7 +224,7 @@ const getOpenAlexPublications = async ({ options }) => {
};

const groupByAffiliations = ({ options, works }) => {
const normalizedAffiliations = options.affiliations.map((affiliation) => removeDiacritics(affiliation));
const normalizedAffiliations = options.affiliationStrings.map((affiliation) => removeDiacritics(affiliation));
// Compute distinct affiliations of works
let allAffiliationsTmp = works.reduce((deduplicatedAffiliations, work) => {
const { affiliations = [], id } = work;
Expand Down

0 comments on commit 029571d

Please sign in to comment.