Skip to content

Commit

Permalink
Merge pull request #4298 from inspirehep/matcher-on-hep
Browse files (browse the repository at this point in the history)
workflows: enable matcher on hep
  • Loading branch information
drjova authored Jul 14, 2023
2 parents ecbf35a + 79078b8 commit 544e6d7
Show file tree
Hide file tree
Showing 6 changed files with 407 additions and 174 deletions.
15 changes: 5 additions & 10 deletions inspirehep/modules/workflows/tasks/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@
extract_references_from_text,
extract_references_from_text_data,
)
from inspirehep.modules.refextract.matcher import match_references
from inspirehep.modules.workflows.utils import _get_headers_for_hep_root_table_request, create_error
from inspirehep.modules.workflows.errors import BadGatewayError, MissingRecordControlNumber
from inspirehep.modules.workflows.utils import (
Expand Down Expand Up @@ -474,10 +473,6 @@ def match_references_hep(references):
create_error(response)


def match_references_based_on_flag(references):
    """Match the given references by delegating to ``match_references``.

    NOTE(review): despite the name, no feature flag is consulted in this
    body; the call is forwarded unconditionally.

    Args:
        references: passed through unchanged to ``match_references``.

    Returns:
        Whatever ``match_references`` returns for ``references``.
    """
    matched_references = match_references(references)
    return matched_references


@with_debug_logging
def refextract(obj, eng):
"""Extract references from various sources and add them to the workflow.
Expand All @@ -497,7 +492,7 @@ def refextract(obj, eng):
if 'references' in obj.data:
extracted_raw_references = dedupe_list(extract_references_from_raw_refs(obj.data['references']))
obj.log.info('Extracted %d references from raw refs.', len(extracted_raw_references))
obj.data['references'] = match_references_based_on_flag(extracted_raw_references)
obj.data['references'] = match_references_hep(extracted_raw_references)
return

matched_pdf_references, matched_text_references = [], []
Expand All @@ -511,12 +506,12 @@ def refextract(obj, eng):
url, source=source, custom_kbs_file=journal_kb_dict
)
)
matched_pdf_references = match_references_based_on_flag(pdf_references)
matched_pdf_references = match_references_hep(pdf_references)
else:
with get_document_in_workflow(obj) as tmp_document:
if tmp_document:
pdf_references = dedupe_list(extract_references_from_pdf(tmp_document, source))
matched_pdf_references = match_references_based_on_flag(pdf_references)
matched_pdf_references = match_references_hep(pdf_references)

text = get_value(obj.extra_data, 'formdata.references')
if text and current_app.config.get("FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE"):
Expand All @@ -525,10 +520,10 @@ def refextract(obj, eng):
text, source=source, custom_kbs_file=journal_kb_dict
)
)
matched_text_references = match_references_based_on_flag(text_references)
matched_text_references = match_references_hep(text_references)
elif text:
text_references = dedupe_list(extract_references_from_text(text, source))
matched_text_references = match_references_based_on_flag(text_references)
matched_text_references = match_references_hep(text_references)

if not matched_pdf_references and not matched_text_references:
obj.log.info('No references extracted.')
Expand Down
117 changes: 110 additions & 7 deletions tests/integration/workflows/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,21 @@

from inspirehep.factory import create_app
from inspirehep.modules.fixtures.files import init_all_storage_paths
from inspirehep.modules.fixtures.users import (init_authentication_token,
init_users_and_permissions)
from inspirehep.modules.fixtures.users import (
init_authentication_token,
init_users_and_permissions,
)
from inspirehep.modules.records.api import InspireRecord
from inspirehep.modules.workflows.utils import \
_get_headers_for_hep_root_table_request
from inspirehep.modules.workflows.utils import _get_headers_for_hep_root_table_request

# Use the helpers folder to store test helpers.
# See: http://stackoverflow.com/a/33515264/374865
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "helpers"))


from factories.db.invenio_records import \
cleanup as invenio_records_factory_cleanup # noqa
from factories.db.invenio_records import (
cleanup as invenio_records_factory_cleanup,
) # noqa

HIGGS_ONTOLOGY = """<?xml version="1.0" encoding="UTF-8" ?>
Expand Down Expand Up @@ -123,7 +125,6 @@ def workflow_app(higgs_ontology):
with mock.patch(
"inspirehep.modules.records.receivers.index_modified_citations_from_record.apply_async"
):

yield app


Expand Down Expand Up @@ -289,6 +290,108 @@ def mocked_external_services(workflow_app):
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
requests_mocker.register_uri(
"POST",
"{}/extract_references_from_url".format(
workflow_app.config["REFEXTRACT_SERVICE_URL"]
),
json={
"extracted_references": [
{
"author": ["G. Chalons, M. D. Goodsell, S. Kraml"],
"journal_page": ["113"],
"journal_reference": ["JHEP,1904,113"],
"journal_title": ["JHEP"],
"journal_volume": ["1904"],
"journal_year": ["2019"],
"linemarker": ["67"],
"misc": ["H. Reyes-González, S. L. Williamson"],
"raw_ref": [
"[67] G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-González, S. L. Williamson, “LHC limits on gluinos and squarks in the minimal Dirac gaugino model”, JHEP 04, 113 (2019), arXiv:1812.09293."
],
"reportnumber": ["arXiv:1812.09293"],
"title": [
"LHC limits on gluinos and squarks in the minimal Dirac gaugino model"
],
"year": ["2019"],
},
]
},
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
requests_mocker.register_uri(
"POST",
"{}/api/matcher/linked_references/".format(
workflow_app.config["INSPIREHEP_URL"]
),
json={
"references": [
{
"record": {
"$ref": "http://localhost:5000/api/literature/1000",
},
"raw_refs": [
{
"source": "submitter",
"schema": "That's a schema",
"value": "That's a reference",
}
],
}
]
},
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
requests_mocker.register_uri(
"POST",
"{}/extract_references_from_text".format(
workflow_app.config["REFEXTRACT_SERVICE_URL"]
),
json={
"extracted_references": [
{
"author": [
"G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-González, S. L. Williamson"
],
"journal_page": ["113"],
"journal_reference": ["JHEP,1904,113"],
"journal_title": ["JHEP"],
"journal_volume": ["1904"],
"journal_year": ["2019"],
"linemarker": ["67"],
"raw_ref": [
"[67] G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-Gonz´ alez, S. L. Williamson, “LHC limits on gluinos and squarks in the minimal Dirac gaugino model”, JHEP 04, 113 (2019), arXiv:1812.09293."
],
"reportnumber": ["arXiv:1812.09293"],
"title": [
"LHC limits on gluinos and squarks in the minimal Dirac gaugino model"
],
"year": ["2019"],
}
]
},
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
requests_mocker.register_uri(
"POST",
"{}/extract_journal_info".format(
workflow_app.config["REFEXTRACT_SERVICE_URL"]
),
json={
"extracted_publication_infos": [
{
"title": "A test title",
"year": 2014,
'title': 'A test title'
}
]
},
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
if "INSPIREHEP_URL" in workflow_app.config:
# HEP record upload
requests_mocker.register_uri(
Expand Down
23 changes: 13 additions & 10 deletions tests/integration/workflows/test_arxiv_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1723,10 +1723,9 @@ def test_workflow_checks_affiliations_if_record_is_not_important(
):
workflow_id = build_workflow(record).id
start("article", object_id=workflow_id)

collections_in_record = mocked_external_services.request_history[4].json()[
"_collections"
]
collections_in_record = filter(
lambda x: x.path == '/literature',
mocked_external_services.request_history).pop().json().get('_collections')
assert "CDS Hidden" in collections_in_record
assert "HAL Hidden" in collections_in_record
assert "Fermilab" in collections_in_record
Expand Down Expand Up @@ -1781,9 +1780,11 @@ def test_workflow_do_not_changes_to_hidden_if_record_authors_do_not_have_interes
wf.save()
wf.continue_workflow(delayed=False)

collections_in_record = mocked_external_services.request_history[4].json()[
"_collections"
]
collections_in_record = filter(
lambda x: x.path == '/literature',
mocked_external_services.request_history
).pop().json().get('_collections')

assert "CDS Hidden" not in collections_in_record
assert "HAL Hidden" not in collections_in_record
assert "Fermilab" not in collections_in_record
Expand Down Expand Up @@ -1876,9 +1877,11 @@ def test_workflow_checks_affiliations_if_record_is_rejected_by_curator(
wf.save()
wf.continue_workflow(delayed=False)

collections_in_record = mocked_external_services.request_history[4].json()[
"_collections"
]
collections_in_record = filter(
lambda x: x.path == '/literature',
mocked_external_services.request_history
).pop().json().get('_collections')

assert "CDS Hidden" in collections_in_record
assert "HAL Hidden" in collections_in_record
assert "Fermilab" in collections_in_record
Expand Down
Loading

0 comments on commit 544e6d7

Please sign in to comment.