Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

workflows: enable matcher on hep #4298

Merged
merged 1 commit into from
Jul 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 5 additions & 10 deletions inspirehep/modules/workflows/tasks/actions.py
Original file line number Diff line number Diff line change
Expand Up @@ -78,7 +78,6 @@
extract_references_from_text,
extract_references_from_text_data,
)
from inspirehep.modules.refextract.matcher import match_references
from inspirehep.modules.workflows.utils import _get_headers_for_hep_root_table_request, create_error
from inspirehep.modules.workflows.errors import BadGatewayError, MissingRecordControlNumber
from inspirehep.modules.workflows.utils import (
Expand Down Expand Up @@ -474,10 +473,6 @@ def match_references_hep(references):
create_error(response)


def match_references_based_on_flag(references):
    """Match the given references and return the result.

    Thin wrapper: ``references`` is forwarded unchanged to
    ``match_references`` and whatever that call returns is handed back
    to the caller.
    """
    matched_references = match_references(references)
    return matched_references


@with_debug_logging
def refextract(obj, eng):
"""Extract references from various sources and add them to the workflow.
Expand All @@ -497,7 +492,7 @@ def refextract(obj, eng):
if 'references' in obj.data:
extracted_raw_references = dedupe_list(extract_references_from_raw_refs(obj.data['references']))
obj.log.info('Extracted %d references from raw refs.', len(extracted_raw_references))
obj.data['references'] = match_references_based_on_flag(extracted_raw_references)
obj.data['references'] = match_references_hep(extracted_raw_references)
return

matched_pdf_references, matched_text_references = [], []
Expand All @@ -511,12 +506,12 @@ def refextract(obj, eng):
url, source=source, custom_kbs_file=journal_kb_dict
)
)
matched_pdf_references = match_references_based_on_flag(pdf_references)
matched_pdf_references = match_references_hep(pdf_references)
else:
with get_document_in_workflow(obj) as tmp_document:
if tmp_document:
pdf_references = dedupe_list(extract_references_from_pdf(tmp_document, source))
matched_pdf_references = match_references_based_on_flag(pdf_references)
matched_pdf_references = match_references_hep(pdf_references)

text = get_value(obj.extra_data, 'formdata.references')
if text and current_app.config.get("FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE"):
Expand All @@ -525,10 +520,10 @@ def refextract(obj, eng):
text, source=source, custom_kbs_file=journal_kb_dict
)
)
matched_text_references = match_references_based_on_flag(text_references)
matched_text_references = match_references_hep(text_references)
elif text:
text_references = dedupe_list(extract_references_from_text(text, source))
matched_text_references = match_references_based_on_flag(text_references)
matched_text_references = match_references_hep(text_references)

if not matched_pdf_references and not matched_text_references:
obj.log.info('No references extracted.')
Expand Down
117 changes: 110 additions & 7 deletions tests/integration/workflows/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,19 +40,21 @@

from inspirehep.factory import create_app
from inspirehep.modules.fixtures.files import init_all_storage_paths
from inspirehep.modules.fixtures.users import (init_authentication_token,
init_users_and_permissions)
from inspirehep.modules.fixtures.users import (
init_authentication_token,
init_users_and_permissions,
)
from inspirehep.modules.records.api import InspireRecord
from inspirehep.modules.workflows.utils import \
_get_headers_for_hep_root_table_request
from inspirehep.modules.workflows.utils import _get_headers_for_hep_root_table_request

# Use the helpers folder to store test helpers.
# See: http://stackoverflow.com/a/33515264/374865
sys.path.insert(0, os.path.join(os.path.dirname(__file__), "helpers"))


from factories.db.invenio_records import \
cleanup as invenio_records_factory_cleanup # noqa
from factories.db.invenio_records import (
cleanup as invenio_records_factory_cleanup,
) # noqa

HIGGS_ONTOLOGY = """<?xml version="1.0" encoding="UTF-8" ?>

Expand Down Expand Up @@ -123,7 +125,6 @@ def workflow_app(higgs_ontology):
with mock.patch(
"inspirehep.modules.records.receivers.index_modified_citations_from_record.apply_async"
):

yield app


Expand Down Expand Up @@ -289,6 +290,108 @@ def mocked_external_services(workflow_app):
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
requests_mocker.register_uri(
"POST",
"{}/extract_references_from_url".format(
workflow_app.config["REFEXTRACT_SERVICE_URL"]
),
json={
"extracted_references": [
{
"author": ["G. Chalons, M. D. Goodsell, S. Kraml"],
"journal_page": ["113"],
"journal_reference": ["JHEP,1904,113"],
"journal_title": ["JHEP"],
"journal_volume": ["1904"],
"journal_year": ["2019"],
"linemarker": ["67"],
"misc": ["H. Reyes-González, S. L. Williamson"],
"raw_ref": [
"[67] G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-González, S. L. Williamson, “LHC limits on gluinos and squarks in the minimal Dirac gaugino model”, JHEP 04, 113 (2019), arXiv:1812.09293."
],
"reportnumber": ["arXiv:1812.09293"],
"title": [
"LHC limits on gluinos and squarks in the minimal Dirac gaugino model"
],
"year": ["2019"],
},
]
},
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
requests_mocker.register_uri(
"POST",
"{}/api/matcher/linked_references/".format(
workflow_app.config["INSPIREHEP_URL"]
),
json={
"references": [
{
"record": {
"$ref": "http://localhost:5000/api/literature/1000",
},
"raw_refs": [
{
"source": "submitter",
"schema": "That's a schema",
"value": "That's a reference",
}
],
}
]
},
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
requests_mocker.register_uri(
"POST",
"{}/extract_references_from_text".format(
workflow_app.config["REFEXTRACT_SERVICE_URL"]
),
json={
"extracted_references": [
{
"author": [
"G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-González, S. L. Williamson"
],
"journal_page": ["113"],
"journal_reference": ["JHEP,1904,113"],
"journal_title": ["JHEP"],
"journal_volume": ["1904"],
"journal_year": ["2019"],
"linemarker": ["67"],
"raw_ref": [
"[67] G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-Gonz´ alez, S. L. Williamson, “LHC limits on gluinos and squarks in the minimal Dirac gaugino model”, JHEP 04, 113 (2019), arXiv:1812.09293."
],
"reportnumber": ["arXiv:1812.09293"],
"title": [
"LHC limits on gluinos and squarks in the minimal Dirac gaugino model"
],
"year": ["2019"],
}
]
},
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
requests_mocker.register_uri(
"POST",
"{}/extract_journal_info".format(
workflow_app.config["REFEXTRACT_SERVICE_URL"]
),
json={
"extracted_publication_infos": [
{
"title": "A test title",
"year": 2014,
'title': 'A test title'
}
]
},
headers=_get_headers_for_hep_root_table_request(),
status_code=200,
)
if "INSPIREHEP_URL" in workflow_app.config:
# HEP record upload
requests_mocker.register_uri(
Expand Down
23 changes: 13 additions & 10 deletions tests/integration/workflows/test_arxiv_workflow.py
Original file line number Diff line number Diff line change
Expand Up @@ -1723,10 +1723,9 @@ def test_workflow_checks_affiliations_if_record_is_not_important(
):
workflow_id = build_workflow(record).id
start("article", object_id=workflow_id)

collections_in_record = mocked_external_services.request_history[4].json()[
"_collections"
]
collections_in_record = filter(
lambda x: x.path == '/literature',
mocked_external_services.request_history).pop().json().get('_collections')
assert "CDS Hidden" in collections_in_record
assert "HAL Hidden" in collections_in_record
assert "Fermilab" in collections_in_record
Expand Down Expand Up @@ -1781,9 +1780,11 @@ def test_workflow_do_not_changes_to_hidden_if_record_authors_do_not_have_interes
wf.save()
wf.continue_workflow(delayed=False)

collections_in_record = mocked_external_services.request_history[4].json()[
"_collections"
]
collections_in_record = filter(
lambda x: x.path == '/literature',
mocked_external_services.request_history
).pop().json().get('_collections')

assert "CDS Hidden" not in collections_in_record
assert "HAL Hidden" not in collections_in_record
assert "Fermilab" not in collections_in_record
Expand Down Expand Up @@ -1876,9 +1877,11 @@ def test_workflow_checks_affiliations_if_record_is_rejected_by_curator(
wf.save()
wf.continue_workflow(delayed=False)

collections_in_record = mocked_external_services.request_history[4].json()[
"_collections"
]
collections_in_record = filter(
lambda x: x.path == '/literature',
mocked_external_services.request_history
).pop().json().get('_collections')

assert "CDS Hidden" in collections_in_record
assert "HAL Hidden" in collections_in_record
assert "Fermilab" in collections_in_record
Expand Down
Loading