From 79078b8f8c09504b50f2692ad29cbc4476534568 Mon Sep 17 00:00:00 2001 From: Harris Tzovanakis Date: Thu, 13 Jul 2023 10:51:50 +0200 Subject: [PATCH] workflows: enable matcher on hep * ref: cern-sis/issues-inspire#347 --- inspirehep/modules/workflows/tasks/actions.py | 15 +- tests/integration/workflows/conftest.py | 117 ++++++++- .../workflows/test_arxiv_workflow.py | 23 +- .../workflows/test_workflows_tasks_actions.py | 142 +++-------- tests/integration_async/test_workflows.py | 44 ++++ .../workflows/test_workflows_tasks_actions.py | 240 ++++++++++++++---- 6 files changed, 407 insertions(+), 174 deletions(-) diff --git a/inspirehep/modules/workflows/tasks/actions.py b/inspirehep/modules/workflows/tasks/actions.py index 14f4b3f2ff..5c0563ed66 100644 --- a/inspirehep/modules/workflows/tasks/actions.py +++ b/inspirehep/modules/workflows/tasks/actions.py @@ -78,7 +78,6 @@ extract_references_from_text, extract_references_from_text_data, ) -from inspirehep.modules.refextract.matcher import match_references from inspirehep.modules.workflows.utils import _get_headers_for_hep_root_table_request, create_error from inspirehep.modules.workflows.errors import BadGatewayError, MissingRecordControlNumber from inspirehep.modules.workflows.utils import ( @@ -474,10 +473,6 @@ def match_references_hep(references): create_error(response) -def match_references_based_on_flag(references): - return match_references(references) - - @with_debug_logging def refextract(obj, eng): """Extract references from various sources and add them to the workflow. @@ -497,7 +492,7 @@ def refextract(obj, eng): if 'references' in obj.data: extracted_raw_references = dedupe_list(extract_references_from_raw_refs(obj.data['references'])) obj.log.info('Extracted %d references from raw refs.', len(extracted_raw_references)) - obj.data['references'] = match_references_based_on_flag(extracted_raw_references) + obj.data['references'] = match_references_hep(extracted_raw_references) return matched_pdf_references, matched_text_references = [], [] @@ -511,12 +506,12 @@ def refextract(obj, eng): url, source=source, custom_kbs_file=journal_kb_dict ) ) - matched_pdf_references = match_references_based_on_flag(pdf_references) + matched_pdf_references = match_references_hep(pdf_references) else: with get_document_in_workflow(obj) as tmp_document: if tmp_document: pdf_references = dedupe_list(extract_references_from_pdf(tmp_document, source)) - matched_pdf_references = match_references_based_on_flag(pdf_references) + matched_pdf_references = match_references_hep(pdf_references) text = get_value(obj.extra_data, 'formdata.references') if text and current_app.config.get("FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE"): @@ -525,10 +520,10 @@ def refextract(obj, eng): text, source=source, custom_kbs_file=journal_kb_dict ) ) - matched_text_references = match_references_based_on_flag(text_references) + matched_text_references = match_references_hep(text_references) elif text: text_references = dedupe_list(extract_references_from_text(text, source)) - matched_text_references = match_references_based_on_flag(text_references) + matched_text_references = match_references_hep(text_references) if not matched_pdf_references and not matched_text_references: obj.log.info('No references extracted.') diff --git a/tests/integration/workflows/conftest.py b/tests/integration/workflows/conftest.py index f85095bc80..b4c14eb0a5 100644 --- a/tests/integration/workflows/conftest.py +++ b/tests/integration/workflows/conftest.py @@ -40,19 +40,21 @@ from inspirehep.factory import create_app from inspirehep.modules.fixtures.files import init_all_storage_paths -from inspirehep.modules.fixtures.users import (init_authentication_token, - init_users_and_permissions) +from inspirehep.modules.fixtures.users import ( + init_authentication_token, + init_users_and_permissions, +) from inspirehep.modules.records.api import InspireRecord -from inspirehep.modules.workflows.utils import \ - _get_headers_for_hep_root_table_request +from inspirehep.modules.workflows.utils import _get_headers_for_hep_root_table_request # Use the helpers folder to store test helpers. # See: http://stackoverflow.com/a/33515264/374865 sys.path.insert(0, os.path.join(os.path.dirname(__file__), "helpers")) -from factories.db.invenio_records import \ - cleanup as invenio_records_factory_cleanup # noqa +from factories.db.invenio_records import ( + cleanup as invenio_records_factory_cleanup, +) # noqa HIGGS_ONTOLOGY = """ @@ -123,7 +125,6 @@ def workflow_app(higgs_ontology): with mock.patch( "inspirehep.modules.records.receivers.index_modified_citations_from_record.apply_async" ): - yield app @@ -289,6 +290,108 @@ def mocked_external_services(workflow_app): headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + requests_mocker.register_uri( + "POST", + "{}/extract_references_from_url".format( + workflow_app.config["REFEXTRACT_SERVICE_URL"] + ), + json={ + "extracted_references": [ + { + "author": ["G. Chalons, M. D. Goodsell, S. Kraml"], + "journal_page": ["113"], + "journal_reference": ["JHEP,1904,113"], + "journal_title": ["JHEP"], + "journal_volume": ["1904"], + "journal_year": ["2019"], + "linemarker": ["67"], + "misc": ["H. Reyes-González, S. L. Williamson"], + "raw_ref": [ + "[67] G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-González, S. L. Williamson, “LHC limits on gluinos and squarks in the minimal Dirac gaugino model”, JHEP 04, 113 (2019), arXiv:1812.09293." + ], + "reportnumber": ["arXiv:1812.09293"], + "title": [ + "LHC limits on gluinos and squarks in the minimal Dirac gaugino model" + ], + "year": ["2019"], + }, + ] + }, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) + requests_mocker.register_uri( + "POST", + "{}/api/matcher/linked_references/".format( + workflow_app.config["INSPIREHEP_URL"] + ), + json={ + "references": [ + { + "record": { + "$ref": "http://localhost:5000/api/literature/1000", + }, + "raw_refs": [ + { + "source": "submitter", + "schema": "That's a schema", + "value": "That's a reference", + } + ], + } + ] + }, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) + requests_mocker.register_uri( + "POST", + "{}/extract_references_from_text".format( + workflow_app.config["REFEXTRACT_SERVICE_URL"] + ), + json={ + "extracted_references": [ + { + "author": [ + "G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-González, S. L. Williamson" + ], + "journal_page": ["113"], + "journal_reference": ["JHEP,1904,113"], + "journal_title": ["JHEP"], + "journal_volume": ["1904"], + "journal_year": ["2019"], + "linemarker": ["67"], + "raw_ref": [ + "[67] G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-Gonz´ alez, S. L. Williamson, “LHC limits on gluinos and squarks in the minimal Dirac gaugino model”, JHEP 04, 113 (2019), arXiv:1812.09293." + ], + "reportnumber": ["arXiv:1812.09293"], + "title": [ + "LHC limits on gluinos and squarks in the minimal Dirac gaugino model" + ], + "year": ["2019"], + } + ] + }, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) + requests_mocker.register_uri( + "POST", + "{}/extract_journal_info".format( + workflow_app.config["REFEXTRACT_SERVICE_URL"] + ), + json={ + "extracted_publication_infos": [ + { + "title": "A test title", + "year": 2014, + 'title': 'A test title' + } + ] + }, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) if "INSPIREHEP_URL" in workflow_app.config: # HEP record upload requests_mocker.register_uri( diff --git a/tests/integration/workflows/test_arxiv_workflow.py b/tests/integration/workflows/test_arxiv_workflow.py index 5adbc78fab..20a6f15919 100644 --- a/tests/integration/workflows/test_arxiv_workflow.py +++ b/tests/integration/workflows/test_arxiv_workflow.py @@ -1723,10 +1723,9 @@ def test_workflow_checks_affiliations_if_record_is_not_important( ): workflow_id = build_workflow(record).id start("article", object_id=workflow_id) - - collections_in_record = mocked_external_services.request_history[4].json()[ - "_collections" - ] + collections_in_record = filter( + lambda x: x.path == '/literature', + mocked_external_services.request_history).pop().json().get('_collections') assert "CDS Hidden" in collections_in_record assert "HAL Hidden" in collections_in_record assert "Fermilab" in collections_in_record @@ -1781,9 +1780,11 @@ def test_workflow_do_not_changes_to_hidden_if_record_authors_do_not_have_interes wf.save() wf.continue_workflow(delayed=False) - collections_in_record = mocked_external_services.request_history[4].json()[ - "_collections" - ] + collections_in_record = filter( + lambda x: x.path == '/literature', + mocked_external_services.request_history + ).pop().json().get('_collections') + assert "CDS Hidden" not in collections_in_record assert "HAL Hidden" not in collections_in_record assert "Fermilab" not in collections_in_record @@ -1876,9 +1877,11 @@ def test_workflow_checks_affiliations_if_record_is_rejected_by_curator( wf.save() wf.continue_workflow(delayed=False) - collections_in_record = mocked_external_services.request_history[4].json()[ - "_collections" - ] + collections_in_record = filter( + lambda x: x.path == '/literature', + mocked_external_services.request_history + ).pop().json().get('_collections') + assert "CDS Hidden" in collections_in_record assert "HAL Hidden" in collections_in_record assert "Fermilab" in collections_in_record diff --git a/tests/integration/workflows/test_workflows_tasks_actions.py b/tests/integration/workflows/test_workflows_tasks_actions.py index b65816a755..56abdbc942 100644 --- a/tests/integration/workflows/test_workflows_tasks_actions.py +++ b/tests/integration/workflows/test_workflows_tasks_actions.py @@ -524,17 +524,13 @@ def test_refextract_from_pdf( assert validate(citing_record["acquisition_source"], subschema) is None with mock.patch.dict(workflow_app.config, extra_config): - workflow_id = build_workflow(citing_record).id - citing_doc_workflow_uuid = start("article", object_id=workflow_id) + with override_config(FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE=True): + workflow_id = build_workflow(citing_record).id + citing_doc_workflow_uuid = start("article", object_id=workflow_id) citing_doc_eng = WorkflowEngine.from_uuid(citing_doc_workflow_uuid) citing_doc_obj = citing_doc_eng.processed_objects[0] - - assert ( - citing_doc_obj.data["references"][7]["record"]["$ref"] - == "http://localhost:5000/api/literature/1000" - ) - assert citing_doc_obj.data["references"][0]["raw_refs"][0]["source"] == "arXiv" + assert len(citing_doc_obj.data["references"]) == 1 @mock.patch( @@ -565,7 +561,7 @@ def test_count_reference_coreness( mocked_package_download, mocked_arxiv_download, workflow_app, - mocked_external_services, + mocked_external_services ): cited_record_json = { "$schema": "http://localhost:5000/schemas/records/hep.json", @@ -609,8 +605,9 @@ def test_count_reference_coreness( assert validate(citing_record["acquisition_source"], subschema) is None with mock.patch.dict(workflow_app.config, extra_config): - workflow_id = build_workflow(citing_record).id - citing_doc_workflow_uuid = start("article", object_id=workflow_id) + with override_config(FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE=True): + workflow_id = build_workflow(citing_record).id + citing_doc_workflow_uuid = start("article", object_id=workflow_id) citing_doc_eng = WorkflowEngine.from_uuid(citing_doc_workflow_uuid) citing_doc_obj = citing_doc_eng.processed_objects[0] @@ -1387,112 +1384,49 @@ def test_refextract_when_document_type_is_xml( assert not obj.data.get("references") -def test_refextract_from_text_data(insert_hep_records_into_db, workflow_app): +def test_refextract_from_text_data(insert_hep_records_into_db, workflow_app, mocked_external_services): """TODO: record cassette and remove mock request""" with override_config(FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE=True): - with requests_mock.Mocker() as mock_request: - mock_request.register_uri( - "POST", - "{}/extract_references_from_text".format( - current_app.config["REFEXTRACT_SERVICE_URL"] - ), - json={ - "extracted_references": [ - { - "author": [ - "G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-González, S. L. Williamson" - ], - "journal_page": ["113"], - "journal_reference": ["JHEP,1904,113"], - "journal_title": ["JHEP"], - "journal_volume": ["1904"], - "journal_year": ["2019"], - "linemarker": ["67"], - "raw_ref": [ - "[67] G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-Gonz´ alez, S. L. Williamson, “LHC limits on gluinos and squarks in the minimal Dirac gaugino model”, JHEP 04, 113 (2019), arXiv:1812.09293." - ], - "reportnumber": ["arXiv:1812.09293"], - "title": [ - "LHC limits on gluinos and squarks in the minimal Dirac gaugino model" - ], - "year": ["2019"], - } - ] - }, - ) + schema = load_schema("hep") + subschema = schema["properties"]["acquisition_source"] - schema = load_schema("hep") - subschema = schema["properties"]["acquisition_source"] - - data = {"acquisition_source": {"source": "submitter"}} - extra_data = { - "formdata": { - "references": "M.R. Douglas, G.W. Moore, D-branes, quivers, and ALE instantons, arXiv:hep-th/9603167", - }, - } - assert validate(data["acquisition_source"], subschema) is None + data = {"acquisition_source": {"source": "submitter"}} + extra_data = { + "formdata": { + "references": "M.R. Douglas, G.W. Moore, D-branes, quivers, and ALE instantons, arXiv:hep-th/9603167", + }, + } + assert validate(data["acquisition_source"], subschema) is None - obj = workflow_object_class.create( - data=data, extra_data=extra_data, id_user=1, data_type="hep" - ) + obj = workflow_object_class.create( + data=data, extra_data=extra_data, id_user=1, data_type="hep" + ) - refextract(obj, None) is None - assert obj.data["references"][0]["raw_refs"][0]["source"] == "submitter" - assert "references" in obj.data + refextract(obj, None) is None + assert obj.data["references"][0]["raw_refs"][0]["source"] == "submitter" + assert "references" in obj.data -def test_refextract_from_url(insert_hep_records_into_db, workflow_app): +def test_refextract_from_url(insert_hep_records_into_db, workflow_app, mocked_external_services): """TODO: record cassette and remove mock request""" with override_config(FEATURE_FLAG_ENABLE_REFEXTRACT_SERVICE=True): - with requests_mock.Mocker() as mock_request: - mock_request.register_uri( - "POST", - "{}/extract_references_from_url".format( - current_app.config["REFEXTRACT_SERVICE_URL"] - ), - json={ - "extracted_references": [ - { - "author": ["G. Chalons, M. D. Goodsell, S. Kraml"], - "journal_page": ["113"], - "journal_reference": ["JHEP,1904,113"], - "journal_title": ["JHEP"], - "journal_volume": ["1904"], - "journal_year": ["2019"], - "linemarker": ["67"], - "misc": ["H. Reyes-González, S. L. Williamson"], - "raw_ref": [ - "[67] G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-González, S. L. Williamson, “LHC limits on gluinos and squarks in the minimal Dirac gaugino model”, JHEP 04, 113 (2019), arXiv:1812.09293." - ], - "reportnumber": ["arXiv:1812.09293"], - "title": [ - "LHC limits on gluinos and squarks in the minimal Dirac gaugino model" - ], - "year": ["2019"], - }, - ] - }, - headers={"content-type": "application/json"}, - status_code=200, - ) - - schema = load_schema("hep") - subschema = schema["properties"]["acquisition_source"] + schema = load_schema("hep") + subschema = schema["properties"]["acquisition_source"] - data = { - "documents": [ - {"url": "https://arxiv.org/pdf/2204.13950.pdf", "fulltext": True} - ], - "acquisition_source": {"source": "submitter"}, - } + data = { + "documents": [ + {"url": "https://arxiv.org/pdf/2204.13950.pdf", "fulltext": True} + ], + "acquisition_source": {"source": "submitter"}, + } - assert validate(data["acquisition_source"], subschema) is None + assert validate(data["acquisition_source"], subschema) is None - obj = workflow_object_class.create(data=data, id_user=1, data_type="hep") + obj = workflow_object_class.create(data=data, id_user=1, data_type="hep") - refextract(obj, None) is None - assert obj.data["references"][0]["raw_refs"][0]["source"] == "submitter" - assert "references" in obj.data + refextract(obj, None) is None + assert obj.data["references"][0]["raw_refs"][0]["source"] == "submitter" + assert "references" in obj.data def test_remove_inspire_categories_derived_from_core_arxiv_categories(workflow_app): diff --git a/tests/integration_async/test_workflows.py b/tests/integration_async/test_workflows.py index 38bd1f3535..09cc9f4c9f 100644 --- a/tests/integration_async/test_workflows.py +++ b/tests/integration_async/test_workflows.py @@ -324,6 +324,28 @@ def test_wf_replaces_old_workflow_which_is_in_halted_state( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + request_mocker.register_uri( + "POST", + "http://web:8000/api/matcher/linked_references/", + json={ + "references": [ + { + "record": { + "$ref": "http://localhost:5000/api/literature/1000", + }, + "raw_refs": [ + { + "source": "submitter", + "schema": "That's a schema", + "value": "That's a reference", + } + ], + } + ] + }, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) workflow = build_workflow(record) @@ -383,6 +405,28 @@ def test_wf_rejects_automatically_when_previous_matched_wf_was_rejected( headers=_get_headers_for_hep_root_table_request(), status_code=200, ) + request_mocker.register_uri( + "POST", + "http://web:8000/api/matcher/linked_references/", + json={ + "references": [ + { + "record": { + "$ref": "http://localhost:5000/api/literature/1000", + }, + "raw_refs": [ + { + "source": "submitter", + "schema": "That's a schema", + "value": "That's a reference", + } + ], + } + ] + }, + headers=_get_headers_for_hep_root_table_request(), + status_code=200, + ) workflow = build_workflow(record) diff --git a/tests/unit/workflows/test_workflows_tasks_actions.py b/tests/unit/workflows/test_workflows_tasks_actions.py index 6ade796b57..cb173dcf42 100644 --- a/tests/unit/workflows/test_workflows_tasks_actions.py +++ b/tests/unit/workflows/test_workflows_tasks_actions.py @@ -1181,25 +1181,63 @@ def test_validate_record_raises_when_record_is_invalid(): def test_refextract_from_text(mock_match, mock_get_document_in_workflow, mock_create_journal_kb_dict): """TODO: Make this an integration test and also test reference matching.""" - mock_get_document_in_workflow.return_value.__enter__.return_value = None - mock_get_document_in_workflow.return_value.__exit__.return_value = None + with requests_mock.Mocker() as mock_request: + mock_request.register_uri( + "POST", + "{}/extract_references_from_text".format( + current_app.config["REFEXTRACT_SERVICE_URL"] + ), + json={ + "extracted_references": [ + { + "author": ["G. Chalons, M. D. Goodsell, S. Kraml"], + "journal_page": ["113"], + "journal_reference": ["JHEP,1904,113"], + "journal_title": ["JHEP"], + "journal_volume": ["1904"], + "journal_year": ["2019"], + "linemarker": ["67"], + "misc": ["H. Reyes-González, S. L. Williamson"], + "raw_ref": [ + "[67] G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-González, S. L. Williamson, “LHC limits on gluinos and squarks in the minimal Dirac gaugino model”, JHEP 04, 113 (2019), arXiv:1812.09293." + ], + "reportnumber": ["arXiv:1812.09293"], + "title": [ + "LHC limits on gluinos and squarks in the minimal Dirac gaugino model" + ], + "year": ["2019"], + }, + ] + }, + headers={"content-type": "application/json"}, + status_code=200, + ) + mock_request.register_uri( + "POST", + "http://web:8000/api/matcher/linked_references/", + json={"references": [{"raw_refs": [{"source": "submitter"}]}]}, + status_code=200, + ) - schema = load_schema('hep') - subschema = schema['properties']['acquisition_source'] + mock_get_document_in_workflow.return_value.__enter__.return_value = None + mock_get_document_in_workflow.return_value.__exit__.return_value = None - data = {'acquisition_source': {'source': 'submitter'}} - extra_data = { - 'formdata': { - 'references': 'M.R. Douglas, G.W. Moore, D-branes, quivers, and ALE instantons, arXiv:hep-th/9603167', - }, - } - assert validate(data['acquisition_source'], subschema) is None + schema = load_schema('hep') + subschema = schema['properties']['acquisition_source'] - obj = MockObj(data, extra_data) - eng = MockEng() + data = {'acquisition_source': {'source': 'submitter'}} + extra_data = { + 'formdata': { + 'references': 'M.R. Douglas, G.W. Moore, D-branes, quivers, and ALE instantons, arXiv:hep-th/9603167', + }, + } + assert validate(data['acquisition_source'], subschema) is None + + obj = MockObj(data, extra_data) + eng = MockEng() - assert refextract(obj, eng) is None - assert obj.data['references'][0]['raw_refs'][0]['source'] == 'submitter' + assert refextract(obj, eng) is None + assert obj.data['references'][0]['raw_refs'][0]['source'] == 'submitter' @patch('inspirehep.modules.workflows.tasks.actions.create_journal_kb_dict', return_value={}) @@ -1229,9 +1267,60 @@ def test_refextract_from_raw_refs(mock_create_journal_dict, mock_match): obj = MockObj(data, {}) eng = MockEng() + with requests_mock.Mocker() as mock_request: + mock_request.register_uri( + "POST", + "{}/extract_references_from_text".format( + current_app.config["REFEXTRACT_SERVICE_URL"] + ), + json={ + "extracted_references": [ + { + "author": ["G. Chalons, M. D. Goodsell, S. Kraml"], + "journal_page": ["113"], + "journal_reference": ["JHEP,1904,113"], + "journal_title": ["JHEP"], + "journal_volume": ["1904"], + "journal_year": ["2019"], + "linemarker": ["67"], + "misc": ["H. Reyes-González, S. L. Williamson"], + "raw_ref": [ + "[67] G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-González, S. L. Williamson, “LHC limits on gluinos and squarks in the minimal Dirac gaugino model”, JHEP 04, 113 (2019), arXiv:1812.09293." + ], + "reportnumber": ["arXiv:1812.09293"], + "title": [ + "LHC limits on gluinos and squarks in the minimal Dirac gaugino model" + ], + "year": ["2019"], + }, + ] + }, + headers={"content-type": "application/json"}, + status_code=200, + ) + mock_request.register_uri( + "POST", + "http://web:8000/api/matcher/linked_references/", + json={ + "references": [ + { + "reference": { + "publication_info": { + "artid": "045", + "journal_title": "JHEP", + "journal_volume": "06", + "page_start": "045", + "year": 2007, + } + } + } + ] + }, + status_code=200, + ) - assert refextract(obj, eng) is None - assert 'reference' in obj.data['references'][0] + assert refextract(obj, eng) is None + assert 'reference' in obj.data['references'][0] @patch('inspirehep.modules.workflows.tasks.actions.create_journal_kb_dict', return_value={}) @@ -1240,9 +1329,6 @@ def test_refextract_from_raw_refs(mock_create_journal_dict, mock_match): return_value=iter([]) ) def test_refextract_valid_refs_from_raw_refs(mock_create_journal_dict, mock_match): - schema = load_schema('hep') - subschema = schema['properties']['references'] - data = { 'references': [ { @@ -1263,42 +1349,110 @@ def test_refextract_valid_refs_from_raw_refs(mock_create_journal_dict, mock_matc } obj = MockObj(data, {}) eng = MockEng() + with requests_mock.Mocker() as mock_request: + mock_request.register_uri( + "POST", + "{}/extract_references_from_text".format( + current_app.config["REFEXTRACT_SERVICE_URL"] + ), + json={ + "extracted_references": [ + { + "author": ["G. Chalons, M. D. Goodsell, S. Kraml"], + "journal_page": ["113"], + "journal_reference": ["JHEP,1904,113"], + "journal_title": ["JHEP"], + "journal_volume": ["1904"], + "journal_year": ["2019"], + "linemarker": ["67"], + "misc": ["H. Reyes-González, S. L. Williamson"], + "raw_ref": [ + "[67] G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-González, S. L. Williamson, “LHC limits on gluinos and squarks in the minimal Dirac gaugino model”, JHEP 04, 113 (2019), arXiv:1812.09293." + ], + "reportnumber": ["arXiv:1812.09293"], + "title": [ + "LHC limits on gluinos and squarks in the minimal Dirac gaugino model" + ], + "year": ["2019"], + }, + ] + }, + headers={"content-type": "application/json"}, + status_code=200, + ) + mock_request.register_uri( + "POST", + "http://web:8000/api/matcher/linked_references/", + json={"references": [{"raw_refs": [{"source": "submitter"}]}]}, + status_code=200, + ) - assert refextract(obj, eng) is None - assert len(obj.data['references']) == 1 - assert validate(obj.data['references'], subschema) is None + assert refextract(obj, eng) is None + assert len(obj.data['references']) == 1 @patch('inspirehep.modules.workflows.tasks.actions.create_journal_kb_dict', return_value={}) @patch('inspirehep.modules.workflows.tasks.actions.get_document_in_workflow') -@patch( - 'inspirehep.modules.refextract.matcher.match', - return_value=iter([]) -) -def test_refextract_valid_refs_from_text(mock_match, mock_get_document_in_workflow, mock_create_journal_kb_dict): +def test_refextract_valid_refs_from_text(mock_get_document_in_workflow, mock_create_journal_kb_dict): """TODO: Make this an integration test and also test reference matching.""" mock_get_document_in_workflow.return_value.__enter__.return_value = None mock_get_document_in_workflow.return_value.__exit__.return_value = None - schema = load_schema('hep') - refs_subschema = schema['properties']['references'] - acquisition_source_subschema = schema['properties']['acquisition_source'] + with requests_mock.Mocker() as mock_request: + mock_request.register_uri( + "POST", + "{}/extract_references_from_text".format( + current_app.config["REFEXTRACT_SERVICE_URL"] + ), + json={ + "extracted_references": [ + { + "author": ["G. Chalons, M. D. Goodsell, S. Kraml"], + "journal_page": ["113"], + "journal_reference": ["JHEP,1904,113"], + "journal_title": ["JHEP"], + "journal_volume": ["1904"], + "journal_year": ["2019"], + "linemarker": ["67"], + "misc": ["H. Reyes-González, S. L. Williamson"], + "raw_ref": [ + "[67] G. Chalons, M. D. Goodsell, S. Kraml, H. Reyes-González, S. L. Williamson, “LHC limits on gluinos and squarks in the minimal Dirac gaugino model”, JHEP 04, 113 (2019), arXiv:1812.09293." + ], + "reportnumber": ["arXiv:1812.09293"], + "title": [ + "LHC limits on gluinos and squarks in the minimal Dirac gaugino model" + ], + "year": ["2019"], + }, + ] + }, + headers={"content-type": "application/json"}, + status_code=200, + ) + mock_request.register_uri( + "POST", + "http://web:8000/api/matcher/linked_references/", + json={"references": [{"raw_refs": [{"source": "submitter", "value": "M.R"}]}]}, + status_code=200, + ) - data = {'acquisition_source': {'source': 'submitter'}} - extra_data = { - 'formdata': { - 'references': 'M.R. Douglas, G.W. Moore, D-branes, quivers, and ALE instantons, arXiv:hep-th/9603167\nM.R. Douglas, G.W. Moore, D-branes, quivers, and ALE instantons, arXiv:hep-th/9603167', - }, - } - assert validate(data['acquisition_source'], acquisition_source_subschema) is None + schema = load_schema('hep') + acquisition_source_subschema = schema['properties']['acquisition_source'] - obj = MockObj(data, extra_data) - eng = MockEng() + data = {'acquisition_source': {'source': 'submitter'}} + extra_data = { + 'formdata': { + 'references': 'M.R. Douglas, G.W. Moore, D-branes, quivers, and ALE instantons, arXiv:hep-th/9603167\nM.R. Douglas, G.W. Moore, D-branes, quivers, and ALE instantons, arXiv:hep-th/9603167', + }, + } + assert validate(data['acquisition_source'], acquisition_source_subschema) is None + + obj = MockObj(data, extra_data) + eng = MockEng() - assert refextract(obj, eng) is None - assert len(obj.data['references']) == 1 - assert validate(obj.data['references'], refs_subschema) is None + assert refextract(obj, eng) is None + assert len(obj.data['references']) == 1 def test_url_is_correctly_escaped():