From 55e63692381fb75ca16599255288e8030889073b Mon Sep 17 00:00:00 2001
From: PascalEgn
Date: Fri, 13 Sep 2024 13:38:07 +0200
Subject: [PATCH] workflows: add ger curation step

---
Review notes (free text between the "---" separator and the first diff
header; it is not part of the commit message):

* This copy of the patch had lost its line structure. Line breaks and
  indentation below were reconstructed; every hunk was re-counted against
  its "@@" header and the diffstat. The indentation of unchanged context
  lines (notably the nested IF(...) blocks in article.py) is inferred --
  TODO confirm against the target tree, or apply with
  "git apply --ignore-whitespace".
* The author e-mail address is absent from the "From:" header of this copy.
* The patch registers a "GER_curation" queue label ("German curation") and
  adds two predicates to tasks/actions.py:
  - check_if_germany_in_fulltext searches the fulltext for "Germany" or
    "Deutschland" as whole words, case-insensitively, and returns the
    regex match (or None when there is no fulltext or no match).
  - check_if_germany_in_raw_affiliations does a case-insensitive substring
    test on each raw affiliation value and returns True on the first hit
    (None otherwise), so unlike the fulltext check it is not limited to
    whole words.
* Both new workflow steps create a ticket in the GER_curation queue with
  the curation_core.html template and store its id under the same
  'curation_ticket_id' key that the HAL_curation step uses.
  NOTE(review): presumably a later ticket overwrites the stored id when
  more than one condition matches -- confirm this is intended.
* The two new fulltext tests assign "new_config" twice in a row; the
  duplicate assignment is redundant but harmless.

 inspirehep/config.py                          |  1 +
 inspirehep/modules/workflows/tasks/actions.py | 15 +++
 .../modules/workflows/workflows/article.py    | 38 ++++++--
 .../workflows/test_article_workflow.py        |  5 +
 .../unit/workflows/test_workflows_actions.py  | 91 ++++++++++++++++++-
 5 files changed, 141 insertions(+), 9 deletions(-)

diff --git a/inspirehep/config.py b/inspirehep/config.py
index 23d9fb3f5a..6d6cab7726 100644
--- a/inspirehep/config.py
+++ b/inspirehep/config.py
@@ -1707,6 +1707,7 @@
     "HEP_add_user": "Literature submissions",
     "HAL_curation": "HAL curation",
     "UK_curation": "UK curation",
+    "GER_curation": "German curation",
     "HEP_curation": "arXiv curation",
     "HEP_curation_jlab": "arXiv curation",
     "HEP_publishing": "Publisher curation",
diff --git a/inspirehep/modules/workflows/tasks/actions.py b/inspirehep/modules/workflows/tasks/actions.py
index 386558dead..8042aad0ed 100644
--- a/inspirehep/modules/workflows/tasks/actions.py
+++ b/inspirehep/modules/workflows/tasks/actions.py
@@ -1131,6 +1131,21 @@ def check_if_france_in_raw_affiliations(obj, eng):
             return True
 
 
+def check_if_germany_in_fulltext(obj, eng):
+    fulltext = get_fulltext(obj)
+    if not fulltext:
+        return
+    regex = re.compile(r"\b(Germany|Deutschland)\b", re.UNICODE | re.IGNORECASE)
+    return regex.search(fulltext)
+
+
+def check_if_germany_in_raw_affiliations(obj, eng):
+    raw_affs = get_value(obj.data, 'authors.raw_affiliations.value', [])
+    for aff in chain.from_iterable(raw_affs):
+        if "germany" in aff.lower() or "deutschland" in aff.lower():
+            return True
+
+
 def check_if_core_and_uk_in_fulltext(obj, eng):
     fulltext = get_fulltext(obj)
     if not fulltext or not is_core(obj, eng):
diff --git a/inspirehep/modules/workflows/workflows/article.py b/inspirehep/modules/workflows/workflows/article.py
index d71bccac7c..650b0afd8b 100644
--- a/inspirehep/modules/workflows/workflows/article.py
+++ b/inspirehep/modules/workflows/workflows/article.py
@@ -76,6 +76,8 @@
     create_core_selection_wf,
     check_if_france_in_fulltext,
     check_if_france_in_raw_affiliations,
+    check_if_germany_in_fulltext,
+    check_if_germany_in_raw_affiliations,
     link_institutions_with_affiliations,
     check_if_core_and_uk_in_fulltext
 )
@@ -271,6 +273,15 @@
                         ticket_id_key='curation_ticket_id',
                     ),
                 ),
+                IF(
+                    check_if_germany_in_fulltext,
+                    create_ticket(
+                        template='literaturesuggest/tickets/curation_core.html',
+                        queue='GER_curation',
+                        context_factory=curation_ticket_context,
+                        ticket_id_key='curation_ticket_id',
+                    ),
+                ),
                 IF(
                     check_if_core_and_uk_in_fulltext,
                     create_ticket(
@@ -281,15 +292,26 @@
                     ),
                 )
             ],
-            IF(
-                check_if_france_in_raw_affiliations,
-                create_ticket(
-                    template='literaturesuggest/tickets/curation_core.html',
-                    queue='HAL_curation',
-                    context_factory=curation_ticket_context,
-                    ticket_id_key='curation_ticket_id',
+            [
+                IF(
+                    check_if_france_in_raw_affiliations,
+                    create_ticket(
+                        template='literaturesuggest/tickets/curation_core.html',
+                        queue='HAL_curation',
+                        context_factory=curation_ticket_context,
+                        ticket_id_key='curation_ticket_id',
+                    ),
                 ),
-            )
+                IF(
+                    check_if_germany_in_raw_affiliations,
+                    create_ticket(
+                        template='literaturesuggest/tickets/curation_core.html',
+                        queue='GER_curation',
+                        context_factory=curation_ticket_context,
+                        ticket_id_key='curation_ticket_id',
+                    ),
+                )
+            ]
         )
     ),
     IF_NOT(
diff --git a/tests/integration/workflows/test_article_workflow.py b/tests/integration/workflows/test_article_workflow.py
index d39423b098..bceaaabbfb 100644
--- a/tests/integration/workflows/test_article_workflow.py
+++ b/tests/integration/workflows/test_article_workflow.py
@@ -152,6 +152,10 @@ def test_create_ticket_when_source_is_not_publishing(
     "inspirehep.modules.workflows.tasks.actions.check_if_france_in_fulltext",
     return_value=False,
 )
+@mock.patch(
+    "inspirehep.modules.workflows.tasks.actions.check_if_germany_in_fulltext",
+    return_value=False,
+)
 @mock.patch(
     "inspirehep.modules.workflows.tasks.actions.check_if_core_and_uk_in_fulltext",
     return_value=False,
@@ -161,6 +165,7 @@ def test_set_fermilab_collection_from_report_number(
     mocked_api_request_magpie,
     mocked_api_request_classifier,
     mocked_robotupload,
+    mocked_check_if_germany_in_fulltext,
     mocked_check_if_core_and_uk_in_fulltext,
     mocked_external_services,
     workflow_app,
diff --git a/tests/unit/workflows/test_workflows_actions.py b/tests/unit/workflows/test_workflows_actions.py
index c9a268b7b8..0e090def9f 100644
--- a/tests/unit/workflows/test_workflows_actions.py
+++ b/tests/unit/workflows/test_workflows_actions.py
@@ -38,7 +38,8 @@
 
 from inspirehep.modules.workflows.tasks.actions import jlab_ticket_needed, load_from_source_data, \
     extract_authors_from_pdf, is_suitable_for_pdf_authors_extraction, is_fermilab_report, add_collection, \
-    check_if_france_in_fulltext, check_if_france_in_raw_affiliations, check_if_core_and_uk_in_fulltext
+    check_if_france_in_fulltext, check_if_france_in_raw_affiliations, check_if_germany_in_fulltext, \
+    check_if_germany_in_raw_affiliations, check_if_core_and_uk_in_fulltext
 
 
 def test_match_approval_gets_match_recid():
@@ -581,6 +582,94 @@ def test_check_if_france_in_fulltext_when_france_in_text_body(mocked_get_documen
     assert france_in_fulltext
 
 
+def test_check_if_germany_in_affiliations(app):
+    obj = MagicMock()
+    obj.data = {
+        'authors': [
+            {"full_name": "author 1",
+             "raw_affiliations": [{"value": "Laboratoire de Physique des 2 Infinis Irene Joliot-Curie (IJCLab), CNRS, Université Paris-Saclay, Orsay, 91405, Germany"}]
+
+             }
+        ]
+    }
+
+    obj.extra_data = {}
+    eng = None
+    result = check_if_germany_in_raw_affiliations(obj, eng)
+    assert result
+
+
+def test_check_if_deutschland_in_affiliations(app):
+    obj = MagicMock()
+    obj.data = {
+        'authors': [
+            {"full_name": "author 1",
+             "raw_affiliations": [{"value": "Laboratoire de Physique des 2 Infinis Irene Joliot-Curie (IJCLab), CNRS, Université Paris-Saclay, Orsay, 91405, Deutschland"}]
+
+             }
+        ]
+    }
+
+    obj.extra_data = {}
+    eng = None
+    result = check_if_germany_in_raw_affiliations(obj, eng)
+    assert result
+
+
+@patch("inspirehep.modules.workflows.tasks.actions.get_document_in_workflow")
+def test_check_if_germany_in_fulltext_when_germany_in_text_body(mocked_get_document, app):
+    fake_grobid_response = "Germany"
+    obj = MagicMock()
+    obj.data = {
+        'core': False
+    }
+    obj.extra_data = {}
+    eng = None
+    new_config = {"GROBID_URL": "http://grobid_url.local"}
+
+    new_config = {"GROBID_URL": "http://grobid_url.local"}
+    with patch.dict(current_app.config, new_config):
+        with requests_mock.Mocker() as requests_mocker:
+            requests_mocker.register_uri(
+                'POST', 'http://grobid_url.local/api/processFulltextDocument',
+                text=fake_grobid_response,
+                headers={'content-type': 'application/xml'},
+                status_code=200,
+            )
+            with tempfile.NamedTemporaryFile() as tmp_file:
+                mocked_get_document.return_value.__enter__.return_value = tmp_file.name
+                germany_in_fulltext = check_if_germany_in_fulltext(obj, eng)
+
+    assert germany_in_fulltext
+
+
+@patch("inspirehep.modules.workflows.tasks.actions.get_document_in_workflow")
+def test_check_if_germany_in_fulltext_when_deutschland_in_text_body(mocked_get_document, app):
+    fake_grobid_response = "Deutschland"
+    obj = MagicMock()
+    obj.data = {
+        'core': False
+    }
+    obj.extra_data = {}
+    eng = None
+    new_config = {"GROBID_URL": "http://grobid_url.local"}
+
+    new_config = {"GROBID_URL": "http://grobid_url.local"}
+    with patch.dict(current_app.config, new_config):
+        with requests_mock.Mocker() as requests_mocker:
+            requests_mocker.register_uri(
+                'POST', 'http://grobid_url.local/api/processFulltextDocument',
+                text=fake_grobid_response,
+                headers={'content-type': 'application/xml'},
+                status_code=200,
+            )
+            with tempfile.NamedTemporaryFile() as tmp_file:
+                mocked_get_document.return_value.__enter__.return_value = tmp_file.name
+                germany_in_fulltext = check_if_germany_in_fulltext(obj, eng)
+
+    assert germany_in_fulltext
+
+
 @patch("inspirehep.modules.workflows.tasks.actions.get_document_in_workflow")
 def test_check_if_uk_in_fulltext_not_core(mocked_get_document, app):
     fake_grobid_response = "England"