From 6e9e4f2fbf46b64a170be14355f82232f6cb93dd Mon Sep 17 00:00:00 2001 From: eliarizzetto Date: Tue, 13 Feb 2024 17:06:26 +0100 Subject: [PATCH 1/3] add Identifier Manager for OpenAlex The oc_idmanager.openalex.py module contains the Identifier Manager for OpenAlex Work IDs and Source IDs. --- oc_ds_converter/oc_idmanager/__init__.py | 2 + oc_ds_converter/oc_idmanager/openalex.py | 135 +++++++++++++++++++++++ 2 files changed, 137 insertions(+) create mode 100644 oc_ds_converter/oc_idmanager/openalex.py diff --git a/oc_ds_converter/oc_idmanager/__init__.py b/oc_ds_converter/oc_idmanager/__init__.py index 628cec5..4c381c9 100644 --- a/oc_ds_converter/oc_idmanager/__init__.py +++ b/oc_ds_converter/oc_idmanager/__init__.py @@ -27,3 +27,5 @@ from oc_ds_converter.oc_idmanager.viaf import ViafManager from oc_ds_converter.oc_idmanager.wikidata import WikidataManager from oc_ds_converter.oc_idmanager.wikipedia import WikipediaManager +from oc_ds_converter.oc_idmanager.openalex import OpenAlexManager + diff --git a/oc_ds_converter/oc_idmanager/openalex.py b/oc_ds_converter/oc_idmanager/openalex.py new file mode 100644 index 0000000..a287d26 --- /dev/null +++ b/oc_ds_converter/oc_idmanager/openalex.py @@ -0,0 +1,135 @@ +#!python +# Copyright 2019, Silvio Peroni +# Copyright 2022, Giuseppe Grieco , Arianna Moretti , Elia Rizzetto , Arcangelo Massari +# +# Permission to use, copy, modify, and/or distribute this software for any purpose +# with or without fee is hereby granted, provided that the above copyright notice +# and this permission notice appear in all copies. +# +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH +# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND +# FITNESS. 
# IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
# SOFTWARE.

from oc_ds_converter.oc_idmanager.base import IdentifierManager
from re import sub, match
from requests import ReadTimeout, get
from requests.exceptions import ConnectionError
from json import loads
from time import sleep


class OpenAlexManager(IdentifierManager):
    """Identifier manager for OpenAlex Work (``W…``) and Source (``S…``) IDs.

    An ID is considered syntactically correct when, once normalised, it is the
    letter ``W`` or ``S`` followed by a positive integer without leading zeros.
    Validation results are cached in the ``data`` dictionary, keyed by the
    prefixed ID (``openalex:W123``), each value being a dict with at least a
    ``"valid"`` entry.
    """

    def __init__(self, data=None, use_api_service=True):
        """OpenAlex manager constructor.

        Args:
            data: optional dictionary used as validation cache. It is used by
                reference, so the caller sees the entries added here. When
                omitted, every instance gets its own empty cache (a shared
                ``{}`` default would leak results between instances).
            use_api_service: when False, ``exists`` never contacts the OpenAlex
                API and optimistically reports every ID as existing.
        """
        super(OpenAlexManager, self).__init__()
        self._api = "https://api.openalex.org/"
        self._use_api_service = use_api_service
        self._p = "openalex:"
        self._url_id_pref = "https://openalex.org/"
        self._data = {} if data is None else data
        self._headers = {
            "User-Agent": "Identifier Manager / OpenCitations Indexes "
            "(http://opencitations.net; mailto:contact@opencitations.net)"
        }

    def is_valid(self, oal_id, get_extra_info=False):
        """Tell whether ``oal_id`` is a well-formed, existing OpenAlex ID.

        Args:
            oal_id: the identifier, in any form accepted by ``normalise``.
            get_extra_info: when True, return ``(valid, info_dict)`` instead of
                the bare boolean.

        Returns:
            ``bool``, or ``(bool, dict)`` if ``get_extra_info`` is True.
            NOTE(review): when the ID cannot be normalised at all, a bare
            ``False`` is returned even if ``get_extra_info`` is True; this is
            the historical behaviour and is kept for backward compatibility.
        """
        oal_id = self.normalise(oal_id, include_prefix=True)
        if oal_id is None:
            return False

        cached = self._data.get(oal_id)
        if cached is not None:
            valid = cached.get("valid")
            return (valid, cached) if get_extra_info else valid

        # Check the syntax first: it is free, whereas ``exists`` may cost up to
        # three HTTP requests. It also guarantees that the cached entry agrees
        # with the value returned here (a malformed ID is never cached as
        # valid, not even when the API service is disabled).
        if not self.syntax_ok(oal_id):
            info = {"valid": False}
            self._data[oal_id] = info
            return (False, info) if get_extra_info else False

        if get_extra_info:
            valid, info = self.exists(oal_id, get_extra_info=True)
            self._data[oal_id] = info
            return valid, info

        valid = bool(self.exists(oal_id))
        self._data[oal_id] = {"valid": valid}
        return valid

    @staticmethod
    def _strip_leading(text, prefixes):
        """Remove the first of ``prefixes`` that ``text`` starts with (case-insensitively)."""
        for prefix in prefixes:
            if text[:len(prefix)].lower() == prefix.lower():
                return text[len(prefix):]
        return text

    def normalise(self, id_string, include_prefix=False):
        """Reduce an OpenAlex ID, CURIE or URL to its canonical ``W123``/``S123`` form.

        Accepted inputs include the bare ID, the ``openalex:`` prefixed form,
        ``https://openalex.org/<id>`` and ``https://api.openalex.org/<id>``
        URLs, optionally with a ``works/`` or ``sources/`` route segment.
        Whitespace anywhere in the input is ignored.

        Args:
            id_string: the identifier to normalise.
            include_prefix: when True, prepend ``openalex:`` to the result.

        Returns:
            The normalised string, or None if ``id_string`` is not a string.
            The result is NOT guaranteed to be syntactically valid; use
            ``syntax_ok`` for that.
        """
        try:
            # Drop all whitespace up front so that the prefixes below are
            # recognised even in padded or spaced-out input.
            candidate = "".join(id_string.split())
        except (AttributeError, TypeError):
            # Non-string input (None, numbers, bytes, ...) cannot be an ID.
            return None

        candidate = self._strip_leading(candidate, (self._p,))
        candidate = self._strip_leading(candidate, (self._api, self._url_id_pref))
        # The route segment must go BEFORE the character filter below:
        # otherwise the "W"/"S" letters of "works/" and "sources/" survive the
        # filter and corrupt the ID (e.g. "sources/S42" would become "SSS42").
        candidate = self._strip_leading(candidate, ("works/", "sources/"))

        # Keep only the characters an OpenAlex Work/Source ID can contain;
        # this also removes NUL characters and any other separator.
        candidate = sub("[^WS0-9]", "", candidate.upper())
        return "%s%s" % (self._p if include_prefix else "", candidate)

    def syntax_ok(self, id_string):
        """Return True if ``id_string`` (with or without prefix) is a well-formed ID."""
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        return bool(match(r"^openalex:[WS][1-9]\d*$", id_string))

    def exists(self, openalex_id_full, get_extra_info=False, allow_extra_api=None):
        """Check on the OpenAlex API whether the given ID identifies an entity.

        The request is attempted up to three times. HTTP 429 (rate limit) and
        connection errors are followed by a pause before the next attempt;
        any other 4xx answer is a definitive "does not exist". If no attempt
        gives a definitive answer the ID is reported as not existing.

        Args:
            openalex_id_full: the identifier, in any form accepted by
                ``normalise``.
            get_extra_info: when True, return ``(exists, {"valid": exists})``
                instead of the bare boolean.
            allow_extra_api: unused; present for signature compatibility.

        Returns:
            ``bool``, or ``(bool, dict)`` if ``get_extra_info`` is True. When
            the API service is disabled the answer is always True.
        """
        def outcome(found):
            return (found, {"valid": found}) if get_extra_info else found

        if not self._use_api_service:
            return outcome(True)

        oal_id = self.normalise(openalex_id_full)
        if oal_id is None:
            return outcome(False)

        for _ in range(3):
            try:
                r = get(self._api + oal_id, headers=self._headers, timeout=30)
            except ReadTimeout:
                # Do nothing, just try again
                continue
            except ConnectionError:
                # Sleep 5 seconds, then try again
                sleep(5)
                continue

            if r.status_code == 200:
                r.encoding = "utf-8"
                json_res = loads(r.text)
                # The entity exists only if the API answers with the very same
                # ID that was requested; a payload without "id" means "no".
                returned_id = json_res.get("id") if isinstance(json_res, dict) else None
                return outcome(returned_id == self._url_id_pref + oal_id)
            if r.status_code == 429:
                sleep(1)  # only handles per-second rate limits (not per-day rate limits)
            elif 400 <= r.status_code < 500:
                return outcome(False)
            # Any other status (e.g. 5xx) is treated as transient: retry.

        return outcome(False)

    def extra_info(self, api_response, choose_api=None, info_dict=None):
        """Return the extra information extracted from an API response.

        Not implemented yet: the arguments are ignored and a fresh
        ``{"valid": True}`` dictionary is returned.
        """
        # to be implemented
        return {"valid": True}
From 6c1d97f6768d025f421a2fb2355cae02bc8eb267 Mon Sep 17 00:00:00 2001 From: eliarizzetto Date: Wed, 6 Mar 2024 15:58:19 +0100 Subject: [PATCH 2/3] Update openalex.py added support for storage manager to OpenAlexManager class and improved OpenAlexManager.normalise() method. --- oc_ds_converter/oc_idmanager/openalex.py | 56 +++++++++++++++--------- 1 file changed, 36 insertions(+), 20 deletions(-) diff --git a/oc_ds_converter/oc_idmanager/openalex.py b/oc_ds_converter/oc_idmanager/openalex.py index a287d26..8aa759e 100644 --- a/oc_ds_converter/oc_idmanager/openalex.py +++ b/oc_ds_converter/oc_idmanager/openalex.py @@ -15,24 +15,32 @@ # SOFTWARE. from oc_ds_converter.oc_idmanager.base import IdentifierManager +from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager +from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager from re import sub, match from requests import ReadTimeout, get from requests.exceptions import ConnectionError from json import loads from time import sleep +from typing import Optional class OpenAlexManager(IdentifierManager): """This class implements an identifier manager for openalex identifier""" - def __init__(self, data={}, use_api_service=True): + def __init__(self, use_api_service=True, storage_manager: Optional[StorageManager] = None): """OpenAlex manager constructor.""" super(OpenAlexManager, self).__init__() + if storage_manager is None: + self.storage_manager = InMemoryStorageManager() + else: + self.storage_manager = storage_manager self._api = "https://api.openalex.org/" + self._api_works_route = r"https://api.openalex.org/works/" + self._api_sources_route = r"https://api.openalex.org/sources/" self._use_api_service = use_api_service self._p = "openalex:" self._url_id_pref = "https://openalex.org/" - self._data = data self._headers = { "User-Agent": "Identifier Manager / OpenCitations Indexes " "(http://opencitations.net; mailto:contact@opencitations.net)" @@ 
-44,17 +52,18 @@ def is_valid(self, oal_id, get_extra_info=False): if oal_id is None: return False else: - if oal_id not in self._data or self._data[oal_id] is None: + id_validation_value = self.storage_manager.get_value(oal_id) + if isinstance(id_validation_value, bool): + return id_validation_value + else: if get_extra_info: info = self.exists(oal_id, get_extra_info=True) - self._data[oal_id] = info[1] + self.storage_manager.set_full_value(oal_id,info[1]) return (info[0] and self.syntax_ok(oal_id)), info[1] - self._data[oal_id] = dict() - self._data[oal_id]["valid"] = True if (self.exists(oal_id) and self.syntax_ok(oal_id)) else False - return self._data[oal_id].get("valid") - if get_extra_info: - return self._data[oal_id].get("valid"), self._data[oal_id] - return self._data[oal_id].get("valid") + validity_check = self.exists(oal_id) and self.syntax_ok(oal_id) + self.storage_manager.set_value(oal_id, validity_check) + + return validity_check def normalise(self, id_string, include_prefix=False): try: @@ -63,9 +72,14 @@ def normalise(self, id_string, include_prefix=False): else: oal_string = id_string - oal_string = sub(self._url_id_pref, '', oal_string) - oal_string = sub(self._api, '', oal_string) - oal_string = sub("\0+", "", sub("[^WS0-9]", "", oal_string.upper())) + oal_string = sub("\0+", "", (sub("\s+", "", oal_string))) + + oal_string = oal_string.removeprefix(self._api_works_route) + oal_string = oal_string.removeprefix(self._api_sources_route) + oal_string = oal_string.removeprefix(self._api) + oal_string = oal_string.removeprefix(self._url_id_pref) + + oal_string = oal_string.upper() return "%s%s" % ( self._p if include_prefix else "", oal_string.strip(), @@ -82,9 +96,11 @@ def syntax_ok(self, id_string): def exists(self, openalex_id_full, get_extra_info=False, allow_extra_api=None): valid_bool = True + openalex_id_full = self._p + openalex_id_full if not openalex_id_full.startswith(self._p) else openalex_id_full if self._use_api_service: - oal_id = 
self.normalise(openalex_id_full) - if oal_id is not None: + oal_id = self.normalise(openalex_id_full) # returns None or unprefixed ID (include_prefix is set to False) + pref_oalid = self._p + oal_id if oal_id else None + if pref_oalid is not None: tentative = 3 while tentative: tentative -= 1 @@ -94,13 +110,13 @@ def exists(self, openalex_id_full, get_extra_info=False, allow_extra_api=None): r.encoding = "utf-8" json_res = loads(r.text) if get_extra_info: - extra_info_result = {} + extra_info_result = {'id': pref_oalid} try: result = True if json_res['id'] == (self._url_id_pref + oal_id) else False extra_info_result['valid'] = result return result, extra_info_result except KeyError: - extra_info_result["valid"] = False + extra_info_result['valid'] = False return False, extra_info_result try: return True if json_res['id'] == (self._url_id_pref + oal_id) else False @@ -110,7 +126,7 @@ def exists(self, openalex_id_full, get_extra_info=False, allow_extra_api=None): sleep(1) # only handles per-second rate limits (not per-day rate limits) elif 400 <= r.status_code < 500: if get_extra_info: - return False, {"valid": False} + return False, {'id': pref_oalid, 'valid': False} return False except ReadTimeout: # Do nothing, just try again @@ -121,11 +137,11 @@ def exists(self, openalex_id_full, get_extra_info=False, allow_extra_api=None): valid_bool = False else: if get_extra_info: - return False, {"valid": False} + return False, {'id': pref_oalid, 'valid': False} return False if get_extra_info: - return valid_bool, {"valid": valid_bool} + return valid_bool, {'id': openalex_id_full, 'valid': valid_bool} return valid_bool def extra_info(self, api_response, choose_api=None, info_dict={}): From dc2f0ece795329bca9d81e6eaac36862225ee6ff Mon Sep 17 00:00:00 2001 From: eliarizzetto Date: Wed, 6 Mar 2024 15:59:53 +0100 Subject: [PATCH 3/3] add tests (code and data) for OpenAlexManager --- test/data/glob.json | 6 +- test/idm_openalex_test.py | 203 ++++++++++++++++++++++++++++++++++++++ 
# Fixture note: these tests read test/data/glob.json, which must contain
#   "openalex:W2013228336": {"id": "openalex:W2013228336", "valid": true}
#   "openalex:W7836728310": {"id": "openalex:W7836728310", "valid": false}
#   "openalex:S4263287381": {"id": "openalex:S4263287381", "valid": false}

import json
import sqlite3
import os.path
import unittest
from os import makedirs
from os.path import exists, join

import xmltodict
from oc_ds_converter.oc_idmanager import *
from oc_ds_converter.oc_idmanager.base import IdentifierManager
from requests import ReadTimeout, get
from requests.exceptions import ConnectionError
from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager
from oc_ds_converter.oc_idmanager.oc_data_storage.sqlite_manager import SqliteStorageManager


class OpenAlexIdentifierManagerTest(unittest.TestCase):
    """Tests for the OpenAlex identifier manager (``OpenAlexManager``).

    The tests that build a manager with ``use_api_service=True`` (the default)
    query the live OpenAlex API.
    """

    def setUp(self):
        """Load the support file and define the sample identifiers."""
        makedirs("tmp", exist_ok=True)

        self.test_dir = join("test", "data")
        self.test_json_path = join(self.test_dir, "glob.json")
        with open(self.test_json_path, encoding="utf-8") as fp:
            self.data = json.load(fp)

        self.valid_wid = "W2013228336"
        self.valid_sid = "S4210229581"
        self.invalid_wid = "W7836728310"
        self.invalid_sid = "S4263287381"

    @staticmethod
    def _prefixed(manager, ids):
        """Return the prefixed normal form of each id, skipping the ones that cannot be normalised."""
        normalised = (manager.normalise(x, include_prefix=True) for x in ids)
        return [x for x in normalised if x]

    def test_openalex_is_valid(self):
        # Default manager: in-memory storage, no support file, uses the API
        oalm_nofile = OpenAlexManager()
        self.assertTrue(oalm_nofile.is_valid(self.valid_wid))
        self.assertTrue(oalm_nofile.is_valid(self.valid_sid))
        self.assertFalse(oalm_nofile.is_valid(self.invalid_wid))
        self.assertFalse(oalm_nofile.is_valid(self.invalid_sid))

        # Support file, no API: the answers come from the support file
        oalm_file = OpenAlexManager(use_api_service=False, storage_manager=InMemoryStorageManager(self.test_json_path))
        self.assertIn(oalm_file.normalise(self.valid_wid, include_prefix=True), self.data)
        self.assertIn(oalm_file.normalise(self.invalid_wid, include_prefix=True), self.data)
        self.assertTrue(oalm_file.is_valid(self.valid_wid))
        self.assertFalse(oalm_file.is_valid(self.invalid_wid))

        # Same configuration: valid_sid is not in the support file, so only its syntax is checked
        oalm_nofile_noapi = OpenAlexManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=False)
        self.assertTrue(oalm_nofile_noapi.is_valid(self.valid_wid))
        self.assertTrue(oalm_nofile_noapi.is_valid(self.valid_sid))

    def test_exists(self):
        with self.subTest(msg="get_extra_info=True, allow_extra_api=None"):
            oalm = OpenAlexManager()
            output = oalm.exists('openalex:W748315831', get_extra_info=True, allow_extra_api=None)
            self.assertTrue(output[0])
            # The info dictionary must agree with the boolean outcome
            self.assertTrue(output[1]['valid'])

        with self.subTest(msg="get_extra_info=False, allow_extra_api=None"):
            oalm = OpenAlexManager()
            output = oalm.exists('S4210229581', get_extra_info=False, allow_extra_api=None)
            expected_output = True
            self.assertEqual(output, expected_output)

    def test_openalex_normalise(self):
        oalm = OpenAlexManager()

        self.assertEqual(
            self.valid_wid, oalm.normalise("openalex:" + self.valid_wid)
        )
        # str.replace("", " ") puts a space before, between and after every
        # character: whitespace anywhere in the id must be ignored
        self.assertEqual(
            self.valid_wid, oalm.normalise(self.valid_wid.replace("", " "))
        )
        self.assertEqual(
            self.valid_wid,
            oalm.normalise("https://openalex.org/" + self.valid_wid),
        )
        self.assertEqual(
            oalm.normalise(self.valid_wid),
            oalm.normalise(' ' + self.valid_wid),
        )
        self.assertEqual(
            oalm.normalise(self.valid_sid),
            oalm.normalise("https://api.openalex.org/sources/" + self.valid_sid),
        )

        dm_file = OpenAlexManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=False)
        self.assertIn(dm_file.normalise(self.valid_wid, include_prefix=True), self.data)
        self.assertIn(dm_file.normalise(self.invalid_sid, include_prefix=True), self.data)
        self.assertTrue(dm_file.is_valid(self.valid_wid))
        self.assertFalse(dm_file.is_valid(self.invalid_sid))

    def test_openalex_default(self):
        mngr = OpenAlexManager()
        # No support files (it generates it)
        # Default storage manager : in Memory + generates file on method call (not automatically)
        # uses API
        self.assertTrue(mngr.is_valid(self.valid_wid))
        self.assertTrue(mngr.is_valid(self.valid_sid))
        self.assertFalse(mngr.is_valid(self.invalid_sid))
        self.assertFalse(mngr.is_valid(self.invalid_wid))
        mngr.storage_manager.store_file()
        validated_ids = [self.valid_wid, self.valid_sid, self.invalid_wid, self.invalid_sid]
        validated = self._prefixed(mngr, validated_ids)
        # check that the support file was correctly created
        self.assertTrue(os.path.exists("storage/id_value.json"))
        with open("storage/id_value.json") as lj:
            load_dict = json.load(lj)
        stored = self._prefixed(mngr, load_dict)

        # check that all the validated ids are stored in the json file
        for validated_id in validated:
            self.assertIn(validated_id, stored)
        mngr.storage_manager.delete_storage()
        # check that the support file was correctly deleted
        self.assertFalse(os.path.exists("storage/id_value.json"))

    def test_openalex_memory_file_noapi(self):
        # Uses support file (without updating it)
        # Uses InMemoryStorageManager storage manager
        # does not use API (so a syntactically correct id is considered to be valid)
        am_file = OpenAlexManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=False)
        self.assertIn(am_file.normalise(self.valid_wid, include_prefix=True), self.data)
        self.assertIn(am_file.normalise(self.invalid_sid, include_prefix=True), self.data)
        self.assertFalse(am_file.is_valid(self.invalid_sid))  # is stored in support file as invalid

    def test_openalex_memory_file_api(self):
        # Uses support file (without updating it)
        # Uses InMemoryStorageManager storage manager
        # uses API (so a syntactically correct id which is not valid is considered to be invalid)
        am_file = OpenAlexManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=True)
        self.assertFalse(am_file.is_valid(self.invalid_wid))

    def test_openalex_memory_nofile_noapi(self):
        # Does not use support file
        # Uses InMemoryStorageManager storage manager
        # Does not use API (so a syntactically correct id which is not valid is considered to be valid)
        am_nofile_noapi = OpenAlexManager(storage_manager=InMemoryStorageManager(), use_api_service=False)
        self.assertTrue(am_nofile_noapi.is_valid(self.valid_wid))
        self.assertTrue(am_nofile_noapi.is_valid(self.invalid_wid))
        am_nofile_noapi.storage_manager.delete_storage()

    def test_openalex_sqlite_nofile_api(self):
        # No support files (it generates it)
        # storage manager : SqliteStorageManager
        # uses API
        sql_am_nofile = OpenAlexManager(storage_manager=SqliteStorageManager())
        self.assertTrue(sql_am_nofile.is_valid(self.valid_wid))
        self.assertTrue(sql_am_nofile.is_valid(self.valid_sid))
        self.assertFalse(sql_am_nofile.is_valid(self.invalid_wid))
        self.assertFalse(sql_am_nofile.is_valid(self.invalid_sid))
        # check that the support db was correctly created and that it contains all the validated ids
        self.assertTrue(os.path.exists("storage/id_valid_dict.db"))
        validated_ids = [self.valid_wid, self.valid_sid, self.invalid_wid, self.invalid_sid]
        stored = list(sql_am_nofile.storage_manager.get_all_keys())
        # check that all the validated ids are stored in the db
        for validated_id in self._prefixed(sql_am_nofile, validated_ids):
            self.assertIn(validated_id, stored)
        sql_am_nofile.storage_manager.delete_storage()
        # check that the support file was correctly deleted
        self.assertFalse(os.path.exists("storage/id_valid_dict.db"))

    def test_openalex_sqlite_file_api(self):
        # Step 1: fills a support db through a manager which uses the API
        # Step 2: reads the same db through a manager which does not use the API
        #         (so a syntactically correct id which is not stored is considered to be valid)
        # Uses SqliteStorageManager storage manager
        # db creation
        test_sqlite_db = os.path.join(self.test_dir, "database.db")
        if os.path.exists(test_sqlite_db):
            os.remove(test_sqlite_db)
        to_insert = [self.invalid_wid, self.valid_wid]
        sql_file = OpenAlexManager(storage_manager=SqliteStorageManager(test_sqlite_db), use_api_service=True)
        for oal_id in to_insert:
            norm_id = sql_file.normalise(oal_id, include_prefix=True)
            is_valid = 1 if sql_file.is_valid(norm_id) else 0
            insert_tup = (norm_id, is_valid)
            sql_file.storage_manager.cur.execute("INSERT OR REPLACE INTO info VALUES (?,?)", insert_tup)
            sql_file.storage_manager.con.commit()
        sql_file.storage_manager.con.close()

        sql_no_api = OpenAlexManager(storage_manager=SqliteStorageManager(test_sqlite_db), use_api_service=False)
        all_db_keys = sql_no_api.storage_manager.get_all_keys()
        # check that all the normalised ids in the list were correctly inserted in the db
        for oal_id in to_insert:
            self.assertIn(sql_no_api.normalise(oal_id, include_prefix=True), all_db_keys)
        self.assertTrue(sql_no_api.is_valid(self.valid_wid))  # is stored in support file as valid
        self.assertFalse(sql_no_api.is_valid(self.invalid_wid))  # is stored in support file as invalid
        self.assertTrue(sql_no_api.is_valid(sql_no_api.normalise(self.invalid_sid, include_prefix=True)))  # is not stored in support file as invalid, does not exist but has correct syntax
        sql_no_api.storage_manager.delete_storage()

    def test_openalex_sqlite_nofile_noapi(self):
        # Does not use support file
        # Uses SqliteStorageManager storage manager
        # Does not use API (so a syntactically correct id which is not valid is considered to be valid)
        am_nofile_noapi = OpenAlexManager(storage_manager=SqliteStorageManager(), use_api_service=False)
        self.assertTrue(am_nofile_noapi.is_valid(self.valid_wid))
        self.assertTrue(am_nofile_noapi.is_valid(self.invalid_sid))
        am_nofile_noapi.storage_manager.delete_storage()