Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Identifier manager: add support for OpenAlex IDs #8

Merged
merged 4 commits into from
Mar 12, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions oc_ds_converter/oc_idmanager/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,3 +27,5 @@
from oc_ds_converter.oc_idmanager.viaf import ViafManager
from oc_ds_converter.oc_idmanager.wikidata import WikidataManager
from oc_ds_converter.oc_idmanager.wikipedia import WikipediaManager
from oc_ds_converter.oc_idmanager.openalex import OpenAlexManager

151 changes: 151 additions & 0 deletions oc_ds_converter/oc_idmanager/openalex.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
#!python
# Copyright 2019, Silvio Peroni <[email protected]>
# Copyright 2022, Giuseppe Grieco <[email protected]>, Arianna Moretti <[email protected]>, Elia Rizzetto <[email protected]>, Arcangelo Massari <[email protected]>
#
# Permission to use, copy, modify, and/or distribute this software for any purpose
# with or without fee is hereby granted, provided that the above copyright notice
# and this permission notice appear in all copies.
#
# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH
# REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
# FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY SPECIAL, DIRECT, INDIRECT,
# OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE,
# DATA OR PROFITS, WHETHER IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS
# ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
# SOFTWARE.

from oc_ds_converter.oc_idmanager.base import IdentifierManager
from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager
from re import sub, match
from requests import ReadTimeout, get
from requests.exceptions import ConnectionError
from json import loads
from time import sleep
from typing import Optional


class OpenAlexManager(IdentifierManager):
    """Identifier manager for OpenAlex IDs.

    Only the ID shapes accepted by ``syntax_ok`` are considered valid: a ``W``
    or ``S`` followed by a positive integer without leading zeros (these are
    the IDs served by the ``works/`` and ``sources/`` API routes). Validation
    results are cached in the storage manager passed to the constructor.
    """

    def __init__(self, use_api_service=True, storage_manager: Optional[StorageManager] = None):
        """OpenAlex manager constructor.

        :param use_api_service: when True, ``exists`` queries the OpenAlex API;
            when False, no request is made and ``exists`` reports every ID as
            existing.
        :param storage_manager: cache for validation results; a new
            ``InMemoryStorageManager`` is created when omitted.
        """
        super().__init__()
        if storage_manager is None:
            self.storage_manager = InMemoryStorageManager()
        else:
            self.storage_manager = storage_manager
        self._api = "https://api.openalex.org/"
        self._api_works_route = r"https://api.openalex.org/works/"
        self._api_sources_route = r"https://api.openalex.org/sources/"
        self._use_api_service = use_api_service
        self._p = "openalex:"
        self._url_id_pref = "https://openalex.org/"
        # NOTE(review): the mailto address looks redacted in this copy of the
        # file -- confirm the real contact address before shipping.
        self._headers = {
            "User-Agent": "Identifier Manager / OpenCitations Indexes "
            "(http://opencitations.net; mailto:[email protected])"
        }

    def is_valid(self, oal_id, get_extra_info=False):
        """Tell whether ``oal_id`` is a well-formed, existing OpenAlex ID.

        The ID is normalised first; a boolean already cached in the storage
        manager is returned as is. Otherwise the syntax is checked *before*
        ``exists`` is called, so malformed IDs never trigger an API request,
        and the outcome is cached.

        :param oal_id: the identifier, with or without prefix / URL form.
        :param get_extra_info: when True and the result is not cached, return
            a ``(validity, info_dict)`` tuple instead of a bare boolean.
        :return: ``False`` when the ID cannot be normalised; the cached boolean
            when one is stored (even if ``get_extra_info`` is True); otherwise
            a boolean, or a ``(bool, dict)`` tuple when ``get_extra_info`` is
            True.
        """
        oal_id = self.normalise(oal_id, include_prefix=True)
        if oal_id is None:
            return False

        cached_value = self.storage_manager.get_value(oal_id)
        if isinstance(cached_value, bool):
            return cached_value

        syntax_valid = self.syntax_ok(oal_id)

        if get_extra_info:
            if syntax_valid:
                found, info = self.exists(oal_id, get_extra_info=True)
            else:
                # Keep the stored/returned info consistent with the verdict:
                # a malformed ID is invalid regardless of what the API says.
                found, info = False, {'id': oal_id, 'valid': False}
            self.storage_manager.set_full_value(oal_id, info)
            return found, info

        validity_check = syntax_valid and self.exists(oal_id)
        self.storage_manager.set_value(oal_id, validity_check)
        return validity_check

    def normalise(self, id_string, include_prefix=False):
        """Return the canonical form of an OpenAlex ID, or None on failure.

        The ``openalex:`` prefix, all whitespace and NUL characters, and any
        leading OpenAlex API/site URL are removed; the remainder is
        upper-cased.

        :param id_string: the identifier to normalise.
        :param include_prefix: when True, prepend ``openalex:`` to the result.
        :return: the normalised string, or None if ``id_string`` cannot be
            processed (e.g. it is not a string).
        """
        try:
            oal_string = id_string.removeprefix(self._p)
            oal_string = sub("\0+", "", sub(r"\s+", "", oal_string))

            # Most specific routes first, so that the bare API root does not
            # leave a dangling "works/" or "sources/" segment behind.
            for url_prefix in (
                self._api_works_route,
                self._api_sources_route,
                self._api,
                self._url_id_pref,
            ):
                oal_string = oal_string.removeprefix(url_prefix)

            oal_string = oal_string.upper()
            return "%s%s" % (self._p if include_prefix else "", oal_string)
        except Exception:
            # Any error in processing the OpenAlex ID will return None
            # (system-exiting exceptions are deliberately not swallowed).
            return None

    def syntax_ok(self, id_string):
        """Check ``id_string`` (prefixed or not) against the OpenAlex ID pattern.

        :return: True if the ID is ``W`` or ``S`` followed by a positive
            integer with no leading zero, False otherwise.
        """
        if not id_string.startswith(self._p):
            id_string = self._p + id_string
        return bool(match(r"^openalex:[WS][1-9]\d*$", id_string))

    def exists(self, openalex_id_full, get_extra_info=False, allow_extra_api=None):
        """Check through the OpenAlex API that the ID designates an entity.

        When the API service is disabled no request is made and the ID is
        reported as existing. Otherwise up to three requests are attempted:
        a 200 response is decisive (the ``id`` field of the payload must equal
        the expected ``https://openalex.org/<ID>`` URL), a 4xx response other
        than 429 means "does not exist", and 429 / timeouts / connection
        errors / other statuses trigger a retry. If all attempts are used up
        the ID is reported as not existing.

        :param openalex_id_full: the identifier, with or without prefix.
        :param get_extra_info: when True, return ``(bool, {'id', 'valid'})``
            instead of a bare boolean.
        :param allow_extra_api: unused; kept for signature compatibility.
        """
        def outcome(found, reported_id):
            # Single place that builds both return shapes.
            if get_extra_info:
                return found, {'id': reported_id, 'valid': found}
            return found

        valid_bool = True
        if not openalex_id_full.startswith(self._p):
            openalex_id_full = self._p + openalex_id_full

        if self._use_api_service:
            oal_id = self.normalise(openalex_id_full)  # None or unprefixed ID
            pref_oalid = self._p + oal_id if oal_id else None
            if pref_oalid is None:
                return outcome(False, pref_oalid)

            expected_url = self._url_id_pref + oal_id
            for _ in range(3):
                try:
                    r = get(self._api + oal_id, headers=self._headers, timeout=30)
                except ReadTimeout:
                    # Do nothing, just try again
                    continue
                except ConnectionError:
                    # Sleep 5 seconds, then try again
                    sleep(5)
                    continue

                if r.status_code == 200:
                    r.encoding = "utf-8"
                    json_res = loads(r.text)
                    try:
                        found = json_res['id'] == expected_url
                    except KeyError:
                        found = False
                    return outcome(found, pref_oalid)
                if r.status_code == 429:
                    sleep(1)  # only handles per-second rate limits (not per-day rate limits)
                elif 400 <= r.status_code < 500:
                    return outcome(False, pref_oalid)

            # All attempts used up without a decisive answer.
            valid_bool = False

        return outcome(valid_bool, openalex_id_full)

    def extra_info(self, api_response, choose_api=None, info_dict=None):
        """Placeholder for metadata extraction; currently returns ``{'valid': True}``.

        All parameters are accepted for signature compatibility and ignored.
        ``info_dict`` defaults to None rather than a shared mutable dict.
        """
        result = {}
        result["valid"] = True
        # to be implemented
        return result
6 changes: 4 additions & 2 deletions test/data/glob.json
Original file line number Diff line number Diff line change
Expand Up @@ -133,11 +133,13 @@

"viaf:5604148947771454950004": {"id": "viaf:5604148947771454950004","valid": true},

"viaf:5604148947771454953333": {"id":"viaf:5604148947771454953333","valid": false}

"viaf:5604148947771454953333": {"id":"viaf:5604148947771454953333","valid": false},

"openalex:W2013228336": {"id": "openalex:W2013228336", "valid": true},

"openalex:W7836728310": {"id": "openalex:W7836728310", "valid": false},

"openalex:S4263287381": {"id": "openalex:S4263287381", "valid": false}


}
203 changes: 203 additions & 0 deletions test/idm_openalex_test.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,203 @@
import json
import sqlite3
import os.path
import unittest
from os import makedirs
from os.path import exists, join

import xmltodict
from oc_ds_converter.oc_idmanager import *
from oc_ds_converter.oc_idmanager.base import IdentifierManager
from requests import ReadTimeout, get
from requests.exceptions import ConnectionError
from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager
from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager
from oc_ds_converter.oc_idmanager.oc_data_storage.sqlite_manager import SqliteStorageManager

class OpenAlexIdentifierManagerTest(unittest.TestCase):
    """Tests for the OpenAlex identifier manager.

    Tests that build a manager with the API service enabled (the default)
    call the live OpenAlex API. The fixture ``test/data/glob.json`` plays the
    role of a pre-existing support file of validation results.
    """

    def setUp(self):
        if not exists("tmp"):
            makedirs("tmp")

        self.test_dir = join("test", "data")
        self.test_json_path = join(self.test_dir, "glob.json")
        with open(self.test_json_path, encoding="utf-8") as fp:
            self.data = json.load(fp)

        self.valid_wid = "W2013228336"
        self.valid_sid = "S4210229581"
        self.invalid_wid = "W7836728310"
        self.invalid_sid = "S4263287381"

    def test_openalex_is_valid(self):
        # Default manager: no support file, API enabled
        oalm_nofile = OpenAlexManager()
        self.assertTrue(oalm_nofile.is_valid(self.valid_wid))
        self.assertTrue(oalm_nofile.is_valid(self.valid_sid))
        self.assertFalse(oalm_nofile.is_valid(self.invalid_wid))
        self.assertFalse(oalm_nofile.is_valid(self.invalid_sid))

        # Support file, API disabled: answers come from the file
        oalm_file = OpenAlexManager(use_api_service=False, storage_manager=InMemoryStorageManager(self.test_json_path))
        self.assertIn(oalm_file.normalise(self.valid_wid, include_prefix=True), self.data)
        self.assertIn(oalm_file.normalise(self.invalid_wid, include_prefix=True), self.data)
        self.assertTrue(oalm_file.is_valid(self.valid_wid))
        self.assertFalse(oalm_file.is_valid(self.invalid_wid))

        # Support file, API disabled: valid_sid is not in the fixture, so only its syntax is checked
        oalm_file_noapi = OpenAlexManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=False)
        self.assertTrue(oalm_file_noapi.is_valid(self.valid_wid))
        self.assertTrue(oalm_file_noapi.is_valid(self.valid_sid))

    def test_exists(self):
        with self.subTest(msg="get_extra_info=True, allow_extra_api=None"):
            oalm = OpenAlexManager()
            output = oalm.exists('openalex:W748315831', get_extra_info=True, allow_extra_api=None)
            expected_output = (True, {'valid': True})
            self.assertEqual(expected_output[0], output[0])
            self.assertEqual(expected_output[1]['valid'], output[1]['valid'])

        with self.subTest(msg="get_extra_info=False, allow_extra_api=None"):
            oalm = OpenAlexManager()
            output = oalm.exists('S4210229581', get_extra_info=False, allow_extra_api=None)
            expected_output = True
            self.assertEqual(output, expected_output)

    def test_openalex_normalise(self):
        oalm = OpenAlexManager()

        self.assertEqual(
            self.valid_wid, oalm.normalise("openalex:" + self.valid_wid)
        )
        # replace("", " ") puts a space around every character
        self.assertEqual(
            self.valid_wid, oalm.normalise(self.valid_wid.replace("", " "))
        )
        self.assertEqual(
            self.valid_wid,
            oalm.normalise("https://openalex.org/" + self.valid_wid),
        )
        self.assertEqual(
            oalm.normalise(self.valid_wid),
            oalm.normalise(' ' + self.valid_wid),
        )
        self.assertEqual(
            oalm.normalise(self.valid_sid),
            oalm.normalise("https://api.openalex.org/sources/" + self.valid_sid),
        )

        dm_file = OpenAlexManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=False)
        self.assertIn(dm_file.normalise(self.valid_wid, include_prefix=True), self.data)
        self.assertIn(dm_file.normalise(self.invalid_sid, include_prefix=True), self.data)
        self.assertTrue(dm_file.is_valid(self.valid_wid))
        self.assertFalse(dm_file.is_valid(self.invalid_sid))

    def test_openalex_default(self):
        mngr = OpenAlexManager()
        # No support files (it generates it)
        # Default storage manager : in Memory + generates file on method call (not automatically)
        # uses API
        self.assertTrue(mngr.is_valid(self.valid_wid))
        self.assertTrue(mngr.is_valid(self.valid_sid))
        self.assertFalse(mngr.is_valid(self.invalid_sid))
        self.assertFalse(mngr.is_valid(self.invalid_wid))
        mngr.storage_manager.store_file()
        validated_ids = [self.valid_wid, self.valid_sid, self.invalid_wid, self.invalid_sid]
        validated = [norm for x in validated_ids if (norm := mngr.normalise(x, include_prefix=True))]
        # check that the support file was correctly created
        self.assertTrue(os.path.exists("storage/id_value.json"))
        # context manager: the handle is closed even if json.load raises
        with open("storage/id_value.json", encoding="utf-8") as lj:
            load_dict = json.load(lj)
        stored = [norm for x in load_dict if (norm := mngr.normalise(x, include_prefix=True))]

        # check that all the validated ids are stored in the json file
        self.assertTrue(all(x in stored for x in validated))
        mngr.storage_manager.delete_storage()
        # check that the support file was correctly deleted
        self.assertFalse(os.path.exists("storage/id_value.json"))

    def test_openalex_memory_file_noapi(self):
        # Uses support file (without updating it)
        # Uses InMemoryStorageManager storage manager
        # does not use API (so a syntactically correct id is considered to be valid)
        am_file = OpenAlexManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=False)
        self.assertIn(am_file.normalise(self.valid_wid, include_prefix=True), self.data)
        self.assertIn(am_file.normalise(self.invalid_sid, include_prefix=True), self.data)
        self.assertFalse(am_file.is_valid(self.invalid_sid))  # is stored in support file as invalid

    def test_openalex_memory_file_api(self):
        # Uses support file (without updating it)
        # Uses InMemoryStorageManager storage manager
        # uses API (so a syntactically correct id which is not valid is considered to be invalid)
        am_file = OpenAlexManager(storage_manager=InMemoryStorageManager(self.test_json_path), use_api_service=True)
        self.assertFalse(am_file.is_valid(self.invalid_wid))

    def test_openalex_memory_nofile_noapi(self):
        # Does not use support file
        # Uses InMemoryStorageManager storage manager
        # Does not use API (so a syntactically correct id which is not valid is considered to be valid)
        am_nofile_noapi = OpenAlexManager(storage_manager=InMemoryStorageManager(), use_api_service=False)
        self.assertTrue(am_nofile_noapi.is_valid(self.valid_wid))
        self.assertTrue(am_nofile_noapi.is_valid(self.invalid_wid))
        am_nofile_noapi.storage_manager.delete_storage()

    def test_openalex_sqlite_nofile_api(self):
        # No support files (it generates it)
        # storage manager : SqliteStorageManager
        # uses API
        sql_am_nofile = OpenAlexManager(storage_manager=SqliteStorageManager())
        self.assertTrue(sql_am_nofile.is_valid(self.valid_wid))
        self.assertTrue(sql_am_nofile.is_valid(self.valid_sid))
        self.assertFalse(sql_am_nofile.is_valid(self.invalid_wid))
        self.assertFalse(sql_am_nofile.is_valid(self.invalid_sid))
        # check that the support db was correctly created and that it contains all the validated ids
        self.assertTrue(os.path.exists("storage/id_valid_dict.db"))
        validated_ids = [self.valid_wid, self.valid_sid, self.invalid_wid, self.invalid_sid]
        # check that all the validated ids are stored in the db
        stored = list(sql_am_nofile.storage_manager.get_all_keys())
        validated = [norm for x in validated_ids if (norm := sql_am_nofile.normalise(x, include_prefix=True))]
        self.assertTrue(all(x in stored for x in validated))
        sql_am_nofile.storage_manager.delete_storage()
        # check that the support file was correctly deleted
        self.assertFalse(os.path.exists("storage/id_valid_dict.db"))

    def test_openalex_sqlite_file_api(self):
        # Step 1: build a support db with SqliteStorageManager, validating through the API
        # Step 2: reopen that db with the API disabled, so stored ids are answered
        #         from the db and unknown ids are judged on syntax alone
        test_sqlite_db = os.path.join(self.test_dir, "database.db")
        if os.path.exists(test_sqlite_db):
            os.remove(test_sqlite_db)
        to_insert = [self.invalid_wid, self.valid_wid]
        sql_file = OpenAlexManager(storage_manager=SqliteStorageManager(test_sqlite_db), use_api_service=True)
        for raw_id in to_insert:
            norm_id = sql_file.normalise(raw_id, include_prefix=True)
            is_valid = 1 if sql_file.is_valid(norm_id) else 0
            insert_tup = (norm_id, is_valid)
            sql_file.storage_manager.cur.execute("INSERT OR REPLACE INTO info VALUES (?,?)", insert_tup)
            sql_file.storage_manager.con.commit()
        sql_file.storage_manager.con.close()

        sql_no_api = OpenAlexManager(storage_manager=SqliteStorageManager(test_sqlite_db), use_api_service=False)
        all_db_keys = sql_no_api.storage_manager.get_all_keys()
        # check that all the normalised ids in the list were correctly inserted in the db
        self.assertTrue(all(sql_no_api.normalise(x, include_prefix=True) in all_db_keys for x in to_insert))
        self.assertTrue(sql_no_api.is_valid(self.valid_wid))  # is stored in support file as valid
        self.assertFalse(sql_no_api.is_valid(self.invalid_wid))  # is stored in support file as invalid
        self.assertTrue(sql_no_api.is_valid(sql_no_api.normalise(self.invalid_sid, include_prefix=True)))  # is not stored in support file as invalid, does not exist but has correct syntax
        sql_no_api.storage_manager.delete_storage()

    def test_openalex_sqlite_nofile_noapi(self):
        # Does not use support file
        # Uses SqliteStorageManager storage manager
        # Does not use API (so a syntactically correct id which is not valid is considered to be valid)
        am_nofile_noapi = OpenAlexManager(storage_manager=SqliteStorageManager(), use_api_service=False)
        self.assertTrue(am_nofile_noapi.is_valid(self.valid_wid))
        self.assertTrue(am_nofile_noapi.is_valid(self.invalid_sid))
        am_nofile_noapi.storage_manager.delete_storage()
Loading