From e50ab7c20849c89382645d93a9973b7ac2d4949d Mon Sep 17 00:00:00 2001 From: martasoricetti Date: Thu, 26 Oct 2023 16:13:59 +0200 Subject: [PATCH 1/2] fix jalc_process.py('citation_list' error key) + crossref test + jalc_languages_metadata_count --- .../crossref/crossref_processing.py | 6 +- .../datacite/datacite_processing.py | 5 +- .../datacite/datacite_processing_new.py | 771 ++++++++++++++++++ oc_ds_converter/lib/jsonmanager.py | 2 +- .../openaire/openaire_processing.py | 6 +- oc_ds_converter/preprocessing/datacite.py | 4 +- oc_ds_converter/run/crossref_process.py | 23 +- oc_ds_converter/run/datacite_process_new.py | 241 ++++++ oc_ds_converter/run/jalc_process.py | 53 +- .../jalc_languages_metadata_count.py | 386 +++++++++ .../test/JOCI_PRE_SAMPLE/sample_jalc.zip | Bin 0 -> 9424 bytes .../test/conta_test.py | 207 +++++ test/crossref_process_test.py | 247 +++++- test/jalc_process_test.py | 6 +- test/processing_crossref_test.py | 415 ---------- test/processing_oroci_test.py | 8 +- 16 files changed, 1885 insertions(+), 495 deletions(-) create mode 100644 oc_ds_converter/datacite/datacite_processing_new.py create mode 100644 oc_ds_converter/run/datacite_process_new.py create mode 100644 scripts_analysis/jalc_languages_metadata/jalc_languages_metadata_count.py create mode 100644 scripts_analysis/jalc_languages_metadata/test/JOCI_PRE_SAMPLE/sample_jalc.zip create mode 100644 scripts_analysis/jalc_languages_metadata/test/conta_test.py delete mode 100644 test/processing_crossref_test.py diff --git a/oc_ds_converter/crossref/crossref_processing.py b/oc_ds_converter/crossref/crossref_processing.py index 8700708..35ac9f4 100644 --- a/oc_ds_converter/crossref/crossref_processing.py +++ b/oc_ds_converter/crossref/crossref_processing.py @@ -129,9 +129,9 @@ def to_validated_id_list(self, norm_id_dict): def memory_to_storage(self): kv_in_memory = self.temporary_manager.get_validity_list_of_tuples() - #if kv_in_memory: - self.storage_manager.set_multi_value(kv_in_memory) - self.temporary_manager.delete_storage() + if kv_in_memory: + self.storage_manager.set_multi_value(kv_in_memory) + self.temporary_manager.delete_storage() def validated_as(self, id_dict): diff --git a/oc_ds_converter/datacite/datacite_processing.py b/oc_ds_converter/datacite/datacite_processing.py index 72ecbf7..7ecb2c2 100644 --- a/oc_ds_converter/datacite/datacite_processing.py +++ b/oc_ds_converter/datacite/datacite_processing.py @@ -303,8 +303,7 @@ def get_datacite_pages(self, item: dict) -> str: related_pages_list = list() container = item.get("container") if container: - if container.get("identifierType") == "ISSN" or container.get( - "identifierType") == "ISBN": + if container.get("identifierType") == "ISSN" or container.get("identifierType") == "ISBN": if container.get("firstPage"): container_pages_list.append(container.get("firstPage")) if container.get("lastPage"): @@ -477,7 +476,7 @@ def add_editors_to_agent_list(self, item: dict, ag_list: list) -> list: def add_authors_to_agent_list(self, item: dict, ag_list: list) -> list: ''' - This function returns the the agents list updated with the authors dictionaries, in the correct format. + This function returns the agents list updated with the authors dictionaries, in the correct format. 
:params item: the item's dictionary (attributes), ag_list: the :type item: dict, ag_list: list diff --git a/oc_ds_converter/datacite/datacite_processing_new.py b/oc_ds_converter/datacite/datacite_processing_new.py new file mode 100644 index 0000000..80b7947 --- /dev/null +++ b/oc_ds_converter/datacite/datacite_processing_new.py @@ -0,0 +1,771 @@ +import html +import re +import warnings +import os +import fakeredis +import csv +import json + +from bs4 import BeautifulSoup +from oc_ds_converter.oc_idmanager.doi import DOIManager +from oc_ds_converter.oc_idmanager.orcid import ORCIDManager +from oc_ds_converter.lib.master_of_regex import * +from oc_ds_converter.oc_idmanager.oc_data_storage.storage_manager import StorageManager +from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import InMemoryStorageManager +from oc_ds_converter.oc_idmanager.oc_data_storage.sqlite_manager import SqliteStorageManager +from oc_ds_converter.oc_idmanager.issn import ISSNManager +from oc_ds_converter.oc_idmanager.isbn import ISBNManager +from oc_ds_converter.datasource.redis import RedisDataSource +from oc_ds_converter.preprocessing.datacite import DatacitePreProcessing +from oc_ds_converter.ra_processor import RaProcessor +from typing import Dict, List, Tuple, Optional, Type, Callable +from pathlib import Path + +warnings.filterwarnings("ignore", category=UserWarning, module='bs4') + + +class DataciteProcessing(RaProcessor): + def __init__(self, orcid_index: str = None, doi_csv: str = None, publishers_filepath_dc: str = None, testing: bool = True, storage_manager: Optional[StorageManager] = None, citing=True): + super(DataciteProcessing, self).__init__(orcid_index, doi_csv) + # self.preprocessor = DatacitePreProcessing(inp_dir, out_dir, interval, filter) + if storage_manager is None: + self.storage_manager = SqliteStorageManager() + else: + self.storage_manager = storage_manager + + self.temporary_manager = InMemoryStorageManager('../memory.json') + + self._needed_info = ["relationType", "relatedIdentifierType", "relatedIdentifier"] + self._filter = ["references", "isreferencedby", "cites", "iscitedby"] + + self.RIS_types_map = {'abst': 'abstract', + 'news': 'newspaper article', + 'slide': 'presentation', + 'book': 'book', + 'data': 'dataset', + 'thes': 'dissertation', + 'jour': 'journal article', + 'mgzn': 'journal article', + 'gen': 'other', + 'advs': 'other', + 'video': 'other', + 'unpb': 'other', + 'ctlg': 'other', + 'art': 'other', + 'case': 'other', + 'icomm': 'other', + 'inpr': 'other', + 'map': 'other', + 'mpct': 'other', + 'music': 'other', + 'pamp': 'other', + 'pat': 'other', + 'pcomm': 'other', + 'catalog': 'other', + 'elec': 'other', + 'hear': 'other', + 'stat': 'other', + 'bill': 'other', + 'unbill': 'other', + 'cpaper': 'proceedings article', + 'rprt': 'report', + 'chap': 'book chapter', + 'ser': 'book series', + 'jfull': 'journal', + 'conf': 'proceedings', + 'comp': 'computer program', + 'sound': 'audio document'} + self.BIBTEX_types_map = {'book': 'book', + 'mastersthesis': 'dissertation', + 'phdthesis': 'dissertation', + 'article': 'journal article', + 'misc': 'other', + 'unpublished': 'other', + 'manual': 'other', + 'booklet': 'other', + 'inproceedings': 'proceedings article', + 'techreport': 'report', + 'inbook': 'book chapter', + 'incollection': 'book part', + 'proceedings': 'proceedings'} + self.CITEPROC_types_map = {'book': 'book', + 'dataset': 'dataset', + 'thesis': 'dissertation', + 'article-journal': 'journal article', + 'article': 'other', + 'graphic': 'other', + 
'post-weblog': 'web content', + 'paper-conference': 'proceedings article', + 'report': 'report', + 'chapter': 'book chapter', + 'song': 'audio document'} + self.SCHEMAORG_types_map = {'book': 'book', + 'dataset': 'dataset', + 'thesis': 'dissertation', + 'scholarlyarticle': 'journal article', + 'article': 'journal article', + 'creativework': 'other', + 'event': 'other', + 'service': 'other', + 'mediaobject': 'other', + 'review': 'other', + 'collection': 'other', + 'imageobject': 'other', + 'blogposting': 'web content', + 'report': 'report', + 'chapter': 'book chapter', + 'periodical': 'journal', + 'publicationissue': 'journal issue', + 'publicationvolume': 'journal volume', + 'softwaresourcecode': 'computer program', + 'audioobject': 'audio document'} + self.RESOURCETYPEGENERAL_types_map = {'book': 'book', + 'dataset': 'dataset', + 'dissertation': 'dissertation', + 'journalarticle': 'journal article', + 'text': 'other', + 'other': 'other', + 'datapaper': 'other', + 'audiovisual': 'other', + 'interactiveresource': 'other', + 'physicalobject': 'other', + 'event': 'other', + 'service': 'other', + 'collection': 'other', + 'image': 'other', + 'model': 'other', + 'peerreview': 'peer review', + 'conferencepaper': 'proceedings article', + 'report': 'report', + 'bookchapter': 'book chapter', + 'journal': 'journal', + 'conferenceproceeding': 'proceedings', + 'standard': 'standard', + 'outputmanagementplan': 'data management plan', + 'preprint': 'preprint', + 'software': 'computer program', + 'sound': 'audio document', + 'workflow': 'workflow'} + + # def input_preprocessing(self): + # self.preprocessor.split_input() + + self.doi_m = DOIManager(storage_manager=self.storage_manager) + self.orcid_m = ORCIDManager(storage_manager=self.storage_manager) + self.issn_m = ISSNManager() + self.isbn_m = ISBNManager() + self.venue_id_man_dict = {"issn": self.issn_m, "isbn": self.isbn_m} + # Temporary storage managers : all data must be stored in tmp storage manager and passed all together to the + # main storage_manager only once the full file is processed. Checks must be done both on tmp and in + # storage_manager, so that in case the process breaks while processing a file which does not complete (so + # without writing the final file) all the data concerning the ids are not stored. Otherwise, the ids saved in + # a storage_manager db would be considered to have been processed and thus would be ignored by the process + # and lost. 
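+        # The flush itself is implemented below in memory_to_storage(); a minimal sketch of the
+        # intended pattern (the guarded version mirrors the crossref_processing fix at the top of
+        # this patch):
+        #
+        #     kv_in_memory = self.temporary_manager.get_validity_list_of_tuples()
+        #     if kv_in_memory:
+        #         self.storage_manager.set_multi_value(kv_in_memory)
+        #         self.temporary_manager.delete_storage()
+        #
+        # i.e. the validated ids reach the persistent manager in a single call, and the temporary
+        # storage is emptied only once the whole file has been processed.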
+ + self.tmp_doi_m = DOIManager(storage_manager=self.temporary_manager) + self.tmp_orcid_m = ORCIDManager(storage_manager=self.temporary_manager) + self.venue_tmp_id_man_dict = {"issn": self.issn_m, "isbn": self.isbn_m} + + if testing: + self.BR_redis = fakeredis.FakeStrictRedis() + self.RA_redis = fakeredis.FakeStrictRedis() + + else: + self.BR_redis = RedisDataSource("DB-META-BR") + self.RA_redis = RedisDataSource("DB-META-RA") + + self._redis_values_ra = [] + self._redis_values_br = [] + + if not publishers_filepath_dc: + self.publishers_filepath = None + else: + self.publishers_filepath = publishers_filepath_dc + + if os.path.exists(self.publishers_filepath): + pfp = dict() + csv_headers = ("id", "name", "prefix") + if self.publishers_filepath.endswith(".csv"): + with open(self.publishers_filepath, encoding="utf8") as f: + csv_reader = csv.DictReader(f, csv_headers) + for row in csv_reader: + pfp[row["prefix"]] = {"name": row["name"], "crossref_member": row["id"]} + self.publishers_filepath = self.publishers_filepath.replace(".csv", ".json") + elif self.publishers_filepath.endswith(".json"): + with open(self.publishers_filepath, encoding="utf8") as f: + pfp = json.load(f) + self.publishers_mapping = pfp + #added + def update_redis_values(self, br, ra): + self._redis_values_br = br + self._redis_values_ra = ra + + #added + def validated_as(self, id_dict): + # Check if the validity was already retrieved and thus + # a) if it is now saved either in the in-memory database, which only concerns data validated + # during the current file processing; + # b) or if it is now saved in the storage_manager database, which only concerns data validated + # during the previous files processing. + # In memory db is checked first because the dimension is smaller and the check is faster and + # Because we assume that it is more likely to find the same ids in close positions, e.g.: same + # citing id in several citations with different cited ids. + + schema = id_dict["schema"].strip().lower() + id = id_dict["identifier"] + + if schema != "orcid": + validity_value = self.tmp_doi_m.validated_as_id(id) + if validity_value is None: + validity_value = self.doi_m.validated_as_id(id) + return validity_value + else: + validity_value = self.tmp_orcid_m.validated_as_id(id) + if validity_value is None: + validity_value = self.orcid_m.validated_as_id(id) + return validity_value + + #added(probably unuseful) + + def get_id_manager(self, schema_or_id, id_man_dict): + """Given as input the string of a schema (e.g.:'pmid') and a dictionary mapping strings of + the schemas to their id managers, the method returns the correct id manager. 
Note that each + instance of the Preprocessing class needs its own instances of the id managers, in order to + avoid conflicts while validating data""" + if ":" in schema_or_id: + split_id_prefix = schema_or_id.split(":") + schema = split_id_prefix[0] + else: + schema = schema_or_id + id_man = id_man_dict.get(schema) + return id_man + + #added (probably unuseful) + def normalise_any_id(self, id_with_prefix): + id_man = self.doi_m + id_no_pref = ":".join(id_with_prefix.split(":")[1:]) + norm_id_w_pref = id_man.normalise(id_no_pref, include_prefix=True) + return norm_id_w_pref + + #added + def dict_to_cache(self, dict_to_be_saved, path): + path = Path(path) + parent_dir_path = path.parent.absolute() + if not os.path.exists(parent_dir_path): + Path(parent_dir_path).mkdir(parents=True, exist_ok=True) + with open(path, "w", encoding="utf-8") as fd: + json.dump(dict_to_be_saved, fd, ensure_ascii=False, indent=4) + + #no modified(look at the part of the venues) + def csv_creator(self, item: dict) -> dict: + row = dict() + doi = str(item['id']) + if (doi and self.doi_set and doi in self.doi_set) or (doi and not self.doi_set): + norm_id = self.doi_m.normalise(doi, include_prefix=True) + # create empty row + keys = ['id', 'title', 'author', 'pub_date', 'venue', 'volume', 'issue', 'page', 'type', + 'publisher', 'editor'] + for k in keys: + row[k] = '' + + attributes = item['attributes'] + + # row['type'] + if attributes.get('types') is not None: + types_dict = attributes['types'] + for k, v in types_dict.items(): + if k.lower() == 'ris': + if type(v) is str: + norm_v = v.strip().lower() + if norm_v in self.RIS_types_map.keys(): + row['type'] = self.RIS_types_map[norm_v] + break + if k.lower() == 'bibtex': + if type(v) is str: + norm_v = v.strip().lower() + if norm_v in self.BIBTEX_types_map.keys(): + row['type'] = self.BIBTEX_types_map[norm_v] + break + if k.lower() == 'schemaorg': + if type(v) is str: + norm_v = v.strip().lower() + if norm_v in self.SCHEMAORG_types_map.keys(): + row['type'] = self.SCHEMAORG_types_map[norm_v] + break + if k.lower() == 'citeproc': + if type(v) is str: + norm_v = v.strip().lower() + if norm_v in self.CITEPROC_types_map.keys(): + row['type'] = self.CITEPROC_types_map[norm_v] + break + if k.lower() == 'resourcetypegeneral': + if type(v) is str: + norm_v = v.strip().lower() + if norm_v in self.RESOURCETYPEGENERAL_types_map.keys(): + row['type'] = self.RESOURCETYPEGENERAL_types_map[norm_v] + break + + + # row['id'] + ids_list = list() + ids_list.append(norm_id) + + if attributes.get('identifiers'): + for other_id in attributes.get('identifiers'): + if other_id.get('identifier') and other_id.get('identifierType'): + o_id_type = other_id.get('identifierType') + o_id = other_id.get('identifier') + + + if o_id_type == 'ISBN': + if row['type'] in {'book', 'dissertation', 'edited book', 'monograph', 'reference book', 'report', + 'standard'}: + self.id_worker(o_id, ids_list, self.isbn_worker) + + elif o_id_type == 'ISSN': + if row['type'] in {'book series', 'book set', 'journal', 'proceedings series', 'series', + 'standard series', 'report series'}: + self.id_worker(o_id, ids_list, self.issn_worker) + + + row['id'] = ' '.join(ids_list) + + # row['title'] + pub_title = "" + if attributes.get("titles"): + for title in attributes.get("titles"): + if title.get("title"): + p_title = title.get("title") + soup = BeautifulSoup(p_title, 'html.parser') + title_soup = soup.get_text().replace('\n', '') + title_soup_space_replaced = ' '.join(title_soup.split()) + title_soup_strip = 
title_soup_space_replaced.strip() + clean_tit = html.unescape(title_soup_strip) + pub_title = clean_tit if clean_tit else p_title + + row['title'] = pub_title + + agent_list_authors_only = self.add_authors_to_agent_list(attributes, []) + agents_list = self.add_editors_to_agent_list(attributes, agent_list_authors_only) + + authors_strings_list, editors_string_list = self.get_agents_strings_list(doi, agents_list) + + # row['author'] + if 'creators' in attributes: + row['author'] = '; '.join(authors_strings_list) + + + # row['pub_date'] + cur_date = "" + dates = attributes.get("dates") + if dates: + for date in dates: + if date.get("dateType") == "Issued": + cur_date = date.get("date") + break + + if cur_date == "": + if attributes.get("publicationYear"): + cur_date = str(attributes.get("publicationYear")) + + row['pub_date'] = cur_date + + # row['venue'] + row['venue'] = self.get_venue_name(attributes, row) + + issue = "" + volume = "" + + container = attributes.get("container") + if container and container.get("identifierType") == "ISSN" or container.get( + "identifierType") == "ISBN": + if container.get("issue"): + issue = container.get("issue") + if container.get("volume"): + volume = container.get("volume") + + if not issue or not volume: + relatedIdentifiers = attributes.get("relatedIdentifiers") + if relatedIdentifiers: + for related in relatedIdentifiers: + if related.get("relationType"): + if related.get("relationType").lower() == "ispartof": + if related.get("relatedIdentifierType") == "ISSN" or related.get("relatedIdentifierType") == "ISBN": + if not issue and related.get("issue"): + issue = related.get("issue") + if not volume and related.get("volume"): + volume = related.get("volume") + # row['volume'] + row['volume'] = volume + + # row['issue'] + row['issue'] = issue + + # row['page'] + row['page'] = self.get_datacite_pages(attributes) + + # row['publisher'] + row['publisher'] = self.get_publisher_name(doi, attributes) + + # row['editor'] + if attributes.get("contributors"): + editors = [contributor for contributor in attributes.get("contributors") if + contributor.get("contributorType") == "Editor"] + if editors: + row['editor'] = '; '.join(editors_string_list) + try: + return self.normalise_unicode(row) + except TypeError: + print(row) + raise(TypeError) + + #added + def to_validated_id_list(self, norm_id): + valid_id_list = [] + if norm_id in self._redis_values_br: + self.tmp_doi_m.storage_manager.set_value(norm_id,True) # In questo modo l'id presente in redis viene inserito anche nello storage e risulta già + # preso in considerazione negli step successivi + valid_id_list.append(norm_id) + # if the id is not in redis db, validate it before appending + elif self.tmp_doi_m.is_valid(norm_id): # In questo modo l'id presente in redis viene inserito anche nello storage e risulta già + # preso in considerazione negli step successivi + valid_id_list.append(norm_id) + return valid_id_list + + #no modified + def get_datacite_pages(self, item: dict) -> str: + ''' + This function returns the pages interval. + + :params item: the item's dictionary + :type item: dict + :returns: str -- The output is a string in the format 'START-END', for example, '583-584'. If there are no pages, the output is an empty string. 
+ ''' + container_pages_list = list() + related_pages_list = list() + container = item.get("container") + if container: + if container.get("identifierType") == "ISSN" or container.get("identifierType") == "ISBN": + if container.get("firstPage"): + container_pages_list.append(container.get("firstPage")) + if container.get("lastPage"): + container_pages_list.append(container.get("lastPage")) + + relatedIdentifiers = item.get("relatedIdentifiers") + if relatedIdentifiers: + for related in relatedIdentifiers: + if related.get("relationType"): + if related.get("relationType").lower() == "ispartof": + if related.get("relatedIdentifierType") == "ISSN" or related.get("relatedIdentifierType") == "ISBN": + if related.get("firstPage"): + related_pages_list.append(related.get("firstPage")) + if related.get("lastPage"): + related_pages_list.append(related.get("lastPage")) + + page_list = related_pages_list if len(related_pages_list)> len(container_pages_list) else container_pages_list + return self.get_pages(page_list) + + #no modified + def get_publisher_name(self, doi: str, item: dict) -> str: + ''' + This function aims to return a publisher's name and id. If a mapping was provided, + it is used to find the publisher's standardized name from its id or DOI prefix. + + :params doi: the item's DOI + :type doi: str + :params item: the item's dictionary + :type item: dict + :returns: str -- The output is a string in the format 'NAME [SCHEMA:ID]', for example, 'American Medical Association (AMA) [crossref:10]'. If the id does not exist, the output is only the name. Finally, if there is no publisher, the output is an empty string. + ''' + publisher = item.get("publisher") + if type(publisher) is str: + if publisher.lower().strip() == "(:unav)": + publisher = "" + + data = { + 'publisher': publisher, + 'prefix': doi.split('/')[0] + } + + publisher = data['publisher'] + prefix = data['prefix'] + + if self.publishers_mapping: + member_dict = next( + ({member: data} for member, data in self.publishers_mapping.items() if prefix in data['prefixes']), + None) + if member_dict: + member = list(member_dict.keys())[0] + name_and_id = f"{member_dict[member]['name']} [datacite:{member}]" + else: + name_and_id = publisher + else: + name_and_id = publisher + + return name_and_id + + #no modified + + def get_venue_name(self, item: dict, row: dict) -> str: + ''' + This method deals with generating the venue's name, followed by id in square brackets, separated by spaces. + HTML tags are deleted and HTML entities escaped. In addition, any ISBN and ISSN are validated. + Finally, the square brackets in the venue name are replaced by round brackets to avoid conflicts with the ids enclosures. + + :params item: the item's dictionary + :type item: dict + :params row: a CSV row + :type row: dict + :returns: str -- The output is a string in the format 'NAME [SCHEMA:ID]', for example, 'Nutrition & Food Science + [issn:0034-6659]'. If the id does not exist, the output is only the name. Finally, if there is no venue, + the output is an empty string. 
+ ''' + + cont_title = "" + venids_list = list() + + # container + container = item.get("container") + if container: + # TITLE + if container.get("title"): + cont_title = (container["title"].lower()).replace('\n', '') + ven_soup = BeautifulSoup(cont_title, 'html.parser') + ventit = html.unescape(ven_soup.get_text()) + ambiguous_brackets = re.search('\[\s*((?:[^\s]+:[^\s]+)?(?:\s+[^\s]+:[^\s]+)*)\s*\]', ventit) + if ambiguous_brackets: + match = ambiguous_brackets.group(1) + open_bracket = ventit.find(match) - 1 + close_bracket = ventit.find(match) + len(match) + ventit = ventit[:open_bracket] + '(' + ventit[open_bracket + 1:] + ventit = ventit[:close_bracket] + ')' + ventit[close_bracket + 1:] + cont_title = ventit + + # IDS + if container.get("identifierType") == "ISBN": + if row['type'] in {'book chapter', 'book part', 'book section', 'book track', 'reference entry'}: + self.id_worker(container.get("identifier"), venids_list, self.isbn_worker) + + if container.get("identifierType") == "ISSN": + if row['type'] in {'book', 'data file', 'dataset', 'edited book', 'journal article', 'journal volume', + 'journal issue', 'monograph', 'proceedings', 'peer review', 'reference book', + 'reference entry', 'report'}: + self.id_worker(container.get("identifier"), venids_list, self.issn_worker) + elif row['type'] == 'report series': + if container.get("title"): + if container.get("title"): + self.id_worker(container.get("identifier"), venids_list, self.issn_worker) + + + if not venids_list: + relatedIdentifiers = item.get("relatedIdentifiers") + if relatedIdentifiers: + for related in relatedIdentifiers: + if related.get("relationType"): + if related.get("relationType").lower() == "ispartof": + if related.get("relatedIdentifierType") == "ISBN": + if row['type'] in {'book chapter', 'book part', 'book section', 'book track', + 'reference entry'}: + self.id_worker(related.get("relatedIdentifier"), venids_list, self.isbn_worker) + if related.get("relatedIdentifierType") == "ISSN": + if row['type'] in {'book', 'data file', 'dataset', 'edited book', 'journal article', + 'journal volume', + 'journal issue', 'monograph', 'proceedings', 'peer review', + 'reference book', + 'reference entry', 'report'}: + self.id_worker(related.get("relatedIdentifier"), venids_list, self.issn_worker) + elif row['type'] == 'report series': + if related.get("title"): + if related.get("title"): + self.id_worker(related.get("relatedIdentifier"), venids_list, self.issn_worker) + + if venids_list: + name_and_id = cont_title + ' [' + ' '.join(venids_list) + ']' if cont_title else '[' + ' '.join(venids_list) + ']' + else: + name_and_id = cont_title + + return name_and_id + + #added the call to find_datacite_orcid + def add_editors_to_agent_list(self, item: dict, ag_list: list) -> list: + ''' + This function returns the the agents list updated with the editors dictionaries, in the correct format. + + :params item: the item's dictionary (attributes), ag_list: the + :type item: dict, ag_list: list + + :returns: listthe agents list updated with the editors dictionaries, in the correct format. 
+ ''' + agent_list = ag_list + if item.get("contributors"): + editors = [contributor for contributor in item.get("contributors") if + contributor.get("contributorType") == "Editor"] + for ed in editors: + agent = {} + agent["role"] = "editor" + if ed.get('name'): + agent["name"] = ed.get("name") + if ed.get("nameType") == "Personal" or ("familyName" in ed or "givenName" in ed): + agent["family"] = ed.get("familyName") + agent["given"] = ed.get("givenName") + if ed.get("nameIdentifiers"): + orcid_ids = [x.get("nameIdentifier") for x in ed.get("nameIdentifiers") if + x.get("nameIdentifierScheme") == "ORCID"] + if orcid_ids: + orcid_id = self.find_datacite_orcid(orcid_ids) + if orcid_id: + agent["orcid"] = orcid_id + + missing_names = [x for x in ["family", "given", "name"] if x not in agent] + for mn in missing_names: + agent[mn] = "" + agent_list.append(agent) + return agent_list + + # added the call to find_datacite_orcid + def add_authors_to_agent_list(self, item: dict, ag_list: list) -> list: + ''' + This function returns the agents list updated with the authors dictionaries, in the correct format. + + :params item: the item's dictionary (attributes), ag_list: the + :type item: dict, ag_list: list + + :returns: list the agents list updated with the authors dictionaries, in the correct format. + ''' + agent_list = ag_list + if item.get("creators"): + creators = item.get("creators") + for c in creators: + agent = {} + agent["role"] = "author" + if c.get("name"): + agent["name"] = c.get("name") + if c.get("nameType") == "Personal" or ("familyName" in c or "givenName" in c): + agent["family"] = c.get("familyName") + agent["given"] = c.get("givenName") + if c.get("nameIdentifiers"): + orcid_ids = [x.get("nameIdentifier") for x in c.get("nameIdentifiers") if + x.get("nameIdentifierScheme") == "ORCID"] + if orcid_ids: + orcid_id = self.find_datacite_orcid(orcid_ids) + if orcid_id: + agent["orcid"] = orcid_id + missing_names = [x for x in ["family", "given", "name"] if x not in agent] + for mn in missing_names: + agent[mn] = "" + agent_list.append(agent) + return agent_list + + #added + def find_datacite_orcid(self, all_author_ids): + orcid = "" + if all_author_ids: + for identifier in all_author_ids: + norm_orcid = self.orcid_m.normalise(identifier, include_prefix = True) + ## Check orcid presence in memory and storage before validating the id + validity_value_orcid = self.validated_as({"identifier": norm_orcid, "schema": "orcid"}) + if validity_value_orcid is True: + orcid = norm_orcid + elif validity_value_orcid is None: + #if self.RA_redis.get(norm_orcid): + if norm_orcid in self._redis_values_ra: + orcid = norm_orcid + # if the id is not in redis db, validate it before appending + elif self.tmp_orcid_m.is_valid(norm_orcid): + orcid = norm_orcid + return orcid + # added + def memory_to_storage(self): + kv_in_memory = self.temporary_manager.get_validity_list_of_tuples() + self.storage_manager.set_multi_value(kv_in_memory) + self.temporary_manager.delete_storage() + + # added (division in first and second iteration) + def extract_all_ids(self, citation, is_first_iteration: bool): + + if is_first_iteration: + all_br = set() + all_ra = set() + + subject_id = citation['id'] + norm_id = self.doi_m.normalise(subject_id, include_prefix=True) + if norm_id: + # if it was possible to normalise the id according to one of the schemas accepted in oc, add + # the id to the set of retrieved br ids for the citation. 
+ all_br.add(norm_id) + + attributes = citation.get("attributes") + if attributes: + creators = attributes.get("creators") + if creators: + for c in creators: + c_ids = c.get("nameIdentifiers") + if c_ids: + norm_c_orcids = {self.orcid_m.normalise(x.get("nameIdentifier"), include_prefix=True) for x in c.get("nameIdentifiers") if + x.get("nameIdentifierScheme") == "ORCID"} + if norm_c_orcids: + # if it was possible to normalise any id according to orcid schema, add + # the norm_orcids to the set of retrieved ra ids for the citation. + all_ra.update(norm_c_orcids) + + if attributes.get("contributors"): + editors = [contributor for contributor in citation .get("contributors") if + contributor.get("contributorType") == "Editor"] + for ed in editors: + if ed.get("nameIdentifiers"): + norm_ed_orcids = {self.orcid_m.normalise(x.get("nameIdentifier"), include_prefix=True) for x in ed.get("nameIdentifiers") if + x.get("nameIdentifierScheme") == "ORCID"} + if norm_ed_orcids: + all_ra.update(norm_ed_orcids) + + all_br = list(all_br) + for x in all_br: + if x is None: + all_br.remove(x) + all_ra = list(all_ra) + for y in all_ra: + if y is None: + all_ra.remove(y) + return all_br, all_ra + + #all the objects doi + else: + all_br = set() + attributes = citation["attributes"] + rel_ids = attributes.get("relatedIdentifiers") + if rel_ids: + for ref in rel_ids: + if all(elem in ref for elem in self._needed_info): + relatedIdentifierType = (str(ref["relatedIdentifierType"])).lower() + relationType = str(ref["relationType"]).lower() + if relatedIdentifierType == "doi": + if relationType in self._filter: + rel_id = self.doi_m.normalise(ref["relatedIdentifier"], include_prefix=True) + if rel_id: + all_br.add(rel_id) + all_br = list(all_br) + return all_br + + + #added + def get_reids_validity_list(self, id_list, redis_db): + if redis_db == "ra": + valid_ra_ids = [] + # DO NOT UPDATED (REDIS RETRIEVAL METHOD HERE) + validity_list_ra = self.RA_redis.mget(id_list) + for i, e in enumerate(id_list): + if validity_list_ra[i]: + valid_ra_ids.append(e) + return valid_ra_ids + + elif redis_db == "br": + valid_br_ids = [] + # DO NOT UPDATED (REDIS RETRIEVAL METHOD HERE) + validity_list_br = self.BR_redis.mget(id_list) + for i, e in enumerate(id_list): + if validity_list_br[i]: + valid_br_ids.append(e) + return valid_br_ids + else: + raise ValueError("redis_db must be either 'ra' for responsible agents ids " + "or 'br' for bibliographic resources ids") diff --git a/oc_ds_converter/lib/jsonmanager.py b/oc_ds_converter/lib/jsonmanager.py index c8a0cf9..b32c5c4 100644 --- a/oc_ds_converter/lib/jsonmanager.py +++ b/oc_ds_converter/lib/jsonmanager.py @@ -91,7 +91,7 @@ def get_all_files_by_type(i_dir_or_compr:str, req_type:str, cache_filepath:str|N for cur_file in targz_fd: if cur_file.name.endswith(req_type) and not basename(cur_file.name).startswith(".") and not cur_file in cache: result.append(cur_file) - targz_fd.close() + #targz_fd.close() elif i_dir_or_compr.endswith(".tar"): dest_dir = i_dir_or_compr.replace('.tar', '') + "_decompr_zip_dir" targz_fd = tarfile.open(i_dir_or_compr, "r:*", encoding="utf-8") diff --git a/oc_ds_converter/openaire/openaire_processing.py b/oc_ds_converter/openaire/openaire_processing.py index 8c19fc7..0391611 100644 --- a/oc_ds_converter/openaire/openaire_processing.py +++ b/oc_ds_converter/openaire/openaire_processing.py @@ -135,8 +135,8 @@ def __init__(self, orcid_index: str = None, doi_csv: str = None, publishers_file } if testing: - self.BR_redis= fakeredis.FakeStrictRedis() - 
self.RA_redis= fakeredis.FakeStrictRedis() + self.BR_redis = fakeredis.FakeStrictRedis() + self.RA_redis = fakeredis.FakeStrictRedis() else: self.BR_redis = RedisDataSource("DB-META-BR") @@ -535,7 +535,7 @@ def to_validated_id_list(self, id_dict_of_list): # with the v1 version of the arxiv id. If it is not possible to retrieve an arxiv id from the only id which is # either declared as an arxiv id or starts with the arxiv doi prefix, return None and interrupt the process if len(valid_id_set) == 0: - if len(to_be_processed_input)==1 : + if len(to_be_processed_input) == 1: single_id_dict_list = self.manage_arxiv_single_id(to_be_processed_input) if single_id_dict_list: to_be_processed_id_dict_list = single_id_dict_list diff --git a/oc_ds_converter/preprocessing/datacite.py b/oc_ds_converter/preprocessing/datacite.py index 3de6259..1f1f561 100644 --- a/oc_ds_converter/preprocessing/datacite.py +++ b/oc_ds_converter/preprocessing/datacite.py @@ -139,4 +139,6 @@ def splitted_to_file(self, cur_n, target_n, out_dir, data, headers=None): empt_list = [] return empt_list else: - return data \ No newline at end of file + return data + + diff --git a/oc_ds_converter/run/crossref_process.py b/oc_ds_converter/run/crossref_process.py index 4a808b5..7734e3f 100644 --- a/oc_ds_converter/run/crossref_process.py +++ b/oc_ds_converter/run/crossref_process.py @@ -18,6 +18,7 @@ import csv import os import sys +import tarfile from argparse import ArgumentParser from tarfile import TarInfo from pathlib import Path @@ -67,7 +68,7 @@ def preprocess(crossref_json_dir:str, publishers_filepath:str, orcid_doi_filepat if verbose: print(f'[INFO: crossref_process] Getting all files from {crossref_json_dir}') - all_files, targz_fd = get_all_files_by_type(crossref_json_dir,".json", cache) + all_files, targz_fd = get_all_files_by_type(crossref_json_dir, ".json", cache) if verbose: pbar = tqdm(total=len(all_files)) @@ -76,16 +77,16 @@ def preprocess(crossref_json_dir:str, publishers_filepath:str, orcid_doi_filepat if not redis_storage_manager or max_workers == 1: for filename in all_files: # skip elements starting with ._ - if filename.startswith("._"): - continue + #if filename.startswith("._"): + # continue get_citations_and_metadata(filename, targz_fd, preprocessed_citations_dir, csv_dir, orcid_doi_filepath, wanted_doi_filepath, publishers_filepath, storage_path, redis_storage_manager, testing, cache, is_first_iteration=True) for filename in all_files: # skip elements starting with ._ - if filename.startswith("._"): - continue + #if filename.startswith("._"): + # continue get_citations_and_metadata(filename, targz_fd, preprocessed_citations_dir, csv_dir, orcid_doi_filepath, wanted_doi_filepath, publishers_filepath, storage_path, redis_storage_manager, @@ -132,13 +133,14 @@ def preprocess(crossref_json_dir:str, publishers_filepath:str, orcid_doi_filepat pbar.close() if verbose else None - - -def get_citations_and_metadata(file_name: str, targz_fd, preprocessed_citations_dir: str, csv_dir: str, +def get_citations_and_metadata(file_name, targz_fd, preprocessed_citations_dir: str, csv_dir: str, orcid_index: str, doi_csv: str, publishers_filepath: str, storage_path: str, redis_storage_manager: bool, testing: bool, cache: str, is_first_iteration:bool): + if isinstance(file_name, tarfile.TarInfo): + file_tarinfo = file_name + file_name = file_name.name storage_manager = get_storage_manager(storage_path, redis_storage_manager, testing=testing) if cache: if not cache.endswith(".json"): @@ -151,6 +153,7 @@ def 
get_citations_and_metadata(file_name: str, targz_fd, preprocessed_citations_ lock = FileLock(cache + ".lock") cache_dict = dict() + file_name = file_name write_new = False if os.path.exists(cache): with lock: @@ -262,8 +265,10 @@ def save_files(ent_list, citation_list, is_first_iteration_par: bool): return ent_list, citation_list def task_done(is_first_iteration_par: bool) -> None: + try: + if is_first_iteration_par and "first_iteration" not in cache_dict.keys(): cache_dict["first_iteration"] = set() @@ -334,7 +339,7 @@ def task_done(is_first_iteration_par: bool) -> None: data_citing.append(source_tab_data) save_files(data_citing, index_citations_to_csv, True) - #pbar.close() + '''cited entities: - look for the DOI in the temporary manager and in the storage manager: diff --git a/oc_ds_converter/run/datacite_process_new.py b/oc_ds_converter/run/datacite_process_new.py new file mode 100644 index 0000000..37dfec9 --- /dev/null +++ b/oc_ds_converter/run/datacite_process_new.py @@ -0,0 +1,241 @@ +from pathlib import Path +from oc_ds_converter.lib.jsonmanager import * +from pebble import ProcessFuture, ProcessPool +from oc_ds_converter.oc_idmanager.oc_data_storage.redis_manager import \ + RedisStorageManager +from oc_ds_converter.oc_idmanager.oc_data_storage.sqlite_manager import \ + SqliteStorageManager +from oc_ds_converter.oc_idmanager.oc_data_storage.in_memory_manager import \ + InMemoryStorageManager +from oc_ds_converter.datacite.datacite_processing import DataciteProcessing +import json +from filelock import Timeout, FileLock + + +def preprocess(datacite_ndjson_dir:str, publishers_filepath:str, orcid_doi_filepath:str, + csv_dir:str, wanted_doi_filepath:str=None, cache:str=None, verbose:bool=False, storage_path:str = None, + testing: bool = True, redis_storage_manager: bool = False, max_workers: int = 1, target=50000) -> None: + + els_to_be_skipped = [] + if not testing: + input_dir_cont = os.listdir(datacite_ndjson_dir) + for el in input_dir_cont: + if el.startswith("._"): + els_to_be_skipped.append(os.path.join(datacite_ndjson_dir, el)) + else: + if el.endswith(".zst"): + base_name = el.replace('.zst', '') + if [x for x in os.listdir(datacite_ndjson_dir) if x.startswith(base_name) and x.endswith("decompr_zst_dir")]: + els_to_be_skipped.append(os.path.join(datacite_ndjson_dir, el)) + + if not os.path.exists(csv_dir): + os.makedirs(csv_dir) + + preprocessed_citations_dir = csv_dir + "_citations" + if not os.path.exists(preprocessed_citations_dir): + makedirs(preprocessed_citations_dir) + + if verbose: + if publishers_filepath or orcid_doi_filepath or wanted_doi_filepath: + what = list() + if publishers_filepath: + what.append('publishers mapping') + if orcid_doi_filepath: + what.append('DOI-ORCID index') + if wanted_doi_filepath: + what.append('wanted DOIs CSV') + log = '[INFO: jalc_process] Processing: ' + '; '.join(what) + print(log) + + if verbose: + print(f'[INFO: datacite_process] Getting all files from {datacite_ndjson_dir}') + + req_type = ".ndjson" + all_input_ndjson = [] + if not testing: + els_to_be_skipped_cont = [x for x in els_to_be_skipped if x.endswith(".zst")] + + if els_to_be_skipped_cont: + for el_to_skip in els_to_be_skipped_cont: + if el_to_skip.startswith("._"): + continue + base_name_el_to_skip = el_to_skip.replace('.zst', '') + for el in os.listdir(datacite_ndjson_dir): + if el == base_name_el_to_skip + "decompr_zst_dir": + # if el.startswith(base_name_el_to_skip) and el.endswith("decompr_zst_dir"): + #CHECK + all_input_ndjson = [os.path.join(datacite_ndjson_dir, 
el, file) for file in os.listdir(os.path.join(datacite_ndjson_dir, el)) if not file.endswith(".json") and not file.startswith("._")] + + if len(all_input_ndjson) == 0: + + for lev_zst in os.listdir(datacite_ndjson_dir): + all_input_ndjson, targz_fd = get_all_files_by_type(os.path.join(datacite_ndjson_dir, lev_zst), req_type, cache) + + # in test files the decompressed directory, at the end of each execution of the process, is always deleted + else: + for lev_zst in os.listdir(datacite_ndjson_dir): + all_input_ndjson, targz_fd = get_all_files_by_type(os.path.join(datacite_ndjson_dir, lev_zst), req_type, cache) + + + # We need to understand how often (how many processed files) we should send the call to Redis + if not redis_storage_manager or max_workers == 1: + for ndjson_file in all_input_ndjson:#it should be one + for idx, chunk in enumerate(read_ndjson_chunk(ndjson_file, target), start=1): + chunk_to_save = f'chunk_{idx}' + get_citations_and_metadata(ndjson_file, chunk, preprocessed_citations_dir, csv_dir, chunk_to_save, orcid_doi_filepath, + wanted_doi_filepath, publishers_filepath, storage_path, + redis_storage_manager, + testing, cache, is_first_iteration=True) + for ndjson_file in all_input_ndjson: + for idx, chunk in enumerate(read_ndjson_chunk(ndjson_file, target), start=1): + chunk_to_save = f'chunk_{idx}' + get_citations_and_metadata(ndjson_file, chunk, preprocessed_citations_dir, csv_dir, chunk_to_save, orcid_doi_filepath, + wanted_doi_filepath, publishers_filepath, storage_path, + redis_storage_manager, + testing, cache, is_first_iteration=False) + + elif redis_storage_manager or max_workers > 1: + + with ProcessPool(max_workers=max_workers, max_tasks=1) as executor: + for ndjson_file in all_input_ndjson: + for idx, chunk in enumerate(read_ndjson_chunk(ndjson_file, target), start=1): + chunk_to_save = f'chunk_{idx}' + future: ProcessFuture = executor.schedule( + function=get_citations_and_metadata, + args=( + ndjson_file, chunk, preprocessed_citations_dir, csv_dir, chunk_to_save, orcid_doi_filepath, wanted_doi_filepath, + publishers_filepath, storage_path, redis_storage_manager, testing, cache, True)) + + with ProcessPool(max_workers=max_workers, max_tasks=1) as executor: + for ndjson_file in all_input_ndjson: + for idx, chunk in enumerate(read_ndjson_chunk(ndjson_file, target), start=1): + chunk_to_save = f'chunk_{idx}' + future: ProcessFuture = executor.schedule( + function=get_citations_and_metadata, + args=( + ndjson_file, chunk, preprocessed_citations_dir, csv_dir, chunk_to_save, orcid_doi_filepath, wanted_doi_filepath, + publishers_filepath, storage_path, redis_storage_manager, testing, cache, False)) + + if cache: + if os.path.exists(cache): + os.remove(cache) + lock_file = cache + ".lock" + if os.path.exists(lock_file): + os.remove(lock_file) + + +def get_citations_and_metadata(ndjson_file: str, chunk: list, preprocessed_citations_dir: str, csv_dir: str, chunk_to_save:str, + orcid_index: str, + doi_csv: str, publishers_filepath_jalc: str, storage_path: str, + redis_storage_manager: bool, + testing: bool, cache: str, is_first_iteration:bool): + storage_manager = get_storage_manager(storage_path, redis_storage_manager, testing=testing) + if cache: + if not cache.endswith(".json"): + cache = os.path.join(os.getcwd(), "cache.json") + else: + if not os.path.exists(os.path.abspath(os.path.join(cache, os.pardir))): + Path(os.path.abspath(os.path.join(cache, os.pardir))).mkdir(parents=True, exist_ok=True) + else: + cache = os.path.join(os.getcwd(), "cache.json") + + 
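+    # The cache JSON records, per iteration ("first_iteration"/"second_iteration") and per ndjson
+    # file name, the chunk labels (e.g. "chunk_3") that were already completed, so that a re-run
+    # after an interruption can return early for those chunks (see the cache_dict checks below).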
last_part_processed = 0 + lock = FileLock(cache + ".lock") + cache_dict = dict() + write_new = False + + if os.path.exists(cache): + with lock: + with open(cache, "r", encoding="utf-8") as c: + try: + cache_dict = json.load(c) + except: + write_new = True + else: + write_new = True + + if write_new: + with lock: + with open(cache, "w", encoding="utf-8") as c: + json.dump(cache_dict, c) + + + ndjson_filename = Path(ndjson_file).name + if cache_dict.get("first_iteration"): + if is_first_iteration and chunk_to_save in cache_dict["first_iteration"][ndjson_filename]: + return + + if cache_dict.get("second_iteration"): + if not is_first_iteration and chunk_to_save in cache_dict["second_iteration"][ndjson_filename]: + return + + if is_first_iteration: + dc_csv = DataciteProcessing(orcid_index=orcid_index, doi_csv=doi_csv, + publishers_filepath_jalc=publishers_filepath_jalc, + storage_manager=storage_manager, testing=testing, citing=True) + elif not is_first_iteration: + dc_csv = DataciteProcessing(orcid_index=orcid_index, doi_csv=doi_csv, + publishers_filepath_jalc=publishers_filepath_jalc, + storage_manager=storage_manager, testing=testing, citing=False) + + filename_without_ext = ndjson_filename.replace('.ndjson', '')+'_'+chunk_to_save + filepath_ne = os.path.join(csv_dir, f'{os.path.basename(filename_without_ext)}') + filepath_citations_ne = os.path.join(preprocessed_citations_dir, f'{os.path.basename(filename_without_ext)}') + + filepath = os.path.join(csv_dir, f'{os.path.basename(filename_without_ext)}.csv') + filepath_citations = os.path.join(preprocessed_citations_dir, f'{os.path.basename(filename_without_ext)}.csv') + pathoo(filepath) + pathoo(filepath_citations) + + + + + + + + +def get_storage_manager(storage_path: str, redis_storage_manager: bool, testing: bool): + if not redis_storage_manager: + if storage_path: + if not os.path.exists(storage_path): + # if parent dir does not exist, it is created + if not os.path.exists(os.path.abspath(os.path.join(storage_path, os.pardir))): + Path(os.path.abspath(os.path.join(storage_path, os.pardir))).mkdir(parents=True, exist_ok=True) + if storage_path.endswith(".db"): + storage_manager = SqliteStorageManager(storage_path) + elif storage_path.endswith(".json"): + storage_manager = InMemoryStorageManager(storage_path) + + if not storage_path and not redis_storage_manager: + new_path_dir = os.path.join(os.getcwd(), "storage") + if not os.path.exists(new_path_dir): + os.makedirs(new_path_dir) + storage_manager = SqliteStorageManager(os.path.join(new_path_dir, "id_valid_dict.db")) + elif redis_storage_manager: + if testing: + storage_manager = RedisStorageManager(testing=True) + else: + storage_manager = RedisStorageManager(testing=False) + return storage_manager + +def pathoo(path:str) -> None: + if not os.path.exists(os.path.dirname(path)): + os.makedirs(os.path.dirname(path)) + +def read_ndjson_chunk(file_path, chunk_size): + with open(file_path, 'r', encoding='utf-8') as file: + while True: + chunk = [] + for _ in range(chunk_size): + line = file.readline() + if not line: + break + try: + data = json.loads(line) + chunk.append(data) + except json.JSONDecodeError as e: + # Handle JSON decoding errors if necessary + print(f"Error decoding JSON: {e}") + if not chunk: + break + yield chunk \ No newline at end of file diff --git a/oc_ds_converter/run/jalc_process.py b/oc_ds_converter/run/jalc_process.py index 51d15bf..b140b26 100644 --- a/oc_ds_converter/run/jalc_process.py +++ b/oc_ds_converter/run/jalc_process.py @@ -339,32 +339,33 @@ def 
task_done(is_first_iteration_par: bool) -> None: for entity in tqdm(source_dict): if entity: d = entity.get("data") - norm_source_id = jalc_csv.doi_m.normalise(d['doi'], include_prefix=True) - if norm_source_id: - cit_list_entities = [x for x in d["citation_list"] if x.get("doi")] - # filtering out entities with citations without dois - if cit_list_entities: - valid_target_ids = [] - for cited_entity in cit_list_entities: - norm_id = jalc_csv.doi_m.normalise(cited_entity["doi"], include_prefix=True) - if norm_id: - stored_validity = jalc_csv.validated_as(norm_id) - if stored_validity is None: - if norm_id in jalc_csv.to_validated_id_list(norm_id): - target_tab_data = jalc_csv.csv_creator(cited_entity) - if target_tab_data: - processed_target_id = target_tab_data.get("id") - if processed_target_id: - data_cited.append(target_tab_data) - valid_target_ids.append(norm_id) - elif stored_validity is True: - valid_target_ids.append(norm_id) - - for target_id in valid_target_ids: - citation = dict() - citation["citing"] = norm_source_id - citation["cited"] = target_id - index_citations_to_csv.append(citation) + if d.get("citation_list"): + norm_source_id = jalc_csv.doi_m.normalise(d['doi'], include_prefix=True) + if norm_source_id: + cit_list_entities = [x for x in d["citation_list"] if x.get("doi")] + # filtering out entities with citations without dois + if cit_list_entities: + valid_target_ids = [] + for cited_entity in cit_list_entities: + norm_id = jalc_csv.doi_m.normalise(cited_entity["doi"], include_prefix=True) + if norm_id: + stored_validity = jalc_csv.validated_as(norm_id) + if stored_validity is None: + if norm_id in jalc_csv.to_validated_id_list(norm_id): + target_tab_data = jalc_csv.csv_creator(cited_entity) + if target_tab_data: + processed_target_id = target_tab_data.get("id") + if processed_target_id: + data_cited.append(target_tab_data) + valid_target_ids.append(norm_id) + elif stored_validity is True: + valid_target_ids.append(norm_id) + + for target_id in valid_target_ids: + citation = dict() + citation["citing"] = norm_source_id + citation["cited"] = target_id + index_citations_to_csv.append(citation) save_files(data_cited, index_citations_to_csv, False) def get_storage_manager(storage_path: str, redis_storage_manager: bool, testing: bool): if not redis_storage_manager: diff --git a/scripts_analysis/jalc_languages_metadata/jalc_languages_metadata_count.py b/scripts_analysis/jalc_languages_metadata/jalc_languages_metadata_count.py new file mode 100644 index 0000000..39ebb2d --- /dev/null +++ b/scripts_analysis/jalc_languages_metadata/jalc_languages_metadata_count.py @@ -0,0 +1,386 @@ +import csv +import os +import pathlib +import zipfile +from os.path import isdir, basename, exists +from os import walk, sep, makedirs +import zstandard as zstd +import json +from tqdm import tqdm +import tarfile + + +class CountMetadataLang: + + def get_all_files_by_type(self, i_dir_or_compr: str, req_type: str): + result = [] + targz_fd = None + + if isdir(i_dir_or_compr): + + for cur_dir, cur_subdir, cur_files in walk(i_dir_or_compr): + for cur_file in cur_files: + if cur_file.endswith(req_type) and not basename(cur_file).startswith("."): + result.append(os.path.join(cur_dir, cur_file)) + elif i_dir_or_compr.endswith("tar.gz"): + targz_fd = tarfile.open(i_dir_or_compr, "r:gz", encoding="utf-8") + for cur_file in targz_fd: + if cur_file.name.endswith(req_type) and not basename(cur_file.name).startswith("."): + result.append(cur_file) + targz_fd.close() + elif i_dir_or_compr.endswith(".tar"): + 
dest_dir = i_dir_or_compr.replace('.tar', '') + "_decompr_zip_dir" + targz_fd = tarfile.open(i_dir_or_compr, "r:*", encoding="utf-8") + targz_fd.extractall(dest_dir) + + for cur_dir, cur_subdir, cur_files in walk(dest_dir): + for cur_file in cur_files: + if cur_file.endswith(req_type) and not basename(cur_file).startswith("."): + result.append(cur_dir + sep + cur_file) + + targz_fd.close() + + + elif i_dir_or_compr.endswith("zip"): + with zipfile.ZipFile(i_dir_or_compr, 'r') as zip_ref: + dest_dir = i_dir_or_compr.replace('.zip', '') + "_decompr_zip_dir" + if not exists(dest_dir): + makedirs(dest_dir) + zip_ref.extractall(dest_dir) + for cur_dir, cur_subdir, cur_files in walk(dest_dir): + for cur_file in cur_files: + if cur_file.endswith(req_type) and not basename(cur_file).startswith("."): + result.append(cur_dir + sep + cur_file) + + elif i_dir_or_compr.endswith("zst"): + input_file = pathlib.Path(i_dir_or_compr) + dest_dir = i_dir_or_compr.split(".")[0] + "_decompr_zst_dir" + with open(input_file, 'rb') as compressed: + decomp = zstd.ZstdDecompressor() + if not exists(dest_dir): + makedirs(dest_dir) + output_path = pathlib.Path(dest_dir) / input_file.stem + if not exists(output_path): + with open(output_path, 'wb') as destination: + decomp.copy_stream(compressed, destination) + for cur_dir, cur_subdir, cur_files in walk(dest_dir): + for cur_file in cur_files: + if cur_file.endswith(req_type) and not basename(cur_file).startswith("."): + result.append(cur_dir + sep + cur_file) + else: + print("It is not possible to process the input path.", i_dir_or_compr) + return result, targz_fd + + def count_publisher_lang(self, source_dict, citing=True): + count_en_ja_citing = 0 + count_en_ja_cited = 0 + count_en_citing = 0 + count_en_cited = 0 + count_ja_citing = 0 + count_ja_cited = 0 + for entity_dict in source_dict: + data = entity_dict["data"] + if citing: + if data.get('publisher_list'): + # if the name of the publisher is given both in japanese and in english + pub_list = data['publisher_list'] + pub_lang = [item['lang'] for item in pub_list if 'lang' in item] + if 'en' in pub_lang and 'ja' in pub_lang: + count_en_ja_citing += 1 + else: + if 'en' in pub_lang: + count_en_citing += 1 + elif 'ja' in pub_lang: + count_ja_citing += 1 + else: + if data.get('citation_list'): + cit_list_entities = [x for x in data["citation_list"]] + for cit in cit_list_entities: + if cit.get('publisher_list'): + pub_list_cit = cit['publisher_list'] + pub_lang = [item['lang'] for item in pub_list_cit if 'lang' in item] + if 'en' in pub_lang and 'ja' in pub_lang: + count_en_ja_cited += 1 + else: + if 'en' in pub_lang: + count_en_cited += 1 + elif 'ja' in pub_lang: + count_ja_cited += 1 + if citing: + return count_en_ja_citing, count_en_citing, count_ja_citing + else: + return count_en_ja_cited, count_en_cited, count_ja_cited + + def count_journal_title_lang(self, source_dict, citing=True): + count_en_ja_citing = 0 + count_en_ja_cited = 0 + count_en_citing = 0 + count_en_cited = 0 + count_ja_citing = 0 + count_ja_cited = 0 + for entity_dict in source_dict: + data = entity_dict["data"] + if citing: + if data.get('journal_title_name_list'): + candidate_venues = data['journal_title_name_list'] + full_venue = [item for item in candidate_venues if 'type' in item if item['type'] == 'full'] + if full_venue: + full_venue_lang = set(item['lang'] for item in full_venue if 'lang' in item) + if 'en' in full_venue_lang and 'ja' in full_venue_lang: + count_en_ja_citing += 1 + else: + if 'en' in full_venue_lang: + count_en_citing += 
1 + elif 'ja' in full_venue_lang: + count_ja_citing += 1 + else: + abbr_venue = [item for item in candidate_venues if 'type' in item if item['type'] == 'abbreviation'] + if abbr_venue: + abbr_venue_lang = set(item['lang'] for item in abbr_venue if 'lang' in item) + if 'en' in abbr_venue_lang and 'ja' in abbr_venue_lang: + count_en_ja_citing += 1 + else: + if 'en' in abbr_venue_lang: + count_en_citing += 1 + elif 'ja' in abbr_venue_lang: + count_ja_citing += 1 + else: + venues = [item for item in candidate_venues] + if venues: + lang_venue = [venue['lang'] for venue in venues if 'lang' in venue] + if 'en' in lang_venue and 'ja' in lang_venue: + count_en_ja_citing += 1 + else: + if 'en' in lang_venue: + count_en_citing += 1 + elif 'ja' in lang_venue: + count_ja_citing += 1 + + else: + if data.get('citation_list'): + cit_list_entities = [x for x in data["citation_list"]] + for cit in cit_list_entities: + if cit.get('journal_title_name_list'): + candidate_venues = cit['journal_title_name_list'] + full_venue = [item for item in candidate_venues if 'type' in item if item['type'] == 'full'] + if full_venue: + full_venue_lang = set(item['lang'] for item in full_venue if 'lang' in item) + if 'en' in full_venue_lang and 'ja' in full_venue_lang: + count_en_ja_cited += 1 + else: + if 'en' in full_venue_lang: + count_en_cited += 1 + elif 'ja' in full_venue_lang: + count_ja_cited += 1 + else: + abbr_venue = [item for item in candidate_venues if 'type' in item if + item['type'] == 'abbreviation'] + if abbr_venue: + abbr_venue_lang = set(item['lang'] for item in abbr_venue if 'lang' in item) + if 'en' in abbr_venue_lang and 'ja' in abbr_venue_lang: + count_en_ja_cited += 1 + else: + if 'en' in abbr_venue_lang: + count_en_cited += 1 + elif 'ja' in abbr_venue_lang: + count_ja_cited += 1 + else: + venues = [item for item in candidate_venues] + if venues: + lang_venue = [venue['lang'] for venue in venues if 'lang' in venue] + if 'en' in lang_venue and 'ja' in lang_venue: + count_en_ja_cited += 1 + else: + if 'en' in lang_venue: + count_en_cited += 1 + elif 'ja' in lang_venue: + count_ja_cited += 1 + + + + if citing: + return count_en_ja_citing, count_en_citing, count_ja_citing + else: + return count_en_ja_cited, count_en_cited, count_ja_cited + + + + def count_title_lang(self, source_dict, citing=True): + count_en_ja_citing = 0 + count_en_ja_cited = 0 + count_en_citing = 0 + count_en_cited = 0 + count_ja_citing = 0 + count_ja_cited = 0 + for entity_dict in source_dict: + data = entity_dict["data"] + if citing: + if data.get('title_list'): + title_list = data['title_list'] + title_lang = [item['lang'] for item in title_list if 'lang' in item] + if 'en' in title_lang and 'ja' in title_lang: + count_en_ja_citing += 1 + else: + if 'en' in title_lang: + count_en_citing += 1 + elif 'ja' in title_lang: + count_ja_citing += 1 + else: + if data.get('citation_list'): + cit_list_entities = [x for x in data["citation_list"]] + for cit in cit_list_entities: + if cit.get('title_list'): + title_list = cit['title_list'] + title_lang = [item['lang'] for item in title_list if 'lang' in item] + if 'en' in title_lang and 'ja' in title_lang: + count_en_ja_cited += 1 + else: + if 'en' in title_lang: + count_en_cited += 1 + elif 'ja' in title_lang: + count_ja_cited += 1 + if citing: + return count_en_ja_citing, count_en_citing, count_ja_citing + else: + return count_en_ja_cited, count_en_cited, count_ja_cited + + + def count_creator_names_lang(self, source_dict, citing=True): + count_en_ja_citing = 0 + count_en_ja_cited = 0 + 
count_en_citing = 0 + count_en_cited = 0 + count_ja_citing = 0 + count_ja_cited = 0 + for entity_dict in source_dict: + data = entity_dict["data"] + if citing: + if data.get('creator_list'): + creator_list = data['creator_list'] + for creator_dict in creator_list: + if creator_dict.get('names'): + names_list = creator_dict['names'] + names_lang = [creator['lang'] for creator in names_list if 'lang' in creator] + if 'en' in names_lang and 'ja' in names_lang: + count_en_ja_citing += 1 + else: + if 'en' in names_lang: + count_en_citing += 1 + elif 'ja' in names_lang: + count_ja_citing += 1 + else: + if data.get('citation_list'): + cit_list_entities = [x for x in data["citation_list"]] + for cit in cit_list_entities: + if cit.get('creator_list'): + creator_list = cit['creator_list'] + for creator_dict in creator_list: + if creator_dict.get('names'): + names_list = creator_dict['names'] + names_lang = [creator['lang'] for creator in names_list if 'lang' in creator] + if 'en' in names_lang and 'ja' in names_lang: + count_en_ja_cited += 1 + else: + if 'en' in names_lang: + count_en_cited += 1 + elif 'ja' in names_lang: + count_ja_cited += 1 + if citing: + return count_en_ja_citing, count_en_citing, count_ja_citing + else: + return count_en_ja_cited, count_en_cited, count_ja_cited + + def call_functions_for_all_zips(self, list_of_zips, funct_list: list, csv_file:str, citing=True, cited=True): + data_to_be_saved = [] + for func in funct_list: + count_en_ja_citing, count_en_citing, count_ja_citing = 0, 0, 0 + count_en_ja_cited, count_en_cited, count_ja_cited = 0, 0, 0 + for zip_file in tqdm(list_of_zips): + zip_f = zipfile.ZipFile(zip_file) + source_data = [x for x in zip_f.namelist() if not x.startswith("doiList")] + source_dict = [] + # here I create a list containing all the json in the zip folder as dictionaries + for json_file in source_data: + f = zip_f.open(json_file, 'r') + my_dict = json.load(f) + source_dict.append(my_dict) + if citing and cited: + result_citing = func(source_dict, True) + result_cited = func(source_dict, False) + count_en_ja_citing += result_citing[0] + count_en_citing += result_citing[1] + count_ja_citing += result_citing[2] + count_en_ja_cited += result_cited[0] + count_en_cited += result_cited[1] + count_ja_cited += result_cited[2] + else: + if citing: + result_citing = func(source_dict, True) + count_en_ja_citing += result_citing[0] + count_en_citing += result_citing[1] + count_ja_citing += result_citing[2] + else: + result_cited = func(source_dict, False) + count_en_ja_cited += result_cited[0] + count_en_cited += result_cited[1] + count_ja_cited += result_cited[2] + + if citing and cited: + dict_result1 = {"en_ja_citing": count_en_ja_citing, "en_citing": count_en_citing, "ja_citing": count_ja_citing, "en_ja_cited": count_en_ja_cited, "en_cited": count_en_cited, "ja_cited": count_ja_cited} + data_to_be_saved.append(dict_result1) + else: + if citing: + dict_result2 = {"en_ja_citing": count_en_ja_citing, "en_citing": count_en_citing, "ja_citing": count_ja_citing} + data_to_be_saved.append(dict_result2) + else: + dict_result3 = {"en_ja_cited": count_en_ja_cited, "en_cited": count_en_cited, "ja_cited": count_ja_cited} + data_to_be_saved.append(dict_result3) + try: + with open(csv_file, 'w', newline='') as csvf: + labels = ["en_ja_citing", "en_citing", "ja_citing", "en_ja_cited", "en_cited", "ja_cited"] + writer = csv.DictWriter(csvf, fieldnames=labels) + writer.writeheader() + for elem in data_to_be_saved: + writer.writerow(elem) + except IOError: + print("I/O error") + + 
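+    # A minimal usage sketch, assuming an input directory laid out like the test sample
+    # (test/JOCI_PRE_SAMPLE) and a hypothetical output CSV path:
+    #
+    #     counter = CountMetadataLang()
+    #     zips = counter.find_zip_subfiles("test/JOCI_PRE_SAMPLE")
+    #     funcs = [counter.count_publisher_lang, counter.count_journal_title_lang,
+    #              counter.count_title_lang, counter.count_creator_names_lang]
+    #     counter.call_functions_for_all_zips(zips, funcs, "jalc_lang_counts.csv",
+    #                                         citing=True, cited=True)
+    #
+    # Each counting function contributes one row of aggregated counts to the output CSV.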
+ + + def find_zip_subfiles(self, jalc_json_dir): + els_to_be_skipped =[] + input_dir_cont = os.listdir(jalc_json_dir) + # for element in the list of elements in jalc_json_dir (input) + for el in input_dir_cont: # should be one (the input dir contains 1 zip) + if el.startswith("._"): + # skip elements starting with ._ + els_to_be_skipped.append(os.path.join(jalc_json_dir, el)) + else: + if el.endswith(".zip"): + base_name = el.replace('.zip', '') + if [x for x in os.listdir(jalc_json_dir) if x.startswith(base_name) and x.endswith("decompr_zip_dir")]: + els_to_be_skipped.append(os.path.join(jalc_json_dir, el)) + req_type = ".zip" + all_input_zip = [] + els_to_be_skipped_cont = [x for x in els_to_be_skipped if x.endswith(".zip")] + + if els_to_be_skipped_cont: + for el_to_skip in els_to_be_skipped_cont: + if el_to_skip.startswith("._"): + continue + base_name_el_to_skip = basename(el_to_skip).replace('.zip', '') + for el in os.listdir(jalc_json_dir): + if el.startswith(base_name_el_to_skip) and el.endswith("decompr_zip_dir"): + all_input_zip = [os.path.join(jalc_json_dir, el, file) for file in + os.listdir(os.path.join(jalc_json_dir, el)) if + not file.endswith(".json") and not file.startswith("._")] + + if len(all_input_zip) == 0: + for zip_lev0 in os.listdir(jalc_json_dir): + all_input_zip, targz_fd = self.get_all_files_by_type(os.path.join(jalc_json_dir, zip_lev0), req_type) + return all_input_zip + + +# To execute the function you need to use the method "call_functions_for_all_zips" + diff --git a/scripts_analysis/jalc_languages_metadata/test/JOCI_PRE_SAMPLE/sample_jalc.zip b/scripts_analysis/jalc_languages_metadata/test/JOCI_PRE_SAMPLE/sample_jalc.zip new file mode 100644 index 0000000000000000000000000000000000000000..c18dd67601b7c4282084a34d5298ac265b69c412 GIT binary patch literal 9424 zcma)iWl$a4x-GB>ws3b38e|~>7Vhru?!n#Ng9mqacMBHW9RdV*hv4?I?>W29xmEjC zz1cOptGmbdb@h*~F-DK6APs=Vg!pT0*-EQ7p6Awx5JEuwH4*<@12Hp#K%mbojGmVE z3japJK~VfbDmkU4_aj3{3|>~oqNQ;=hB2$C z%30r;S}&Fq4(WS4T=Gwc{4VLc2i`u0Uis%j%=Vo0)Z&i|cE`sC;o@rfE@3f>%-Vo= z_ZYs9SaqsT1K&|?iZZ&AdWgKa&#&bY%aU6Q2SxU`_}9LNQ}A zjLyPI=!(vQk%PHS6tr{(!I9yUk&$aq$8|_g*kSC7@ysbR&HB&2l>_F3Pf&Po6U#3m zFhgJ8&ilrOJonL#w3)>ZZhtHD@>ZGaAE!-u-W5E0&mBJsPgNeJA@O?YPfZ>kp)4*# zG&+3@#zM2MCz4csQJ0blUeSWmZnKE1by78J&jYJxeZz)@9t+nQIK#b+1r1pxVXjt~ zxnlt+?9rV!2p?}h(!aKPLRr7?t&BUOF+*S;?3-fr|I)(04GN^7Mf|*o88hOTNnorZ z6`uQBWIm*Nn?}u{Bbbp&W!eqFQz)fj#6$<3f?2}IGcQ}QtU&77S!><1UQCT7)w
    fq18q&1_1pD4%aR zv2Dyaw&w;BPSscfjD-bDdwiVjTrQ&FHnk#F9ePZjwG<|GA@>@rw&@2_YpniL`0Z#I zVi&8<;Z5ZVx3~i1wT#cjB#GbUP3}L!4r`=-{)LH^c zj8ib)beF#yp;-T zgkifRt`lC|wOY_0V2Opn+2uzU<4U zGC0qu`KyXp4_5=U&k+MuO-zn-R&AgP4|0@(k17Xd4Q;9DI2{V})|FKhh$gV-;)%}d z9D?n6!@tpfXkR9$N9@i#?)k{gi&nq?>RYSi*zYRZ^#Opv*M!Sa!c^^sKGByL#6kCK(%?GZ?X;P$SCiq-`OQrJ{Ax!?pK! zbn*WbO^7qW!n*=Y_!jm;(-i3i!DXL{L1A=2@xt+8dkZ->PgE`8gb$LYch29DVR5?$ zmuZK)jLGOhB|B~57=E}wu>NMhHTPwYRr`=lK{uFrd7BVuXLLJu$x^}(2BWR$P^=D_ z#lJ_9#ao3hF!gyq!7W=)a1`Z>)ztxGxSYeM=pPPatS~`(Wz$9j;jZ)xzF=-{UYqjJ z0`k|Lh|%n36QWCJT0)%rV*!^paNeX})MDVV?MI3#*Npb7xmfn*Jqu-PI$}F)znN-b zc|A6oDOu0>S)kPPAaQ-7M`M=E5cgQ0XFOV6b>tan*4(%{B?m%BPMYm~S za~q|B&y;{RaHz}2jnB^?l_H~~m-AKZX_(MdG^`{%=QfhA%q<%bDFRe7sa4w&gQNQy zuTKDj*X4CAw%*>lT#J4`u+x9YLBN$=sZeSqO}-`p8{()eleU_qOud@i{fv!$bTcSD zh4f;XdN|MEGdJ9y@UJ7X)0P^9JzW*2yr&yh?W-=-|vCAJ$1VC%gOc;e>x{%68p{e^>~gIAa=hhE2bv9iHI5n;S-8)RiH}-z>=Mc3}T|!fwY_5zWus2Sm5H< zMc$RvdX>M4ry0DnicH{&n~Fo)#x*y}*IW2&qn`0kGqa`_1WwS6Ttcd89wp}b)3!&V z)?d`*`C*X~j%1zA1Gs!+Ntkp>FYA_;p^>ICDp&{S!-4QZhhI_V7pFIEe=8Mvydq28 zT%foF*7WKxEzWMEb>3zR)DoJ=E1F{gbfqpEE2obHA3b z=miiyLnRBB9~hlSsow?|htk_!R)o@r&8@jqHmVN=WH^$0N=juidUI8}Lhe`Topr)h zl|-5%OJ_wzbb5)l$+eS?#RO)RgZ%^}r&klp_tNVMy5b*}Km4;t1Vcf|Is0zJ8C{yjryg4iF$+j#9(wZf zbBIog>W42wvp7(3r&lHEqGTa$dhRQEoQq9DzZNZ5v#&>4T}e~?HEdVK3L z*nbje9?aE-6KelltahDoVaTS7>DnRcA+(P*5qwF2&q`3}hB27x)FFcrjht>_KQAK8 zKxekK2EBecO;K@Ab~F-yX3nNRg%Y=={(C22h%0?<$SgVy$vPjF6KQN8<>usdx4-q# zDB|49A!zLal&qK5fud9T6a>6$8sO#Soly1w39hoR$p0|eivV#!Vtg@;R=+>Ap*#RZ z8TSz2W&||KOlaG~qOhHdi5n82U^OeXzt$7@c|&Ljf>~M)h3t15+Dk@$MD;CJW}g)q zFr!1kCDUs89&n8?;CKLW=bnNWxDgR(S5X-PZ`YYReN6X1`%kG>Kz(E8lC>8T z71}f`A#p)|aKB!W$=!qUz=|rojckVjtviG^AM62!8ulm%j@Vx zn!n5I0oJt63;l3nk-vCqZ3m=fQlV%FcDQjCX1frr%X#2tfaSq2(&|8D=__|BQLEXY z>a|M>QU?bu8`x1YH`1?0^)MGS=0Br@*taa-lXX~jC_)~ByHhFOyL&JqIUj{JuPakAhdo0pWdo9K5W*hK za4UJbvjG)Nig+h)%3`K;6gPNMCd1C2-Rh;;`1Lk|$CJUM~u zcU9ULC}|E*y%px{QJyVKsE+V?FeM^W*5eD;5EIlG+;RBV8BS{Vtz2o%_1vT=7I89+ ziAR<@BXlD?)b#77qS`BuC(n=60|kr$cHOSJY;We>*k%O#utD*01z0+DvaZwFAN7#Gpqb<;cMI;sWJ<{ z&IkGOQ~v_{1|3*NA)cNM;@LL^lp#r)qxNYcTq5p=Eqa zx8WV5X~V5sn~Aohf}fceI61Cr!MUYoQM(ok+cT}9_m<;!@ka0;3q1FiS*AY168+4EnbV_Oc@wLVKqJzA7&JKX58t;KEpF)|KIWh>yvI38?MV zPM{~UTUqBGaYpTFLdB6wr>asd^yu|iuA5V--ocdJrkrpqlaYMG&RXG45WmJeMuTNc z;Qx|+cgKE{k@Yxn+QpNrbL?ql5Qw;I1_Lt!QFSHMHCWq=&ST*bAUO~c6fMSsCvW7G zL@PSBe6Otj~1Ow$zB8=+|rLAS|b~`(J zyhKlFYuuhlw_(C@(%~uPn`&a^52mn`ezmphNa2dNVxy|p#m@W*)sPXa2Zyb_D5kKV z=>Qh0CXJ*F6D3Zm)Xer=ZE`7-QOyG-PF2*jpOo^_<5d{P8YU5RDe+v#{%@COq-n+M z$D0YCT}C-+onQni6vx~TPthpLoe7g%8+UO$4HcmqB zyhh(1)XfMHfvcMFuGq6L)m7Iys?x6pjRa-kq&kjx^rZOf*oMSQM;exhs=-L+^^-d- z!!Z0r6fvvUq5eoFIaq1mJekFkhDA^(Nf*WJnV@E!P5$2HnJA-HYQtLcmA6#d?6?nW zH=%!NhEq^+dT8T2EuM=gE?~g-05mk;u}I2+gz&j4I2oa-DE~*3;5~ybqq4Yxam1Ly zN#nshU|~inVwEpH@3y~cLXfO>B+w*s#AO2&dn-+8$Z3?$EL>em*)<($A1lMdv?TP@ToX<24c7S&PQqDp zDaWtk`&Rljsh$lUI&7ey|xo^b#--U`XLaSW)B!lQ4dxO zf<5Q!jf9V8wl38g9$$Fg zCXyU~I!PgW&U{|v5*@`Qh$s?Qel0pVS~zSYf@ z2d^i1rDPvoE8Y{AN?Z7W3JXuoOhn$U{m656ul&<0^@Z>}nhSU148oEPAUwP$t1{Y)l~b zi^CH?h4(x>Mv1}9$V>n8ZRs+9`uYtq>nn zr>3{~gjsd^5@H5sB;qp4z;O#!_iPkC#c74PYrGpGPsXJPHL8-Tuwum}T={_Pz<}lm&$qconmcjRJkR{9(qWX-hf0x$TOu@i zS;8@Hvy8f=b?1N_byQwlqSEGbW^IvjS#U==rRmkYg}09Ok}~CK+f7>YhUI7M!*q3& zPV?bMmslGfcPG0O!HdiFB|E;i{Y6Dh1>%={GhUBrzVJme{)R>xO1fM=2?Qu1veaZK zyYsf6e8BNa&OYoeWU^Tz(W5UyeT-Mg4!`cJ8va|k z%iZ;p=nH;cIs+WaCLW$8j|Oyc(n-^6W!d1%>7zjzO~<(gU3nHCD_@qzyZ0MZ2;lU! 
zUv}rs)vv{>E`l5~gQf2)E1lmP9u0TG?(dt=&(>dAoh^7g-*IcRXr$ZOuw43*$ymSX z>G!zWb)$Lo-L;Sq6EK*n1=qgjLt_JR0Z&?Qc=Bol6)n{ifvY)BLkOf`7*6WZ-TDxn z8pnguK!8gTKRCAA%jW%T6>aFKOP$2e=e88$73xJ_{EyzNG-HuY^DcbPfF0aaCwzMB zuHg&+?boqH@|oD?4M79NfPxzE$q&@=t<7HG$1%#%c@>A-M)wkTd)=$x&fP(GUlG=H zfAU;UvJ>{l51XS-opLqP?A7nBKXP60i0cJX4mRCFZF4Jjda_(Bvh&LOI4tlwQzUe92a>9EZVkyOI zpqy4(Q4dER1xq8HpqpyN(%4b(w%Q}>3M_DxgOu(Vp<$xiTz+z_3>JYG# zGB~G8gqHKa?#(a6wfJ7P*GYw-%Ohx<5Y`(P3?f10A%-ZGwZha5rt*o|5$q>4#!p1d zWI=xQhZeFP+L~>@Qv@MOplo&L31$)Bg&1+7M;iL)E8nm8Kwu(6p;0>UvtrY-NZOG8 zsA7N7f>HG3F)Ttql$3vwA!Yd*{*@3NjfnB*$d{iI)EM7t+8vBJ`L@;TvyJG0yQC}` zS&)pddewHVEALx367@(rw)b|c;z%4H3=Mn9Rak)4$35<9Xhz*VET!~}RbM0tMkV{d zA~T1)>*MTwBluzKhOAmlNwTV3hcwSr-;j_Q5t z{2F|%B@$zwCHyEI>r)zQyuskN?hCD*cbc_@x6J}s%G@~6(;e6S) zAL&H%*|69QDY&pSt=O-nEVOCD5SjwD5q(O$-vt9Jnu=!OS%3=gsE3T`CdvK@f#dNmEHI9Mu4wpZ9<7I~EZ5ODR%w(1e zjIgtp3=?3p2QQ_eknW{ln{af2pjuod3sp<-vM5H@es5T<0al2HSM|||Q&M@k0)yI1 z9OFuOH2}9?*DQ=QRA+EA-o`QJ7_(io0U^ud9haSV1M+?gBMj4=Qj73wRmhrSbsU*f zn~*J<&giy~El7EIe_t_~Uhl(6a&3o!a(4jglTOF*N}nMrvFxR$^mwyzbvT16ZiD25 z_5uL<@&qm$_z5eYg<{HU+;(8ESi+spVTy7wvl}8A$D*WPpu%$u#0UVI%Hi>k%f_*b ze_0-%se~7$>xl2XH~iC+0-1iEupISNTMJO355yqS8Vss zVcee^e+KFQ?EinFx&Q9>PtNAAMv(v04}kK2_WR#HM?v}n%wJ_tf3|`@$0&lo&i*z~ z5D?dY4Aj8s7DhEJ1jJwSztTq5|D86L|3Q|iM-L+YLE&Tl>$UzXZDiFq{F61BS{qt2 zf*3)}|H>NoG*0YRRWRSx86W*a=Q-=<>d*5eu}Cd6A*+Z;qs!at>=IVN&27yYN##m& zl^F)6J-?XL+TlOLRKa|xI&QumYIT^Ee*tD1A8dWpBO%x*E{^s&=Z*E6yx_4lM{gD; zT9PBAiiqWx`5gz{^l;cX5SN?QDfsveJ|hDL-ISK0C|<{;S@x#Yfodxid0|Turkz>Z zbq7T12bGZyO>YuszNJ}!jFf6~UMq-X8jDyQDkk>vfyn)xcRnvYv>(xY%~FGn&0nPv z$RH_Q|EW7maz@gsR&w;CrV>kQFP4Q&{w8O#4`))~YXa?c84fEPzC_8;MlXkSEg?UV zYJdux(^~GpbezeU*x1DJ8Y43ZSs(dRc6!8gIwGJn0g*MjLWTMwwU6b4OHT=}Nufb9 z&0|j^LyR7DJed7@_{UdhcRD#8mn+G$!Om`1Q)}ab@9thFM-xnIGT^Mcni>@*#>Hxe zFB`1{No;}`4aT@==ZBw+EJT!@u=K?T*f_3#fFdSKLt_o9g&V*C{G~Kunz|nEa3orX z5&@M43&=Dy=J{#lM2;x@CZIS>#2|Yg(Y072I&o=HY5Qfa>xsPIo1TwtFMW+?5=qyn zvfUd=jpIVai<+N%*VADjhhN>>3Dcjipyovy6^>(dn?IfR?xKOa>Dkn-`ZVdnYtOFh z@#fb{(dHveX~;rB*=gP7?{b5AP(eWbN(v+Eyg)KxYi{dA#Zse*nZY=2HC=7dK;b{9_33TT5s ztC;6vR1wyR8X92Coi!8D _N>6t`LhFic)sIa-(-O^YP3Xu~KbBMi{MBaBqdE$B) zXMIXWsC}aUyu{?}wDi{g@)SyOL$}4LX`&_LiFk9=RT$>ymgoAYD8uua=?ib${ZlYB z3Z657eI8O4vt^P;Jkv~tG%7*vK3@~Q_qSXQ8<^g<|1-R9y$YQq1~bwwp0A{ba(bz_zeqg|4k zlEo-w*23Bq>-~9gJcjLkxlf!+~%dFMrnFLGx={ybG zDh)iT_iC0;y}d3V@^srsDO`MS4jwNj>*XLcBG8Z}6ZwTjKzyEkNNQWEWL_CG5mDHX zI;M1kW^b`O@ZAf8WZ>?uisTpt$BhLm;I+P9&qUv8&2m&FFLjw`Ndk2(K3TZ3ym%TS zcIKO8Urtaf*!0-8E-L~8vgH+fF>g>H18!(I*Q80pN5TrgyPgspi#|S?_@x~>lH$rv z!QT3HJRN1O=Q`_nf7{ig!1ZjcIt20+Cq9>@w;n2RESeDOIJ-h!M8st$qw3b~1&H(7 z#G2JP%6M!8D@L4*hyXb!k&JbS$uUEW1Y@1VeHVjnTp2Lh=5w3C4=cQWytu!yu+St2 zpQii8sKoF=SRD1iB(0gk@9T%A)q@lISV5oLSo&4%%^2P__sRAu7P_!6Z8?yXZ)ZiDj91eY-Uq6TA5hGLyatSG z95*T|ZK|ehHrj!x_oErTvW87>CrMQkrZq3QZ<|VISNeIg5^rH6FW!@}G_Xn4l_ZK! 
zL7e9$l4F92Z{3X44k=Mi1aBnLXBdqon$80ivcEvbR?^mW{po%;C;NzVoy!H7EnDph z3AFCnuo3}nkRv6#Aar64zc%#)VDK;T5s3q2IikI14fAmkEF_Q=mt>wyjNwmn z?J12aKB$=(7sf{dyO&a?7E|mN7@0y=X!4vqN0_n1C33t;0q610$W$AAxH}m#^muOX zIQ`6fZ~;9qkVND_CnpVKp(*>k_5ngRmSxf`cV7VECvjEpgYo3{1XX0CdD|t2S8%nV z`8OZaYVSh@PQ44*aXg3{d8Y;OjJLFs1@ovM-A|Qo21TB(hSa|YlO+S6ARFt^1F>Vi zsd(_ZH8g$rur(iN)Tc^L9=8q@MCIF3TKImWeq4 literal 0 HcmV?d00001 diff --git a/scripts_analysis/jalc_languages_metadata/test/conta_test.py b/scripts_analysis/jalc_languages_metadata/test/conta_test.py new file mode 100644 index 0000000..7cb0299 --- /dev/null +++ b/scripts_analysis/jalc_languages_metadata/test/conta_test.py @@ -0,0 +1,207 @@ +import unittest +from oc_ds_converter.conta import CountMetadataLang +import csv + +# data are not real, they are created for testing purposes +source_dict_publisher_citing = [{"status":"OK","apiType":"doi","apiVersion":"1.0.0","message":{"total":1,"rows":1,"totalPages":1,"page":1},"data":{"siteId":"SI/JST.JSTAGE","content_type":"JA","doi":"10.57383/brontesocietyjapan.5.4_69","url":"https://doi.org/10.57383/brontesocietyjapan.5.4_69","ra":"JaLC","prefix":"10.57383","site_name":"J-STAGE","publisher_list":[{"publisher_name":"The Brontë Society of Japan","lang":"en"},{"publisher_name":"日本ブロンテ協会","lang":"ja"}],"title_list":[{"lang":"ja","title":"アン・ブロンテのスカーバラ体験ー","subtitle":"ロビンソン家令嬢と梨園の御曹司の駆け落ち結婚"},{"lang":"en","title":"Anne Brontë's Scarborough:","subtitle":"Cultural Activities in the Age of the Railway, and Lydia Robinson's Elopement with the Scion of the Famous Theatrical Family"}],"creator_list":[{"sequence":"1","type":"person","names":[{"lang":"ja","first_name":"大田 美和"},{"lang":"en","first_name":"Miwa OTA"}]}],"publication_date":{"publication_year":"2012","publication_month":"12","publication_day":"01"},"relation_list":[{"content":"https://www.jstage.jst.go.jp/article/brontesocietyjapan/5/4/5_69/_pdf","type":"URL","relation":"fullTextPdf"}],"content_language":"ja","updated_date":"2022-08-29","article_type":"pub","journal_id_list":[{"journal_id":"0913-8617","type":"ISSN","issn_type":"print"},{"journal_id":"2758-2264","type":"ISSN","issn_type":"online"},{"journal_id":"brontesocietyjapan","type":"JID"}],"journal_title_name_list":[{"journal_title_name":"ブロンテ・スタディーズ","type":"full","lang":"ja"},{"journal_title_name":"Brontë Studies","type":"full","lang":"en"}],"journal_classification":"01","recorded_year":"2000","volume":"5","issue":"4","first_page":"69","last_page":"84","date":"2022-08-30"}}, + {"status":"OK","apiType":"doi","apiVersion":"1.0.0","message":{"total":1,"rows":1,"totalPages":1,"page":1},"data":{"siteId":"SI/JST.JSTAGE","content_type":"JA","doi":"10.1241/johokanri.30.676","url":"https://doi.org/10.1241/johokanri.30.676","ra":"JaLC","prefix":"10.1241","site_name":"J-STAGE","publisher_list":[{"publisher_name":"Japan Science and Technology Agency","lang":"en"}],"title_list":[{"lang":"ja","title":"編集後記"}],"publication_date":{"publication_year":"1987"},"relation_list":[{"content":"https://www.jstage.jst.go.jp/article/johokanri/30/7/30_7_676/_pdf","type":"URL","relation":"fullTextPdf"}],"content_language":"ja","updated_date":"2014-12-09","article_type":"pub","journal_id_list":[{"journal_id":"0021-7298","type":"ISSN","issn_type":"print"},{"journal_id":"1347-1597","type":"ISSN","issn_type":"online"},{"journal_id":"johokanri","type":"JID"}],"journal_title_name_list":[{"journal_title_name":"Journal of Information Processing and Management","type":"full","lang":"en"},{"journal_title_name":"Joho Kanri /J Inf Proc 
Manage","type":"abbreviation","lang":"en"},{"journal_title_name":"j inf proc manage","type":"abbreviation","lang":"en"},{"journal_title_name":"joho kanri","type":"abbreviation","lang":"en"},{"journal_title_name":"情報管理","type":"full","lang":"ja"},{"journal_title_name":"情報管理","type":"abbreviation","lang":"ja"}],"journal_classification":"01","journal_txt_lang":"ja","recorded_year":"1958-2017","volume":"30","issue":"7","first_page":"676","last_page":"676","date":"2012-03-23"}}, + {"status":"OK","apiType":"doi","apiVersion":"1.0.0","message":{"total":1,"rows":1,"totalPages":1,"page":1},"data":{"siteId":"SI/NIJL","content_type":"RD","doi":"10.20730/100121418","url":"https://doi.org/10.20730/100121418","ra":"JaLC","prefix":"10.20730","site_name":"大学共同利用機関法人 人間文化研究機構 国文学研究資料館 ","publisher_list":[{"publisher_name":"[出版者なし]","lang":"ja"}],"title_list":[{"lang":"ja","title":"散木奇歌集"}],"creator_list":[{"sequence":"1","names":[{"lang":"ja","first_name":"源/俊頼"}]}],"publication_date":{"publication_year":"0000"},"updated_date":"2022-05-06"}}, + {"status":"OK","apiType":"doi","apiVersion":"1.0.0","message":{"total":1,"rows":1,"totalPages":1,"page":1},"data":{"siteId":"SI/NDL.NDL.Search","content_type":"JA","doi":"10.11501/3226398","url":"https://doi.org/10.11501/3226398","ra":"JaLC","prefix":"10.11501","site_name":"国立国会図書館/National Diet Library","publisher_list":[{"publisher_name":"日本測量協会","location":"JPN"}],"title_list":[{"title":"測量 = The journal of survey : 地理空間情報の科学と技術 7(7)"}],"creator_list":[{"sequence":"1","names":[{"first_name":"日本測量協会"}]}],"publication_date":{"publication_year":"1957","publication_month":"07"},"edition":{"format":"image/jp2"},"relation_list":[{"content":"http://iss.ndl.go.jp/books/R100000002-I000000013974-00","type":"URL","relation":"source"},{"content":"https://dl.ndl.go.jp/info:ndljp/pid/3381154","type":"URL","relation":"isPartOf"}],"alternate_identifier_list":[{"alternate_identifier":"000000013974","type":"NDL"},{"alternate_identifier":"oai:iss.ndl.go.jp:R100000039-I001202164-00","type":"OAIPMH"}],"content_language":"ja","updated_date":"2022-08-11","article_type":"pub","journal_id_list":[{"journal_id":"0285-7790","type":"ISSN","issn_type":"print"}],"first_page":"-","date":"2022-08-11"}}] +source_dict_publisher_cited = [{"status":"OK","apiType":"doi","apiVersion":"1.0.0","message":{"total":1,"rows":1,"totalPages":1,"page":1},"data":{"siteId":"SI/JST.JSTAGE","content_type":"JA","doi":"10.11248/jsta.52.1","url":"https://doi.org/10.11248/jsta.52.1","ra":"JaLC","prefix":"10.11248","site_name":"J-STAGE","publisher_list":[{"publisher_name":"Japanese Society for Tropical Agriculture","lang":"en"},{"publisher_name":"日本熱帯農業学会","lang":"ja"}],"title_list":[{"lang":"en","title":"Salt Tolerance of Suaeda japonica M. 
and Use in Salt- Affected Lands"}],"creator_list":[{"sequence":"1","type":"person","names":[{"lang":"en","last_name":"SHIMIZU","first_name":"Katsuyoshi"}],"affiliation_list":[{"affiliation_name":"Graduate School of Life and Environmental Sciences, University of Tsukuba","sequence":"1","lang":"en"}]},{"sequence":"2","type":"person","names":[{"lang":"en","last_name":"CAO","first_name":"Weidon"}],"affiliation_list":[{"affiliation_name":"Chinese Academy of Agricultural Sciences","sequence":"2","lang":"en"}]},{"sequence":"3","type":"person","names":[{"lang":"en","last_name":"ABIKO","first_name":"Naoyuki"}],"affiliation_list":[{"affiliation_name":"College of Agrobiological Resources, University of Tsukuba","sequence":"3","lang":"en"}]},{"sequence":"4","type":"person","names":[{"lang":"en","last_name":"ISHIKAWA","first_name":"Naoto"}],"affiliation_list":[{"affiliation_name":"Graduate School of Life and Environmental Sciences, University of Tsukuba","sequence":"1","lang":"en"}]},{"sequence":"5","type":"person","names":[{"lang":"en","last_name":"HAMAMURA","first_name":"Kunio"}],"affiliation_list":[{"affiliation_name":"Arid Land Research Center, Tottori University","sequence":"4","lang":"en"}]}],"publication_date":{"publication_year":"2008"},"relation_list":[{"content":"https://www.jstage.jst.go.jp/article/jsta/52/1/52_1_1/_pdf","type":"URL","relation":"fullTextPdf"}],"content_language":"en","updated_date":"2017-03-23","article_type":"pub","journal_id_list":[{"journal_id":"1882-8450","type":"ISSN","issn_type":"print"},{"journal_id":"1882-8469","type":"ISSN","issn_type":"online"},{"journal_id":"jsta","type":"JID"}],"journal_title_name_list":[{"journal_title_name":"Tropical Agriculture and Development","type":"full","lang":"en"},{"journal_title_name":"Trop. Agr. Develop.","type":"abbreviation","lang":"en"},{"journal_title_name":"熱帯農業","type":"before","lang":"ja"},{"journal_title_name":"Japanese Journal of Tropical Agriculture","type":"before","lang":"en"}],"journal_classification":"01","journal_txt_lang":"en","recorded_year":"2008-2014","volume":"52","issue":"1","first_page":"1","last_page":"6","date":"2009-03-03","keyword_list":[{"keyword":"crude protein","sequence":"1","lang":"en"},{"keyword":"desalinization","sequence":"2","lang":"en"},{"keyword":"foragecrop","sequence":"3","lang":"en"},{"keyword":"Salicornia","sequence":"4","lang":"en"}],"citation_list":[{"sequence":"1","publisher_list": [{"publisher_name":"Japan Science and Technology Agency","lang":"en"}],"original_text":"Jinno, N. 2000. Enseishokubutsu Shichimennsou. In Ariakekai no ikimonotachi (M. Sato eds.) Kaiyusha (Tokyo). 50-69."},{"sequence":"2","original_text":"National Agricultural Research Organization 2001 Standard Tables of Feed Composition in Japan. Japan Livestock Industry Association. pp245."},{"sequence":"3","publisher_list":[{"publisher_name":"[出版者なし]","lang":"ja"}],"original_text":"Shimizu, K. 1996. Agricultural and forestry science: 8. Studies on the salinity tolerance of Salicornia herbacea L. The Institute of Agriculture and Forestry, University of Tsukuba. pp.65."},{"sequence":"4","original_text":"Shimizu, K. 2000. Effects of Salt Treatments on the Production and Chemical Composition of Saltwort (Salicornia herbacea L.), Rodesgrass and Alfalfa. Jpn J. Trop. Agr. 44: 61-67."},{"sequence":"5","original_text":"Shimizu, K., N. Ishikawa and S. Muranaka 2001. Digestion Trial of the Mixed Diet with Salt Wort (Salicornia herbacea L.) in Goats. Jpn. J. Trop. Agr. 45: 45-48."},{"sequence":"6","original_text":"Shimose, N., F. 
Takenaka and O. Kimura 1987. Salt tolerance of grasswort Rush and Goldenrod. Jpn. J. Trop. Agr. 31:179-183."},{"sequence":"7","original_text":"Squires, V. R. and A. T. Ayoub 1994. Halophytes as a resource for livestock and for rehabilitation of degraded lands. Task for vegetation science 32. Kluwer Academic Publishers (London). p315."},{"sequence":"8","original_text":"UNEP 1991. Sands of change: Why land becomes desert and what can we do about it, UNEP Environment Brief No.2."},{"sequence":"9","original_text":"Wada, N., J. Zhang, N. Jinno, A. Okubo and S. Yamazaki 2003. Component analysis of a halophyte, Suaeda japonica, grown on the shore of Ariake sea. Bunseki Kagaku 52: 843-846."}]}}] +source_dict_journal_title_cited = [{'status': 'OK', 'apiType': 'doi', 'apiVersion': '1.0.0', 'message': {'total': 1, 'rows': 1, 'totalPages': 1, 'page': 1}, 'data': {'siteId': 'SI/JST.JSTAGE', 'content_type': 'JA', 'doi': '10.11162/daikankyo.3.1_13', 'url': 'https://doi.org/10.11162/daikankyo.3.1_13', 'ra': 'JaLC', 'prefix': '10.11162', 'site_name': 'J-STAGE', 'publisher_list': [{'publisher_name': 'Academic Consociation of Environmental Safety and Waste Management, Japan', 'lang': 'en'}, {'publisher_name': '大学等環境安全協議会', 'lang': 'ja'}], 'title_list': [{'lang': 'en', 'title': 'Efficient Methane Fermentation System with Hydrolysis Technology Using High Temperature Aerobic Hydrolysis Bacteria'}, {'lang': 'ja', 'title': '高温可溶化技術を利用したメタン発酵システムの開発'}], 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Nakamichi', 'first_name': 'Takahiro'}, {'lang': 'ja', 'last_name': '中道', 'first_name': '隆広'}], 'affiliation_list': [{'affiliation_name': 'Graduate School of Engineering, Nagasaki Institute of Applied Science', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '長崎総合科学大学大学院工学研究科', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Nakahsima', 'first_name': 'Takuji'}, {'lang': 'ja', 'last_name': '中島', 'first_name': '琢自'}], 'affiliation_list': [{'affiliation_name': 'Kitasato University Research Organization for Infection Control Science', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '北里大学感染制御研究機構', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Kai', 'first_name': 'Hotaka'}, {'lang': 'ja', 'last_name': '甲斐', 'first_name': '穂高'}], 'affiliation_list': [{'affiliation_name': 'Department of Chemistry and Biochemistry, Suzuka National College of Technology', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '鈴鹿工業高等専門学校生物応用化学科', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '4', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Takemoto', 'first_name': 'Naomichi'}, {'lang': 'ja', 'last_name': '竹本', 'first_name': '直道'}], 'affiliation_list': [{'affiliation_name': 'Graduate School of Engineering, Nagasaki Institute of Applied Science', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '長崎総合科学大学大学院工学研究科', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '5', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Matsuo', 'first_name': 'Hideki'}, {'lang': 'ja', 'last_name': '松尾', 'first_name': '英樹'}], 'affiliation_list': [{'affiliation_name': 'Study of Environmental Resources, Faculty of Environmental and Symbiotic Science, Prefectural University of Kumamoto', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '熊本県立大学環境共生学部', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '6', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Kobayashi', 
'first_name': 'Jyun'}, {'lang': 'ja', 'last_name': '小林', 'first_name': '淳'}], 'affiliation_list': [{'affiliation_name': 'Study of Environmental Resources, Faculty of Environmental and Symbiotic Science, Prefectural University of Kumamoto', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '熊本県立大学環境共生学部', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '7', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Takahasi', 'first_name': 'Yoko'}, {'lang': 'ja', 'last_name': '高橋', 'first_name': '洋子'}], 'affiliation_list': [{'affiliation_name': 'Kitasato Institute for Life Science, Kitasato University', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '北里大学北里生命科学研究所', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '8', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Ohba', 'first_name': 'Kazuhiro'}, {'lang': 'ja', 'last_name': '大場', 'first_name': '和彦'}], 'affiliation_list': [{'affiliation_name': 'Graduate School of Engineering, Nagasaki Institute of Applied Science', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '長崎総合科学大学大学院工学研究科', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '9', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Ishibashi', 'first_name': 'Yasuhiro'}, {'lang': 'ja', 'last_name': '石橋', 'first_name': '康弘'}], 'affiliation_list': [{'affiliation_name': 'Study of Environmental Resources, Faculty of Environmental and Symbiotic Science, Prefectural University of Kumamoto', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '熊本県立大学環境共生学部', 'sequence': '1', 'lang': 'ja'}]}], 'publication_date': {'publication_year': '2012'}, 'relation_list': [{'content': 'https://www.jstage.jst.go.jp/article/daikankyo/3/1/3_1_13/_pdf', 'type': 'URL', 'relation': 'fullTextPdf'}], 'content_language': 'ja', 'updated_date': '2014-12-09', 'article_type': 'pub', 'journal_id_list': [{'journal_id': '1884-4375', 'type': 'ISSN', 'issn_type': 'print'}, {'journal_id': '2186-3725', 'type': 'ISSN', 'issn_type': 'online'}, {'journal_id': 'daikankyo', 'type': 'JID'}], 'journal_title_name_list': [{'journal_title_name': 'Journal of Environment and Safety', 'type': 'full', 'lang': 'en'}, {'journal_title_name': 'Journal of Environment and Safety', 'type': 'abbreviation', 'lang': 'en'}, {'journal_title_name': '環境と安全', 'type': 'full', 'lang': 'ja'}, {'journal_title_name': '環境と安全', 'type': 'abbreviation', 'lang': 'ja'}], 'journal_classification': '01', 'recorded_year': '2010-2014', 'volume': '3', 'issue': '1', 'first_page': '1_13', 'last_page': '1_20', 'date': '2012-08-04', 'keyword_list': [{'keyword': '2槽式メタン発酵', 'sequence': '1', 'lang': 'ja'}, {'keyword': 'Two phase anaerobic digestion system', 'sequence': '1', 'lang': 'en'}, {'keyword': '高温可溶化', 'sequence': '2', 'lang': 'ja'}, {'keyword': 'Hydrothermal hydrolysis', 'sequence': '2', 'lang': 'en'}, {'keyword': '下水汚泥', 'sequence': '3', 'lang': 'ja'}, {'keyword': 'Sewage sludge', 'sequence': '3', 'lang': 'en'}], 'citation_list': [{'sequence': '1', 'doi': '10.3775/jie.90.247', 'journal_title_name_list': [{'journal_title_name': '日本エネルギー学会誌', 'lang': 'ja'}], 'title_list': [{'lang': 'ja', 'title': '下水汚泥エネルギー利用システムのインベントリ分析による性能評価'}], 'volume': '90', 'issue': '3', 'first_page': '247', 'last_page': '257', 'publication_date': {'publication_year': '2011'}, 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'ja', 'first_name': '木室洋介'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'ja', 'first_name': '古林敬顕'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'ja', 'first_name': '中田俊彦'}]}], 'content_language': 'ja', 
'original_text': '1) 木室洋介,古林敬顕,中田俊彦, 下水汚泥エネルギー利用システムのインベントリ分析による性能評価, 日本エネルギー学会誌, 90(3), 247-257, 2011.'}, {'sequence': '2', 'original_text': '2) 竹村昌太,坂本昭一,崔玉順,遠田幸生,杉山重彰,杉本文男,今井忠男,佐藤勇,佐藤博, 下水汚泥ケーキの環境リスク低減型炭化処理バイオマスに関する基礎的研究, 資源と素材, 119(2), 66-70, 2003.'}, {'sequence': '3', 'doi': '10.1263/jbb.105.48', 'title_list': [{'lang': 'en', 'title': 'Comparison of Thermophilic Anaerobic Digestion Characteristics between Single-Phase and Two-Phase Systems for Kitchen Garbage Treatment'}], 'volume': '105', 'issue': '1', 'first_page': '48', 'last_page': '54', 'publication_date': {'publication_year': '2008'}, 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Park', 'first_name': 'YongJin'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Hong', 'first_name': 'Feng'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Cheon', 'first_name': 'JiHoon'}]}, {'sequence': '4', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Hidaka', 'first_name': 'Taira'}]}, {'sequence': '5', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Tsuno', 'first_name': 'Hiroshi'}]}], 'content_language': 'en', 'original_text': '3) Park Y.,Hong F.,Cheon J.,Hidaka T.,Tsuno H., Comparison of thermophilic anaerobic digestion characteristics between single-phase and two-phase systems for kitchen garbage treatment, Journal of Bioscience and Bioengineering, 105(1), 48-54, 2008.'}, {'sequence': '4', 'original_text': '4) Pavan P, Battistoni P, Cecchi F, Mata-Alvarez J., Twophase anaerobic digestion of source sorted OFMSW (organic fraction of municipal solid waste), Water Sci Technol, 41(3), 111-118, 2000.'}, {'sequence': '5', 'original_text': '5) 吉田隆,バイオマスからの気体燃料製造とそのエネルギー利用, 中島田豊,西尾尚道編, メタン発酵の前処理方法,第1版, 178-187, 東京, エヌ・ティー・エス出版社, 2007.'}, {'sequence': '6', 'original_text': '6) 今井剛,荒金光弘,樋口隆哉,関根雅彦,村上定瞭,竹内正美, 可溶化技術を用いた汚泥処理に関する研究展望, 廃棄物学会論文誌, 19(1), 1-8, 2008.'}, {'sequence': '7', 'original_text': '7) 今井剛,荒金光弘,関根雅彦,樋口隆哉,浮田正夫, 高速回転ディスク法による余剰汚泥の減量化とその生分解特性に関する研究, 土木学会論文集G, 63(4), 351-359, 2007.'}, {'sequence': '8', 'original_text': '8) 荒金光弘,今井剛,村上定瞭,竹内正美,浮田正夫,関根雅彦,樋口隆哉, アルカリを添加した亜臨界水処理における余剰汚泥の可溶化に関する研究, 土木学会論文集G, 62(4), 427-434, 2006.'}, {'sequence': '9', 'doi': '10.3775/jie.88.147', 'journal_title_name_list': [{'journal_title_name': '日本エネルギー学会誌', 'lang': 'ja'}], 'title_list': [{'lang': 'ja', 'title': '超臨界水を利用した食品廃棄物のガス化の基礎的検討'}], 'volume': '88', 'issue': '2', 'first_page': '147', 'last_page': '154', 'publication_date': {'publication_year': '2009'}, 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'ja', 'last_name': '宗綱', 'first_name': '洋人'}]}], 'content_language': 'ja', 'original_text': '9) 宗綱洋人,今村邦彦,玉井正弘,樋口浩一,橋本寿之,野口賢二郎,松村幸彦, 超臨界水を利用した食品廃棄物のガス化の基礎的検討, 日本エネルギー学会誌, 88(2), 147-154, 2009.'}, {'sequence': '10', 'doi': '10.1089/ees.2010.0077', 'original_text': '10) Nakamichi T., Nakashima T., Fujisaki H., Takamatu N., Matumoto T., Takahashi Y., and Isshibashi Y., Characteristics of Anoxybacillus sp. 
MU3 isolated from a hot spring and its, Environmental Engineering Science, 27(12), 993-999, 2010.'}, {'sequence': '11', 'original_text': '11) 社団法人日本下水道協会, 下水道施設計画・設計指針と解説\u3000後編, 345, 2009.'}, {'sequence': '12', 'original_text': '12) 酒沢千嘉弘,市川邦介,福井三郎, メタン醗酵に関する研究(第1報), 酸化還元電位について, 醗酵工學雑誌, 41(5), 245-249,1968.'}, {'sequence': '13', 'original_text': '13) 国土交通省都市・地域整備局下水道部編, バイオソリッド利活用基本計画下水汚泥処理総合計画), 策定マニュアル(案), 49, 2003.'}, {'sequence': '14', 'original_text': '14) 野池達也, メタン発酵, 20-21, 135-136, 東京, 技報堂出版社, 2009.'}, {'sequence': '15', 'original_text': '15) 社団法人日本有機化学会編, バイオガスシステムの現状と課題, 15, 2006.'}, {'sequence': '16', 'original_text': '16) 社団法人日本有機化学会編, バイオガス化マニュアル, 47, 2006.'}]}}, + {'status': 'OK', 'apiType': 'doi', 'apiVersion': '1.0.0', 'message': {'total': 1, 'rows': 1, 'totalPages': 1, 'page': 1}, 'data': {'siteId': 'SI/JST.JSTAGE', 'content_type': 'JA', 'doi': '10.11162/daikankyo.3.2_87', 'url': 'https://doi.org/10.11162/daikankyo.3.2_87', 'ra': 'JaLC', 'prefix': '10.11162', 'site_name': 'J-STAGE', 'publisher_list': [{'publisher_name': 'Academic Consociation of Environmental Safety and Waste Management, Japan', 'lang': 'en'}, {'publisher_name': '大学等環境安全協議会', 'lang': 'ja'}], 'title_list': [{'lang': 'ja', 'title': '大学における建物単位防災訓練の有用性評価'}, {'lang': 'en', 'title': 'Usefulness of fire drills in each university building'}], 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Koshiba', 'first_name': 'Yusuke'}, {'lang': 'ja', 'last_name': '小柴', 'first_name': '佑介'}], 'affiliation_list': [{'affiliation_name': 'Department of Materials Science and Chemical Engineering, Faculty of Engineering, Yokohama National University', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '横浜国立大学大学院工学研究院', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Hayashibara', 'first_name': 'Nobuhiro'}, {'lang': 'ja', 'last_name': '林原', 'first_name': '伸大'}], 'affiliation_list': [{'affiliation_name': 'College of Engineering, Yokohama National University', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '横浜国立大学工学部物質工学科', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Suzuki', 'first_name': 'Yuji'}, {'lang': 'ja', 'last_name': '鈴木', 'first_name': '雄二'}], 'affiliation_list': [{'affiliation_name': 'Center for Risk Management and Safety Sciences, Yokohama National University', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '横浜国立大学安心・安全の科学研究教育センター', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '4', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Ohtani', 'first_name': 'Hideo'}, {'lang': 'ja', 'last_name': '大谷', 'first_name': '英雄'}], 'affiliation_list': [{'affiliation_name': ' Department of Safety Management, Faculty of Environment and Information Sciences, Yokohama National University', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '横浜国立大学大学院環境情報研究院', 'sequence': '1', 'lang': 'ja'}]}], 'publication_date': {'publication_year': '2012'}, 'relation_list': [{'content': 'https://www.jstage.jst.go.jp/article/daikankyo/3/2/3_2_87/_pdf', 'type': 'URL', 'relation': 'fullTextPdf'}], 'content_language': 'ja', 'updated_date': '2014-12-09', 'article_type': 'pub', 'journal_id_list': [{'journal_id': '1884-4375', 'type': 'ISSN', 'issn_type': 'print'}, {'journal_id': '2186-3725', 'type': 'ISSN', 'issn_type': 'online'}, {'journal_id': 'daikankyo', 'type': 'JID'}], 'journal_title_name_list': [{'journal_title_name': 'Journal of Environment and Safety', 'type': 
'full', 'lang': 'en'}, {'journal_title_name': 'Journal of Environment and Safety', 'type': 'abbreviation', 'lang': 'en'}, {'journal_title_name': '環境と安全', 'type': 'full', 'lang': 'ja'}, {'journal_title_name': '環境と安全', 'type': 'abbreviation', 'lang': 'ja'}], 'journal_classification': '01', 'recorded_year': '2010-2014', 'volume': '3', 'issue': '2', 'first_page': '2_87', 'last_page': '2_95', 'date': '2012-10-26', 'keyword_list': [{'keyword': '建物単位防災訓練', 'sequence': '1', 'lang': 'ja'}, {'keyword': 'Fire drill in each university building', 'sequence': '1', 'lang': 'en'}, {'keyword': 'アンケート', 'sequence': '2', 'lang': 'ja'}, {'keyword': 'Questionnaire survey', 'sequence': '2', 'lang': 'en'}, {'keyword': '数量化Ⅱ類', 'sequence': '3', 'lang': 'ja'}, {'keyword': 'Quantification method of the second type', 'sequence': '3', 'lang': 'en'}], 'citation_list': [{'sequence': '1', 'original_text': '1) Japan Meteorological Agency, The 2011 off the pacific coast of Tohoku earthquake ∼first report∼, http://www.jma.go.jp/jma/en/News/2011_Earthquake_01.html (Accessed on April 30, 2012).'}, {'sequence': '2', 'original_text': '2) 河北新聞, April 16, 2011.'}, {'sequence': '3', 'original_text': '3) 読売新聞, May 22, 2011.'}, {'sequence': '4', 'original_text': '4) 現代化学編集グループ, 研究室から地震を守るには―東日本大震災の教訓―, 現代化学, 486(9), 30-41, 2011.'}, {'sequence': '5', 'original_text': '5) 飯田光明, 産総研研究施設の地震被害状況, 安全工学, 50(6), 440-445, 2011.'}, {'sequence': '6', 'original_text': '6) 上月康則,井若和久,田邊晋他, わが国における大学防災の現状に関する調査研究, 安全問題研究論文集, 3(30), 185-190, 2008.'}, {'sequence': '7', 'doi': '10.11188/seisankenkyu.61.713', 'title_list': [{'lang': 'en', 'title': 'A basic study on development of an education curriculum for disaster reduction in a compulsory education course'}, {'lang': 'ja', 'title': '義務教育課程における防災教育カリキュラムの開発に向けた基礎的研究'}], 'volume': '61', 'issue': '4', 'first_page': '713', 'last_page': '716', 'publication_date': {'publication_year': '2009'}, 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'KISHIDA', 'first_name': 'Sachiko'}, {'lang': 'ja', 'last_name': '岸田', 'first_name': '幸子'}], 'affiliation_list': [{'affiliation_name': '中央大学大学院 理工学研究科', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'OHARA', 'first_name': 'Miho'}, {'lang': 'ja', 'last_name': '大原', 'first_name': '美保'}], 'affiliation_list': [{'affiliation_name': '東京大学大学院情報学環\u3000総合防災情報研究センター', 'sequence': '1', 'lang': 'ja'}, {'affiliation_name': '東京大学生産技術研究所\u3000都市基盤安全工学国際研究センター', 'sequence': '2', 'lang': 'ja'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'MEGURO', 'first_name': 'Kimiro'}, {'lang': 'ja', 'last_name': '目黒', 'first_name': '公郎'}], 'affiliation_list': [{'affiliation_name': '東京大学生産技術研究所\u3000都市基盤安全工学国際研究センター', 'sequence': '1', 'lang': 'ja'}]}], 'content_language': 'ja', 'original_text': '7) 岸田幸子,大原美保,目黒公郎, 義務教育課程における防災教育カリキュラムの開発に向けた基礎的研究, 生産研究, 61(4), 713-716, 2009.'}, {'sequence': '8', 'doi': '10.3130/aija.73.2599', 'title_list': [{'lang': 'en', 'title': "RESEARCH ON TEACHERS' IDEA OF FIRE PREVENTION EDUCATION AND TEACHERS' ABILITY TO PREVENT FIRE"}, {'title': '教師の防火教育に対する意識と防火対応能力'}], 'volume': '73', 'issue': '634', 'first_page': '2599', 'last_page': '2604', 'publication_date': {'publication_year': '2008'}, 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'TATEBE', 'first_name': 'Kenji'}, {'lang': 'ja', 'last_name': '建部', 'first_name': '謙治'}], 'affiliation_list': [{'affiliation_name': 'Dept. 
of Urban Environment, Faculty of Engineering, Aichi Institute of Technology', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '愛知工業大学工学部都市環境学科', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'YOSHIOKA', 'first_name': 'Tatsumi'}, {'lang': 'ja', 'last_name': '吉岡', 'first_name': '竜巳'}], 'affiliation_list': [{'affiliation_name': 'Faculty of Engineering, Aichi Institute of Technology', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '愛知工業大学工学部', 'sequence': '1', 'lang': 'ja'}]}], 'original_text': '8) 建部謙治,吉岡竜巳, 教師の防火教育に対する意識と防火対応能力, 日本建築学会計画系論文集, 73(634) 2599-2604, 2008.'}, {'sequence': '9', 'original_text': '9) 木村玲欧,林能成,鈴木康弘他, 名古屋大学における防災訓練の実施と継続的な防災教育の試み, 安全問題研究論文集, 1, 49-54, 2006.'}, {'sequence': '10', 'original_text': '10) Florida Atlantic University, Fire safety manual, http://www.fau.edu/facilities/ehs/info/Fire-Safety-Manual.pdf (Accessed on April 30, 2012).'}, {'sequence': '11', 'original_text': '11) Multimedia University, Fire drill & emergency evacuation procedure, http://www.mmu.edu.my/upfiles/dm364_FDEEProcedure.pdf (Accessed on April 30, 2012).'}, {'sequence': '12', 'doi': '10.11181/hpi.48.184', 'title_list': [{'lang': 'en', 'title': 'Analysis of a Fire Accident Caused by the Leakage of a Large Amount of Oxygen Gas in a University'}, {'lang': 'ja', 'title': '大学における酸素ガス漏洩による火災事故の原因分析'}], 'volume': '48', 'issue': '4', 'first_page': '184', 'last_page': '191', 'publication_date': {'publication_year': '2010'}, 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'SUZUKI', 'first_name': 'Yuji'}, {'lang': 'ja', 'last_name': '鈴木', 'first_name': '雄二'}], 'affiliation_list': [{'affiliation_name': 'Yokohama National University', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '横浜国立大学', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'KOBAYASHI', 'first_name': 'Hideo'}, {'lang': 'ja', 'last_name': '小林', 'first_name': '英男'}], 'affiliation_list': [{'affiliation_name': 'Yokohama National University', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '横浜国立大学', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'KOSHIBA', 'first_name': 'Yusuke'}, {'lang': 'ja', 'last_name': '小柴', 'first_name': '佑介'}], 'affiliation_list': [{'affiliation_name': 'Yokohama National University', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '横浜国立大学', 'sequence': '1', 'lang': 'ja'}]}], 'content_language': 'ja', 'original_text': '12) 鈴木雄二,小林英男,小柴佑介, 大学における酸素ガス漏洩による火災事故の原因分析, 圧力技術, 48(4) 184-191, 2010.'}, {'sequence': '13', 'doi': '10.1007/BF02949778', 'journal_title_name_list': [{'journal_title_name': 'Annals of the Institute of Statistical Mathematics', 'lang': 'en'}], 'title_list': [{'lang': 'en', 'title': 'On the prediction of phenomena from qualitative data and the quantification of qualitative data from the mathematical statistical point of view'}], 'volume': '3', 'first_page': '69', 'last_page': '98', 'publication_date': {'publication_year': '1952'}, 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'HAYASHI', 'first_name': 'C.'}]}], 'content_language': 'en', 'original_text': '13) Hayashi, C., On the prediction of phenomena from qualitative data and the quantification of qualitative data from the mathematico-statistical point of view, Annals of the institute of statistical mathematics, 3, 69-98, 1952.'}, {'sequence': '14', 'doi': 
'10.1007/BF02919500', 'journal_title_name_list': [{'journal_title_name': 'Annals of the Institute of Statistical Mathematics', 'lang': 'en'}], 'title_list': [{'lang': 'en', 'title': 'On the quantification of qualitative data from the mathematico-statistical point of viuew-an approach for apolying this method to the parole prediction-'}], 'volume': '2', 'issue': '1', 'first_page': '35', 'last_page': '47', 'publication_date': {'publication_year': '1950'}, 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'HAYASHI', 'first_name': 'C.'}]}], 'content_language': 'en', 'original_text': '14) Hayashi, C., On the quantification of qualitative data from the mathematico-statistical point of view, Annals of the institute of statistical mathematics, 2, 35-47, 1950.'}, {'sequence': '15', 'original_text': '15) Slovic, P., The perception of risk, 137-153; London, Earthscan Publications, 2000.'}, {'sequence': '16', 'original_text': '16) Slovic, P., Perception of risk, Science, 236(4799), 280-285, 1987.'}, {'sequence': '17', 'original_text': '17) Boholm, A., Comparative studies of risk perception, a review of twenty years of research, Journal of Risk Research, 1(2), 135-163, 1998.'}, {'sequence': '18', 'doi': '10.1037//0022-3514.39.5.806', 'content_language': 'ja', 'original_text': '18) Weinstein, N.D., Unrealistic optimism about future life events, Journal of Personality and Social Psychology, 39(5), 806-820, 1980.'}, {'sequence': '19', 'original_text': '19) Ju, Y.H., Sohn, S.Y., Quantification method analysis of the relationship between occupant injury and environmental factors in traffic accidents, Accident Analysis and Prevention, 43(1), 342-351, 2011.'}, {'sequence': '20', 'original_text': "20) Nandedkar A., Midha V., It won't happen to me, An assessment of optimism bias in music piracy, Computers in Human Behavior, 28(1), 41-48, 2012."}, {'sequence': '21', 'original_text': '21) Caponecchia, C., Relative risk perception for terrorism, implications for preparedness and risk communication, Risk Analysis, 32(9) 1524-1534, 2012.'}, {'sequence': '22', 'original_text': '22) Radcliffe, N.M., Klein, W.M.P., Dispositional, unrealistic, and comparative optimism, differential relations with the knowledge and processing of risk information and beliefs about personal risk, Personality and Social Psychology Bulletin, 28(6), 836-846, 2012.'}, {'sequence': '23', 'original_text': '23) Kung, Y.W., Chen, S.H., Perception of earthquake risk in Taiwan, effects of gender and past earthquake experience, Risk Analysis, 32(9) 1535-1546, 2012.'}, {'sequence': '24', 'doi': '10.1111/j.1539-6924.1994.tb00082.x', 'volume': '14', 'first_page': '1101', 'publication_date': {'publication_year': '1994'}, 'original_text': '24) Flynn, J., Slovic, P., Mertz, C.K., Gender, race, and perception of environmental health risks, Risk Analysis, 14(6) 1101-1108, 1994.'}]}}] +source_dict_title_cited = [{'status': 'OK', 'apiType': 'doi', 'apiVersion': '1.0.0', 'message': {'total': 1, 'rows': 1, 'totalPages': 1, 'page': 1}, 'data': {'siteId': 'SI/JST.JSTAGE', 'content_type': 'JA', 'doi': '10.11163/akanekai.2.5', 'url': 'https://doi.org/10.11163/akanekai.2.5', 'ra': 'JaLC', 'prefix': '10.11163', 'site_name': 'J-STAGE', 'publisher_list': [{'publisher_name': 'Akane Medical Corporation, Showa Hospital', 'lang': 'en'}, {'publisher_name': '医療法人茜会\u3000昭和病院', 'lang': 'ja'}], 'title_list': [{'lang': 'en', 'title': 'Healing of decubitus required serum albumin in hospitalized patients'}, {'title': 
'昭和病院入院患者の褥瘡とその背景因子の臨床的検討'}], 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'TOYOTA', 'first_name': 'Masahiro'}, {'lang': 'ja', 'last_name': '豊田', 'first_name': '昌弘'}], 'affiliation_list': [{'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '医療法人茜会 昭和病院', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'UNOKI', 'first_name': 'Hideaki'}, {'lang': 'ja', 'last_name': '鵜木', 'first_name': '秀明'}], 'affiliation_list': [{'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '医療法人茜会 昭和病院', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'SHIN-YA', 'first_name': 'Tohoru'}, {'lang': 'ja', 'last_name': '新矢', 'first_name': '徹'}], 'affiliation_list': [{'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '医療法人茜会 昭和病院', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '4', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'OKUZONO', 'first_name': 'Yumiko'}, {'lang': 'ja', 'last_name': '奥園', 'first_name': '裕美子'}], 'affiliation_list': [{'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '医療法人茜会 昭和病院', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '5', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'YOSHIMIZU', 'first_name': 'Takumi'}, {'lang': 'ja', 'last_name': '吉水', 'first_name': '卓見'}], 'affiliation_list': [{'affiliation_name': "Yoshimizu Physician's Office", 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '吉水内科', 'sequence': '1', 'lang': 'ja'}]}], 'publication_date': {'publication_year': '2005'}, 'relation_list': [{'content': 'http://www.jstage.jst.go.jp/article/akanekai/2/1/2_1_5/_pdf', 'type': 'URL', 'relation': 'fullTextPdf'}], 'updated_date': '2014-12-09', 'article_type': 'pub', 'journal_id_list': [{'journal_id': '1880-151X', 'type': 'ISSN', 'issn_type': 'online'}, {'journal_id': '1880-1528', 'type': 'ISSN', 'issn_type': 'print'}, {'journal_id': 'akanekai', 'type': 'JID'}], 'journal_title_name_list': [{'journal_title_name': 'The Journal of Showa Hospital', 'type': 'full', 'lang': 'en'}, {'journal_title_name': 'The Journal of Showa Hospital', 'type': 'abbreviation', 'lang': 'en'}, {'journal_title_name': '昭和病院雑誌', 'type': 'full', 'lang': 'ja'}, {'journal_title_name': '昭和病院雑誌', 'type': 'abbreviation', 'lang': 'ja'}], 'journal_classification': '01', 'recorded_year': '2004-2007', 'volume': '2', 'issue': '1', 'first_page': '5', 'last_page': '8', 'date': '2005-09-13', 'keyword_list': [{'keyword': '褥瘡', 'sequence': '1', 'lang': 'ja'}, {'keyword': 'decubitus', 'sequence': '1', 'lang': 'en'}, {'keyword': 'アルブミン', 'sequence': '2', 'lang': 'ja'}, {'keyword': 'albumin', 'sequence': '2', 'lang': 'en'}], 'citation_list': [{'sequence': '1', 'doi': '10.11163/akanekai.1.12', 'title_list': [{'lang': 'en', 'title': 'Decubitus follow up in Showa Hospital'}], 'volume': '1', 'issue': '1', 'first_page': '12', 'last_page': '16', 'publication_date': {'publication_year': '2004'}, 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Tanaka', 'first_name': 'Shuji'}, {'lang': 'ja', 'last_name': '田中', 'first_name': '修二'}]}], 'original_text': '1. 
田中修二, 昭和病院における褥瘡の経過, 昭和病院雑誌 2004 Sep: 第1巻(1号), 12-16'}, {'sequence': '2', 'volume': '2000', 'issue': '4 Pt 1', 'first_page': '13', 'publication_date': {'publication_year': '2000'}, 'original_text': "2. Guenter P, Malyszek R, Bliss DZ, Steffe T, O'Hara D, LaVan F, Monteiro D. Survey of nutritional status in newly hospitalized patients with stage III or stage IV pressure ulcers. Adv Skin Wound Care. 2000 Jul-Aug; 13(4 Pt 1): 164-8."}, {'sequence': '3', 'volume': '2003', 'issue': '4', 'first_page': '49', 'publication_date': {'publication_year': '2003'}, 'original_text': '3. Reed RL, Hepburn K, Adelson R, Center B, McKnight P. Low serum albumin levels, confusion, and fecal incontinence: are these risk factors for pressure ulcers in mobility-impaired hospitalized adults? Gerontology. 2003 Jul-Aug; 49(4): 255-9.'}, {'sequence': '4', 'volume': '18', 'issue': '3', 'first_page': '327', 'publication_date': {'publication_year': '1990'}, 'original_text': '4. Kaminski MV Jr, Williams SD. Review of the rapid normalization of serum albumin with modified total parenteral nutrition solutions. Crit Care Med. 1990 Mar; 18(3): 327-35.'}, {'sequence': '5', 'doi': '10.3143/geriatrics.34.486', 'title_list': [{'lang': 'en', 'title': 'Nutritional Status of Patients with Decubitus Ulcers, and Changes in the Skin Blood Flow when the Sacral Region was Compressed.'}], 'volume': '34', 'issue': '6', 'first_page': '486', 'last_page': '491', 'publication_date': {'publication_year': '1997'}, 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Miyajima', 'first_name': 'Yoshio'}, {'lang': 'ja', 'last_name': '宮島', 'first_name': '良夫'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Asano', 'first_name': 'Tetsuichi'}, {'lang': 'ja', 'last_name': '浅野', 'first_name': '哲一'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Maehata', 'first_name': 'Yukihiko'}, {'lang': 'ja', 'last_name': '前畑', 'first_name': '幸彦'}]}, {'sequence': '4', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Matsuda', 'first_name': 'Hiroshi'}, {'lang': 'ja', 'last_name': '松田', 'first_name': 'ひろし'}]}, {'sequence': '5', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Fukumoto', 'first_name': 'Ichiro'}, {'lang': 'ja', 'last_name': '福本', 'first_name': '一朗'}]}], 'content_language': 'ja', 'original_text': '5. Miyajima Y, Asano T, Maehata Y, Matsuda H, Fukumoto I. [Nutritional status of patients with decubitus ulcers, and changes in the skin blood flow when the sacral region was compressed] Nippon Ronen Igakkai Zasshi. 1997 Jun; 34(6): 486-91. 
Japanese.'}]}}, + {'status': 'OK', 'apiType': 'doi', 'apiVersion': '1.0.0', 'message': {'total': 1, 'rows': 1, 'totalPages': 1, 'page': 1}, 'data': {'siteId': 'SI/JST.JSTAGE', 'content_type': 'JA', 'doi': '10.11163/akanekai.2.52', 'url': 'https://doi.org/10.11163/akanekai.2.52', 'ra': 'JaLC', 'prefix': '10.11163', 'site_name': 'J-STAGE', 'publisher_list': [{'publisher_name': 'Akane Medical Corporation, Showa Hospital', 'lang': 'en'}, {'publisher_name': '医療法人茜会\u3000昭和病院', 'lang': 'ja'}], 'title_list': [{'lang': 'en', 'title': 'Fractures at Showa Hospital'}, {'title': '昭和病院における骨折の現状'}], 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'AOKI', 'first_name': 'Michiko'}, {'lang': 'ja', 'last_name': '青木', 'first_name': '美智子'}], 'affiliation_list': [{'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '医療法人茜会 昭和病院 放射線課', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'WADA', 'first_name': 'Hiroyuki'}, {'lang': 'ja', 'last_name': '和田', 'first_name': '宏幸'}], 'affiliation_list': [{'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '医療法人茜会 昭和病院 放射線課', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'ARAI', 'first_name': 'Keiko'}, {'lang': 'ja', 'last_name': '新井', 'first_name': '敬子'}], 'affiliation_list': [{'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '医療法人茜会 昭和病院 放射線課', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '4', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'TAMIYA', 'first_name': 'Chiharu'}, {'lang': 'ja', 'last_name': '田宮', 'first_name': '千春'}], 'affiliation_list': [{'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '医療法人茜会 昭和病院 放射線課', 'sequence': '1', 'lang': 'ja'}]}], 'publication_date': {'publication_year': '2005'}, 'relation_list': [{'content': 'http://www.jstage.jst.go.jp/article/akanekai/2/1/2_1_52/_pdf', 'type': 'URL', 'relation': 'fullTextPdf'}], 'updated_date': '2014-12-09', 'article_type': 'pub', 'journal_id_list': [{'journal_id': '1880-151X', 'type': 'ISSN', 'issn_type': 'online'}, {'journal_id': '1880-1528', 'type': 'ISSN', 'issn_type': 'print'}, {'journal_id': 'akanekai', 'type': 'JID'}], 'journal_title_name_list': [{'journal_title_name': 'The Journal of Showa Hospital', 'type': 'full', 'lang': 'en'}, {'journal_title_name': 'The Journal of Showa Hospital', 'type': 'abbreviation', 'lang': 'en'}, {'journal_title_name': '昭和病院雑誌', 'type': 'full', 'lang': 'ja'}, {'journal_title_name': '昭和病院雑誌', 'type': 'abbreviation', 'lang': 'ja'}], 'journal_classification': '01', 'recorded_year': '2004-2007', 'volume': '2', 'issue': '1', 'first_page': '52', 'last_page': '56', 'date': '2005-09-13', 'keyword_list': [{'keyword': '骨折', 'sequence': '1', 'lang': 'ja'}, {'keyword': 'fracture', 'sequence': '1', 'lang': 'en'}], 'citation_list': [{'sequence': '1', 'doi': '10.11163/akanekai.1.1', 'title_list': [{'lang': 'en', 'title': 'Ergot derivative dopamine agonist pergolide and cabergoline induced a man with Parkinsonism pleural effusion'}], 'volume': '1', 'issue': '1', 'first_page': '1', 'last_page': '6', 'publication_date': {'publication_year': '2004'}, 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Muta', 
'first_name': 'Yoshihiro'}, {'lang': 'ja', 'last_name': '牟田', 'first_name': '好博'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Hisazumi', 'first_name': 'Taichi'}, {'lang': 'ja', 'last_name': '久澄', 'first_name': '太一'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Unoki', 'first_name': 'Hideaki'}, {'lang': 'ja', 'last_name': '鵜木', 'first_name': '秀明'}]}, {'sequence': '4', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'Kawai', 'first_name': 'Motoharu'}, {'lang': 'ja', 'last_name': '川井', 'first_name': '元晴'}]}], 'original_text': '1. 長澤俊明、久澄太一、鵜木秀明、吉水卓見、昭和病院入院患者様の転倒骨折とその特殊性、昭和病院雑誌\u3000第1巻1号Sep, 10, 2004'}, {'sequence': '2', 'volume': '130', 'issue': '2', 'first_page': '142', 'publication_date': {'publication_year': '2005'}, 'original_text': '2. Andress HJ, Grubwinkler M, Forkl H, Schinkel C, Lob G. [Change of daily life activity after femoral hip fracture in elderly patients.] Zentralbl Chir. 2005 Apr; 130(2): 142-7. German.'}, {'sequence': '3', 'first_page': '21', 'publication_date': {'publication_year': '2005'}, 'original_text': '3. Fjalestad T, Stromsoe K, Blucher J, Tennoe B. Fractures in the proximal humerus: functional outcome and evaluation of 70 patients treated in hospital. Arch Orthop Trauma Surg. 2005 Apr 21'}, {'sequence': '4', 'volume': '46', 'issue': '2', 'first_page': '102', 'publication_date': {'publication_year': '2001'}, 'original_text': '4. Senghor J, Sy MH, Ndiaye A, Dansokho AV, Seye SI. [Trochanteric fractures in elderly patients: management and prognosis of 68 cases] Dakar Med. 2001; 46(2): 102-4. French.'}]}}, + {'status': 'OK', 'apiType': 'doi', 'apiVersion': '1.0.0', 'message': {'total': 1, 'rows': 1, 'totalPages': 1, 'page': 1}, 'data': {'siteId': 'SI/JST.JSTAGE', 'content_type': 'JA', 'doi': '10.11163/akanekai.2.9', 'url': 'https://doi.org/10.11163/akanekai.2.9', 'ra': 'JaLC', 'prefix': '10.11163', 'site_name': 'J-STAGE', 'publisher_list': [{'publisher_name': 'Akane Medical Corporation, Showa Hospital', 'lang': 'en'}, {'publisher_name': '医療法人茜会\u3000昭和病院', 'lang': 'ja'}], 'title_list': [{'lang': 'en', 'title': 'Diarrhea induced by Escherichia coli O serogroups at Group home'}, {'title': 'グループホームで経験した病原性大腸菌による腸炎について'}], 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'NAGASAWA', 'first_name': 'Toshiaki'}, {'lang': 'ja', 'last_name': '長沢', 'first_name': '俊明'}], 'affiliation_list': [{'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '医療法人茜会 昭和病院', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'HIRANO', 'first_name': 'Hideyasu'}, {'lang': 'ja', 'last_name': '平野', 'first_name': '英保'}], 'affiliation_list': [{'affiliation_name': 'Katsuyama satellite clinic', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '勝山サテライトクリニック', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'FUJIWARA', 'first_name': 'Hirokazu'}, {'lang': 'ja', 'last_name': '藤原', 'first_name': '弘一'}], 'affiliation_list': [{'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '医療法人茜会 昭和病院', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '4', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'YOSHIMIZU', 'first_name': 'Takumi'}, {'lang': 'ja', 'last_name': '吉水', 'first_name': '卓見'}], 'affiliation_list': [{'affiliation_name': "Yoshimizu 
Physician's Office", 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '吉水内科', 'sequence': '1', 'lang': 'ja'}]}], 'publication_date': {'publication_year': '2005'}, 'relation_list': [{'content': 'http://www.jstage.jst.go.jp/article/akanekai/2/1/2_1_9/_pdf', 'type': 'URL', 'relation': 'fullTextPdf'}], 'updated_date': '2014-12-09', 'article_type': 'pub', 'journal_id_list': [{'journal_id': '1880-151X', 'type': 'ISSN', 'issn_type': 'online'}, {'journal_id': '1880-1528', 'type': 'ISSN', 'issn_type': 'print'}, {'journal_id': 'akanekai', 'type': 'JID'}], 'journal_title_name_list': [{'journal_title_name': 'The Journal of Showa Hospital', 'type': 'full', 'lang': 'en'}, {'journal_title_name': 'The Journal of Showa Hospital', 'type': 'abbreviation', 'lang': 'en'}, {'journal_title_name': '昭和病院雑誌', 'type': 'full', 'lang': 'ja'}, {'journal_title_name': '昭和病院雑誌', 'type': 'abbreviation', 'lang': 'ja'}], 'journal_classification': '01', 'recorded_year': '2004-2007', 'volume': '2', 'issue': '1', 'first_page': '9', 'last_page': '13', 'date': '2005-09-13', 'keyword_list': [{'keyword': '急性下痢', 'sequence': '1', 'lang': 'ja'}, {'keyword': 'acute diarrhea', 'sequence': '1', 'lang': 'en'}, {'keyword': '病原性大腸菌', 'sequence': '2', 'lang': 'ja'}, {'keyword': 'pathogenic bacteria', 'sequence': '2', 'lang': 'en'}], 'citation_list': [{'sequence': '1', 'doi': '10.1056/NEJM198303243081203', 'title_list': [{'lang': 'en', 'title': 'Hemorrhagic colitis associated with a rare Escherichia coli serotype'}], 'volume': '308', 'first_page': '681', 'last_page': '685', 'publication_date': {'publication_year': '1983'}, 'original_text': '1. Riley,J.W.et al.: Hemorrhagic colitis associated with a rare Escherichia coli serotype. N. Engl. J. Med.,308: 681-685,1983'}, {'sequence': '2', 'volume': '8', 'issue': '3', 'first_page': '489', 'publication_date': {'publication_year': '2001'}, 'original_text': '2. Iida K, Mizunoe Y, Wai SN, Yoshida S. Type 1 fimbriation and its phase switching in diarrheagenic Escherichia coli strains. Clin Diagn Lab Immunol. 2001 May; 8(3): 489-95.'}, {'sequence': '3', 'volume': '99', 'issue': '6', 'first_page': '545', 'publication_date': {'publication_year': '2004'}, 'original_text': '3. Campos LC, Franzolin MR, Trabulsi LR. Diarrheagenic Escherichia coli categories among the traditional enteropathogenic E. coli O serogroups-a review. Mem Inst Oswaldo Cruz. 2004 Oct; 99(6): 545-52. 
Epub 2004 Nov 18.'}]}}] +source_dict_creators = [{'status': 'OK', 'apiType': 'doi', 'apiVersion': '1.0.0', 'message': {'total': 1, 'rows': 1, 'totalPages': 1, 'page': 1}, 'data': {'siteId': 'SI/JST.JSTAGE', 'content_type': 'JA', 'doi': '10.11163/akanekai.3.1', 'url': 'https://doi.org/10.11163/akanekai.3.1', 'ra': 'JaLC', 'prefix': '10.11163', 'site_name': 'J-STAGE', 'publisher_list': [{'publisher_name': 'Akane Medical Corporation, Showa Hospital', 'lang': 'en'}, {'publisher_name': '医療法人茜会\u3000昭和病院', 'lang': 'ja'}], 'title_list': [{'lang': 'en', 'title': "Let's spend hospitalized life comfortably and feel life rhythms"}, {'lang': 'ja', 'title': '生活リズムを感じながら,快適な入院生活を送ろう'}], 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'FUKUNAGA', 'first_name': 'Sayoko'}, {'lang': 'ja', 'last_name': '福永', 'first_name': '小夜子'}], 'affiliation_list': [{'affiliation_name': 'A3 Ward Showa Hospital, Akane Medical Corporation', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '医療法人 茜会 昭和病院 A3病棟', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'TANAKA', 'first_name': 'Yuuko'}, {'lang': 'ja', 'last_name': '田中', 'first_name': '裕子'}], 'affiliation_list': [{'affiliation_name': 'A3 Ward Showa Hospital, Akane Medical Corporation', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '医療法人 茜会 昭和病院 A3病棟', 'sequence': '1', 'lang': 'ja'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'NAKAMURA', 'first_name': 'Youko'}, {'lang': 'ja', 'last_name': '中村', 'first_name': '陽子'}], 'affiliation_list': [{'affiliation_name': 'A3 Ward Showa Hospital, Akane Medical Corporation', 'sequence': '1', 'lang': 'en'}, {'affiliation_name': '医療法人 茜会 昭和病院 A3病棟', 'sequence': '1', 'lang': 'ja'}]}], 'publication_date': {'publication_year': '2006'}, 'relation_list': [{'content': 'http://www.jstage.jst.go.jp/article/akanekai/3/1/3_1_1/_pdf', 'type': 'URL', 'relation': 'fullTextPdf'}], 'content_language': 'ja', 'updated_date': '2014-12-09', 'article_type': 'pub', 'journal_id_list': [{'journal_id': '1880-151X', 'type': 'ISSN', 'issn_type': 'online'}, {'journal_id': '1880-1528', 'type': 'ISSN', 'issn_type': 'print'}, {'journal_id': 'akanekai', 'type': 'JID'}], 'journal_title_name_list': [{'journal_title_name': 'The Journal of Showa Hospital', 'type': 'full', 'lang': 'en'}, {'journal_title_name': 'The Journal of Showa Hospital', 'type': 'abbreviation', 'lang': 'en'}, {'journal_title_name': '昭和病院雑誌', 'type': 'full', 'lang': 'ja'}, {'journal_title_name': '昭和病院雑誌', 'type': 'abbreviation', 'lang': 'ja'}], 'journal_classification': '01', 'recorded_year': '2004-2007', 'volume': '3', 'issue': '1', 'first_page': '1', 'last_page': '5', 'date': '2007-06-10', 'keyword_list': [{'keyword': '生活リズム', 'sequence': '1', 'lang': 'ja'}, {'keyword': 'Life rhythms', 'sequence': '1', 'lang': 'en'}], 'citation_list': [{'sequence': '1', 'volume': '08', 'first_page': 'h0805', 'publication_date': {'publication_year': '2002'}, 'original_text': '1. 国際生活機能分類、厚生労働省、http://www.mhlw.go.jp/houdou/2002/08/h0805-1.html'}, {'sequence': '2', 'volume': '2004', 'first_page': '0208', 'publication_date': {'publication_year': '2005'}, 'original_text': '2. 沖住 省吾, 内藤 由美子, 竹内 文夫, 土屋 幸代、通所リハビリテーションにおける個別療法導入後の変化について、日本理学療法学術大会, Vol. 
2004 (2005) pp.E0208-E0208'}, {'sequence': '3', 'doi': '10.11350/jspta.2.31', 'volume': '2', 'issue': '1', 'first_page': '31', 'last_page': '34', 'publication_date': {'publication_year': '1994'}, 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'ja', 'last_name': '清宮', 'first_name': '清美'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'ja', 'last_name': '塚野', 'first_name': '信'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'ja', 'last_name': '今井', 'first_name': '基次'}]}], 'content_language': 'ja', 'original_text': '3. 清宮 清美, 塚野 信, 今井 基次、当センターで関わった機能訓練事業について、埼玉理学療法, Vol. 2 (1994) No. 1 pp.31-34'}, {'sequence': '4', 'doi': '10.11163/akanekai.2.14', 'title_list': [{'lang': 'en', 'title': 'Improvement of ADL by rehabilitation training'}], 'volume': '2', 'issue': '1', 'first_page': '14', 'last_page': '18', 'publication_date': {'publication_year': '2005'}, 'creator_list': [{'sequence': '1', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'MIYAZAKI', 'first_name': 'Mitsuko'}, {'lang': 'ja', 'last_name': '宮崎', 'first_name': '満子'}]}, {'sequence': '2', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'HAYASHI', 'first_name': 'Minako'}, {'lang': 'ja', 'last_name': '林', 'first_name': '美奈子'}]}, {'sequence': '3', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'NISHIOKA', 'first_name': 'Chizue'}, {'lang': 'ja', 'last_name': '西岡', 'first_name': '千鶴恵'}]}, {'sequence': '4', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'NISHI', 'first_name': 'Atsuko'}, {'lang': 'ja', 'last_name': '西', 'first_name': 'アツ子'}]}, {'sequence': '5', 'type': 'person', 'names': [{'lang': 'en', 'last_name': 'HORITA', 'first_name': 'Masao'}, {'lang': 'ja', 'last_name': '堀田', 'first_name': '雅央'}]}], 'original_text': '4. 宮崎 満子, 林 美奈子, 西岡 千鶴恵, 西 アツ子, 堀田 雅央、リハビリ看護を行なうことでADLの活性化を図る、昭和病院雑誌 Vol. 2 (2005) No. 
1 pp.14-18'}]}}, + {'status': 'OK', 'apiType': 'doi', 'apiVersion': '1.0.0', + 'message': {'total': 1, 'rows': 1, 'totalPages': 1, 'page': 1}, + 'data': {'siteId': 'SI/JST.JSTAGE', 'content_type': 'JA', 'doi': '10.11163/akanekai.2.52', + 'url': 'https://doi.org/10.11163/akanekai.2.52', 'ra': 'JaLC', 'prefix': '10.11163', + 'site_name': 'J-STAGE', 'publisher_list': [ + {'publisher_name': 'Akane Medical Corporation, Showa Hospital', 'lang': 'en'}, + {'publisher_name': '医療法人茜会\u3000昭和病院', 'lang': 'ja'}], + 'title_list': [{'lang': 'en', 'title': 'Fractures at Showa Hospital'}, + {'title': '昭和病院における骨折の現状'}], 'creator_list': [ + {'sequence': '1', 'type': 'person', + 'names': [{'lang': 'en', 'last_name': 'AOKI', 'first_name': 'Michiko'}, + {'lang': 'ja', 'last_name': '青木', 'first_name': '美智子'}], + 'affiliation_list': [ + {'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', + 'lang': 'en'}, + {'affiliation_name': '医療法人茜会 昭和病院 放射線課', 'sequence': '1', + 'lang': 'ja'}]}, {'sequence': '2', 'type': 'person', 'names': [ + {'lang': 'en', 'last_name': 'WADA', 'first_name': 'Hiroyuki'}, + {'lang': 'ja', 'last_name': '和田', 'first_name': '宏幸'}], 'affiliation_list': [ + {'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', + 'lang': 'en'}, + {'affiliation_name': '医療法人茜会 昭和病院 放射線課', 'sequence': '1', + 'lang': 'ja'}]}, {'sequence': '3', 'type': 'person', 'names': [ + {'lang': 'en', 'last_name': 'ARAI', 'first_name': 'Keiko'}, + {'lang': 'ja', 'last_name': '新井', 'first_name': '敬子'}], 'affiliation_list': [ + {'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', + 'lang': 'en'}, + {'affiliation_name': '医療法人茜会 昭和病院 放射線課', 'sequence': '1', + 'lang': 'ja'}]}, {'sequence': '4', 'type': 'person', 'names': [ + {'lang': 'en', 'last_name': 'TAMIYA', 'first_name': 'Chiharu'}, + {'lang': 'ja', 'last_name': '田宮', 'first_name': '千春'}], 'affiliation_list': [ + {'affiliation_name': 'Showa Hospital, Akane Medical Corporation', 'sequence': '1', + 'lang': 'en'}, + {'affiliation_name': '医療法人茜会 昭和病院 放射線課', 'sequence': '1', + 'lang': 'ja'}]}], 'publication_date': {'publication_year': '2005'}, + 'relation_list': [ + {'content': 'http://www.jstage.jst.go.jp/article/akanekai/2/1/2_1_52/_pdf', + 'type': 'URL', 'relation': 'fullTextPdf'}], 'updated_date': '2014-12-09', + 'article_type': 'pub', 'journal_id_list': [ + {'journal_id': '1880-151X', 'type': 'ISSN', 'issn_type': 'online'}, + {'journal_id': '1880-1528', 'type': 'ISSN', 'issn_type': 'print'}, + {'journal_id': 'akanekai', 'type': 'JID'}], 'journal_title_name_list': [ + {'journal_title_name': 'The Journal of Showa Hospital', 'type': 'full', 'lang': 'en'}, + {'journal_title_name': 'The Journal of Showa Hospital', 'type': 'abbreviation', + 'lang': 'en'}, {'journal_title_name': '昭和病院雑誌', 'type': 'full', 'lang': 'ja'}, + {'journal_title_name': '昭和病院雑誌', 'type': 'abbreviation', 'lang': 'ja'}], + 'journal_classification': '01', 'recorded_year': '2004-2007', 'volume': '2', + 'issue': '1', 'first_page': '52', 'last_page': '56', 'date': '2005-09-13', + 'keyword_list': [{'keyword': '骨折', 'sequence': '1', 'lang': 'ja'}, + {'keyword': 'fracture', 'sequence': '1', 'lang': 'en'}], + 'citation_list': [{'sequence': '1', 'doi': '10.11163/akanekai.1.1', 'title_list': [ + {'lang': 'en', + 'title': 'Ergot derivative dopamine agonist pergolide and cabergoline induced a man with Parkinsonism pleural effusion'}], + 'volume': '1', 'issue': '1', 'first_page': '1', 'last_page': '6', + 'publication_date': 
{'publication_year': '2004'}, 'creator_list': [ + {'sequence': '1', 'type': 'person', + 'names': [{'lang': 'en', 'last_name': 'Muta', 'first_name': 'Yoshihiro'}, + {'lang': 'ja', 'last_name': '牟田', 'first_name': '好博'}]}, + {'sequence': '2', 'type': 'person', + 'names': [{'lang': 'en', 'last_name': 'Hisazumi', 'first_name': 'Taichi'}, + {'lang': 'ja', 'last_name': '久澄', 'first_name': '太一'}]}, + {'sequence': '3', 'type': 'person', + 'names': [{'lang': 'en', 'last_name': 'Unoki', 'first_name': 'Hideaki'}, + {'lang': 'ja', 'last_name': '鵜木', 'first_name': '秀明'}]}, + {'sequence': '4', 'type': 'person', + 'names': [{'lang': 'en', 'last_name': 'Kawai', 'first_name': 'Motoharu'}, + {'lang': 'ja', 'last_name': '川井', 'first_name': '元晴'}]}], + 'original_text': '1. 長澤俊明、久澄太一、鵜木秀明、吉水卓見、昭和病院入院患者様の転倒骨折とその特殊性、昭和病院雑誌\u3000第1巻1号Sep, 10, 2004'}, + {'sequence': '2', 'volume': '130', 'issue': '2', + 'first_page': '142', + 'publication_date': {'publication_year': '2005'}, + 'original_text': '2. Andress HJ, Grubwinkler M, Forkl H, Schinkel C, Lob G. [Change of daily life activity after femoral hip fracture in elderly patients.] Zentralbl Chir. 2005 Apr; 130(2): 142-7. German.'}, + {'sequence': '3', 'first_page': '21', + 'publication_date': {'publication_year': '2005'}, + 'original_text': '3. Fjalestad T, Stromsoe K, Blucher J, Tennoe B. Fractures in the proximal humerus: functional outcome and evaluation of 70 patients treated in hospital. Arch Orthop Trauma Surg. 2005 Apr 21'}, + {'sequence': '4', 'volume': '46', 'issue': '2', 'first_page': '102', + 'publication_date': {'publication_year': '2001'}, + 'original_text': '4. Senghor J, Sy MH, Ndiaye A, Dansokho AV, Seye SI. [Trochanteric fractures in elderly patients: management and prognosis of 68 cases] Dakar Med. 2001; 46(2): 102-4. 
French.'}]}}, + ] +DIR = "D:\JOCI\JOCI_PRE_SAMPLE" +CSV_FILE = "D:\JOCI\sample_count.csv" +class CountMetadataLangTest(unittest.TestCase): + + def test_count_publisher_lang(self): + my_dict = CountMetadataLang() + results_citing = my_dict.count_publisher_lang(source_dict_publisher_citing, citing=True) + en_ja = results_citing[0] + en = results_citing[1] + ja = results_citing[2] + self.assertEqual(1, en_ja) + self.assertEqual(1, en) + self.assertEqual(1, ja) + results_cited = my_dict.count_publisher_lang(source_dict_publisher_cited, citing=False) + en_ja_cited = results_cited[0] + en_cited = results_cited[1] + ja_cited = results_cited[2] + self.assertEqual(0, en_ja_cited) + self.assertEqual(1, en_cited) + self.assertEqual(1, ja_cited) + + def test_count_journal_title_lang(self): + my_dict = CountMetadataLang() + results_citing = my_dict.count_journal_title_lang(source_dict_publisher_citing, citing=True) + en_ja = results_citing[0] + en = results_citing[1] + ja = results_citing[2] + self.assertEqual(2, en_ja) + self.assertEqual(0, en) + self.assertEqual(0, ja) + results_cited = my_dict.count_journal_title_lang(source_dict_journal_title_cited, citing=False) + en_ja_cited = results_cited[0] + en_cited = results_cited[1] + ja_cited = results_cited[2] + self.assertEqual(0, en_ja_cited) + self.assertEqual(2, en_cited) + self.assertEqual(2, ja_cited) + + def test_count_title_lang(self): + my_dict = CountMetadataLang() + results_citing = my_dict.count_title_lang(source_dict_publisher_citing, citing=True) + en_ja = results_citing[0] + en = results_citing[1] + ja = results_citing[2] + self.assertEqual(1, en_ja) + self.assertEqual(0, en) + self.assertEqual(2, ja) + results_cited = my_dict.count_title_lang(source_dict_title_cited, citing=False) + en_ja_cited = results_cited[0] + en_cited = results_cited[1] + ja_cited = results_cited[2] + self.assertEqual(0, en_ja_cited) + self.assertEqual(4, en_cited) + self.assertEqual(0, ja_cited) + + def test_count_creator_names_lang(self): + my_dict = CountMetadataLang() + results_citing = my_dict.count_creator_names_lang(source_dict_creators, citing=True) + en_ja = results_citing[0] + en = results_citing[1] + ja = results_citing[2] + self.assertEqual(7, en_ja) + self.assertEqual(0, en) + self.assertEqual(0, ja) + results_cited = my_dict.count_creator_names_lang(source_dict_creators, citing=False) + en_ja_cited = results_cited[0] + en_cited = results_cited[1] + ja_cited = results_cited[2] + self.assertEqual(9, en_ja_cited) + self.assertEqual(0, en_cited) + self.assertEqual(3, ja_cited) + + def test_call_functions_for_all_zips(self): + my_dict = CountMetadataLang() + all_zips = my_dict.find_zip_subfiles(DIR) + my_dict.call_functions_for_all_zips(all_zips, [my_dict.count_publisher_lang, my_dict.count_title_lang, my_dict.count_journal_title_lang, my_dict.count_creator_names_lang], CSV_FILE, True, True) + publisher_count = ['3', '0', '0', '0', '0', '0'] + target_row0 = 0 + title = ['1', '2', '0', '0', '3', '3'] + target_row1 = 1 + journal_title = ['3', '0', '0', '0', '0', '3'] + target_row2 = 2 + creators = ['12', '4', '0', '11', '5', '6'] + target_row3 = 3 + with open(CSV_FILE, 'r') as csv_file: + csvreader = csv.reader(csv_file) + next(csvreader) + for idx, row in enumerate(csvreader, start=0): + if idx == target_row0: + self.assertEqual(publisher_count, row) + elif idx == target_row1: + self.assertEqual(title, row) + elif idx == target_row2: + self.assertEqual(journal_title, row) + elif idx == target_row3: + self.assertEqual(creators, row) + + + + + + + + + + + + + 
+ + + diff --git a/test/crossref_process_test.py b/test/crossref_process_test.py index ccf0e83..7726c3d 100644 --- a/test/crossref_process_test.py +++ b/test/crossref_process_test.py @@ -3,30 +3,41 @@ import unittest from os.path import join from oc_ds_converter.run.crossref_process import * +from pathlib import Path + -BASE = os.path.join('test', 'crossref_processing') -TARGZ_INPUT_FOLDER = os.path.join(BASE, 'tar_gz_test') -TARGZ_INPUT = os.path.join(TARGZ_INPUT_FOLDER, '40228.tar.gz') -OUTPUT = os.path.join(BASE, 'output_dir') -PUBLISHERS_MAPPING = os.path.join(BASE, 'publishers.csv') -WANTED_DOIS_FOLDER = os.path.join(BASE, 'wanted_dois') -IOD = os.path.join(BASE, 'iod') -CACHE = os.path.join(BASE, 'cache.json') -DB = os.path.join(BASE, 'anydb.db') -TARGZ_CITED_INPUT_FOLDER = os.path.join(BASE, 'tar_gz_cited_test') -TARGZ_CITED_INPUT = os.path.join(TARGZ_CITED_INPUT_FOLDER, '3.tar.gz') class CrossrefProcessTest(unittest.TestCase): + def setUp(self) -> None: + self.test_dir = os.path.join('test', 'crossref_processing') + self.targz_input_folder = os.path.join(self.test_dir, 'tar_gz_test') + self.targz_input = os.path.join(self.targz_input_folder, '40228.tar.gz') + self.output = os.path.join(self.test_dir, 'output_dir') + self.publisher_mapping = os.path.join(self.test_dir, 'publishers.csv') + self.wanted_dois = os.path.join(self.test_dir, 'wanted_dois') + self.iod = os.path.join(self.test_dir, 'iod') + self.cache = os.path.join(self.test_dir, 'cache.json') + self.db = os.path.join(self.test_dir, 'anydb.db') + self.targz_cited_folder = os.path.join(self.test_dir, 'tar_gz_cited_test') + self.targz_cited_input = os.path.join(self.targz_cited_folder, '3.json.tar.gz') + self.gzip_input = os.path.join(self.test_dir, 'gzip_test') + self.sample_fake_dump_dir = os.path.join(self.test_dir, 'tar_gz_wrong_cited_doi') + self.sample_fake_dump = os.path.join(self.sample_fake_dump_dir, '1.tar.gz') + self.any_db1 = join(self.test_dir, "anydb1.db") + + def test_preprocess_base_decompress_and_read_without_cited(self): """CASE 1: compressed input without cited entities""" - if os.path.exists(OUTPUT): - shutil.rmtree(OUTPUT) + if os.path.exists(self.output): + shutil.rmtree(self.output) - citations_output_path = OUTPUT + "_citations" + citations_output_path = self.output + "_citations" if os.path.exists(citations_output_path): shutil.rmtree(citations_output_path) - preprocess(TARGZ_INPUT, PUBLISHERS_MAPPING, IOD, OUTPUT, redis_storage_manager=False, storage_path=DB, cache = CACHE) + preprocess(self.targz_input, publishers_filepath=self.publisher_mapping, orcid_doi_filepath=self.iod, csv_dir=self.output, redis_storage_manager=False, storage_path=self.db, cache = self.cache) + + citations_in_output = 0 encountered_ids = set() unique_entities = 0 @@ -49,19 +60,19 @@ def test_preprocess_base_decompress_and_read_without_cited(self): self.assertEqual(expected_entities_in_output, unique_entities) self.assertEqual(expected_citations_in_output, citations_in_output) - shutil.rmtree(OUTPUT) + shutil.rmtree(self.output) shutil.rmtree(citations_output_path) def test_preprocess_base_and_decompress_with_cited(self): """CASE2: compressed input with cited entities""" - if os.path.exists(OUTPUT): - shutil.rmtree(OUTPUT) + if os.path.exists(self.output): + shutil.rmtree(self.output) - citations_output_path = OUTPUT + "_citations" + citations_output_path = self.output + "_citations" if os.path.exists(citations_output_path): shutil.rmtree(citations_output_path) - preprocess(crossref_json_dir=TARGZ_CITED_INPUT, 
publishers_filepath=PUBLISHERS_MAPPING, orcid_doi_filepath=IOD, csv_dir=OUTPUT, redis_storage_manager=False, storage_path=DB, cache = CACHE) + preprocess(crossref_json_dir=self.targz_cited_input, publishers_filepath=self.publisher_mapping, orcid_doi_filepath=self.iod, csv_dir=self.output, redis_storage_manager=False, storage_path=self.db, cache = self.cache) citations_in_output = 0 encountered_ids = set() unique_entities = 0 @@ -84,16 +95,16 @@ def test_preprocess_base_and_decompress_with_cited(self): self.assertEqual(expected_entities_in_output, unique_entities) self.assertEqual(expected_citations_in_output, citations_in_output) - '''citations_files_n = len(list(os.listdir(citations_output_path))) + citations_files_n = len(list(os.listdir(citations_output_path))) - #shutil.rmtree(citations_output_path) + shutil.rmtree(citations_output_path) - meta_files_n = len(list(os.listdir(OUTPUT))) + meta_files_n = len(list(os.listdir(self.output))) # Make sure that a meta table row was created for each entity entities_in_meta_output = 0 - for file in os.listdir(OUTPUT): - with open(os.path.join(OUTPUT, file), 'r', encoding='utf-8') as f: + for file in os.listdir(self.output): + with open(os.path.join(self.output, file), 'r', encoding='utf-8') as f: entities_in_meta_output += len(list(csv.DictReader(f))) self.assertEqual(expected_entities_in_output, entities_in_meta_output) @@ -104,5 +115,187 @@ def test_preprocess_base_and_decompress_with_cited(self): self.assertTrue(meta_files_n == 2) self.assertTrue(citations_files_n == 1) - #shutil.rmtree(OUTPUT)''' - '''os.remove(DB)''' \ No newline at end of file + shutil.rmtree(self.output) + os.remove(self.db) + + def test_preprocess_base_and_decompress_with_cited_redis(self): + """CASE2: compressed input with cited entities""" + if os.path.exists(self.output): + shutil.rmtree(self.output) + + citations_output_path = self.output + "_citations" + if os.path.exists(citations_output_path): + shutil.rmtree(citations_output_path) + + preprocess(crossref_json_dir=self.targz_cited_input, publishers_filepath=self.publisher_mapping, orcid_doi_filepath=self.iod, csv_dir=self.output, redis_storage_manager=True, storage_path=self.any_db1, cache = self.cache) + citations_in_output = 0 + encountered_ids = set() + unique_entities = 0 + + for file in os.listdir(citations_output_path): + with open(os.path.join(citations_output_path, file), 'r', encoding='utf-8') as f: + cits_rows = list(csv.DictReader(f)) + citations_in_output += len(cits_rows) + for x in cits_rows: + citing_ids = x["citing"].split(" ") + citied_ids = x["cited"].split(" ") + if all(id not in encountered_ids for id in citing_ids): + unique_entities += 1 + encountered_ids.update(citing_ids) + if all(id not in encountered_ids for id in citied_ids): + unique_entities += 1 + encountered_ids.update(citied_ids) + expected_entities_in_output = 17 + expected_citations_in_output = 16 + self.assertEqual(expected_entities_in_output, unique_entities) + self.assertEqual(expected_citations_in_output, citations_in_output) + + citations_files_n = len(list(os.listdir(citations_output_path))) + + shutil.rmtree(citations_output_path) + + meta_files_n = len(list(os.listdir(self.output))) + + # Make sure that a meta table row was created for each entity + entities_in_meta_output = 0 + for file in os.listdir(self.output): + with open(os.path.join(self.output, file), 'r', encoding='utf-8') as f: + entities_in_meta_output += len(list(csv.DictReader(f))) + + self.assertEqual(expected_entities_in_output, entities_in_meta_output) + 
self.assertEqual(unique_entities, entities_in_meta_output) + + + # make sure that for each of the input files was created a citation file and two meta input file + self.assertTrue(meta_files_n == 2) + self.assertTrue(citations_files_n == 1) + + shutil.rmtree(self.output) + #os.remove(self.any_db1) + + + + def test_preprocess_wrong_doi_cited(self): + + if os.path.exists(self.output): + shutil.rmtree(self.output) + + citations_output_path = self.output + "_citations" + if os.path.exists(citations_output_path): + shutil.rmtree(citations_output_path) + + preprocess(self.sample_fake_dump, publishers_filepath=self.publisher_mapping, orcid_doi_filepath=self.iod, csv_dir=self.output, redis_storage_manager=False, storage_path=self.db, cache = self.cache) + + citations_in_output = 0 + encountered_ids = set() + unique_entities = 0 + + for file in os.listdir(citations_output_path): + with open(os.path.join(citations_output_path, file), 'r', encoding='utf-8') as f: + cits_rows = list(csv.DictReader(f)) + citations_in_output += len(cits_rows) + for x in cits_rows: + citing_ids = x["citing"].split(" ") + citied_ids = x["cited"].split(" ") + if all(id not in encountered_ids for id in citing_ids): + unique_entities += 1 + encountered_ids.update(citing_ids) + if all(id not in encountered_ids for id in citied_ids): + unique_entities += 1 + encountered_ids.update(citied_ids) + + expected_citations_in_output = 15 + + expected_entities_in_output = 16 + + + self.assertEqual(expected_entities_in_output, unique_entities) + self.assertEqual(expected_citations_in_output, citations_in_output) + + shutil.rmtree(self.output) + shutil.rmtree(citations_output_path) + + + os.remove(self.db) + + def test_any_db_creation_redis_no_testing(self): + try: + rsm = RedisStorageManager(testing=False) + rsm.set_value("TEST VALUE", False) + run_test = True + except: + run_test = False + print("test skipped: 'test_any_db_creation_redis_no_testing': Connect to redis before running the test") + + if run_test: + rsm.del_value("TEST VALUE") + if not len(rsm.get_all_keys()): + preprocess(crossref_json_dir=self.targz_cited_input, publishers_filepath=self.publisher_mapping, + orcid_doi_filepath=self.iod, csv_dir=self.output, redis_storage_manager=True, + storage_path=self.db, cache=self.cache) + + + rsm.delete_storage() + + else: + #print("get_all_keys()", rsm.get_all_keys()) + #rsm.delete_storage() + print("test skipped: 'test_storage_management_no_testing' because redis db 2 is not empty") + + def test_cache(self): + 'Nothing should be produced in output, since the cache file reports that all the files in input were completed' + + + if os.path.exists(self.output): + shutil.rmtree(self.output) + + citations_output_path = self.output + "_citations" + if os.path.exists(citations_output_path): + shutil.rmtree(citations_output_path) + cache_dict = {'first_iteration':[], 'second_iteration':[]} + targz_fd = tarfile.open(self.targz_cited_input, "r:gz", encoding="utf-8") + for cur_file in targz_fd: + if cur_file.name.endswith('.json') and not basename(cur_file.name).startswith("."): + cache_dict['first_iteration'].append(Path(cur_file.name).name) + cache_dict['second_iteration'].append(Path(cur_file.name).name) + + with open(self.cache, "w") as write_cache: + json.dump(cache_dict, write_cache) + + preprocess(crossref_json_dir=self.targz_cited_input, publishers_filepath=self.publisher_mapping, + orcid_doi_filepath=self.iod, csv_dir=self.output, redis_storage_manager=True, + storage_path=self.db, cache=self.cache) + + citations_in_output = 0 + 
encountered_ids = set() + unique_entities = 0 + + + + for file in os.listdir(citations_output_path): + with open(os.path.join(citations_output_path, file), 'r', encoding='utf-8') as f: + cits_rows = list(csv.DictReader(f)) + citations_in_output += len(cits_rows) + for x in cits_rows: + citing_ids = x["citing"].split(" ") + citied_ids = x["cited"].split(" ") + if all(id not in encountered_ids for id in citing_ids): + unique_entities += 1 + encountered_ids.update(citing_ids) + + if all(id not in encountered_ids for id in citied_ids): + unique_entities += 1 + encountered_ids.update(citied_ids) + + expected_citations_in_output = 0 + + expected_entities_in_output = 0 + + + self.assertEqual(expected_entities_in_output, unique_entities) + self.assertEqual(expected_citations_in_output, citations_in_output) + + shutil.rmtree(citations_output_path) + shutil.rmtree(self.output) + + diff --git a/test/jalc_process_test.py b/test/jalc_process_test.py index 69ed6af..b53eef9 100644 --- a/test/jalc_process_test.py +++ b/test/jalc_process_test.py @@ -82,7 +82,7 @@ def test_preprocess_base_decompress_and_read(self): citations_files_n = len(list(os.listdir(citations_output_path))) - shutil.rmtree(citations_output_path) + #shutil.rmtree(citations_output_path) meta_files_n = len(list(os.listdir(self.output_dir))) @@ -104,7 +104,7 @@ def test_preprocess_base_decompress_and_read(self): self.assertTrue(meta_files_n == 2*input_files_n == 4) self.assertTrue(citations_files_n == input_files_n) - shutil.rmtree(self.output_dir) + # shutil.rmtree(self.output_dir) for el in os.listdir(self.sample_dump_dir): if el.endswith("decompr_zip_dir"): @@ -147,7 +147,7 @@ def test_preprocess_wrong_doi_cited(self): expected_citations_in_output = 1 expected_entities_in_output = 2 - '''3 cited: + ''''3 cited: - 10.5100/jje.30.40: doi not found, - 10.5100/jje.33.1: https://www.jstage.jst.go.jp/article/jje1965/33/1/33_1_1/_article/-char/ja/, - 10.1539/joh1959.5.56: doi not found''' diff --git a/test/processing_crossref_test.py b/test/processing_crossref_test.py deleted file mode 100644 index dca9108..0000000 --- a/test/processing_crossref_test.py +++ /dev/null @@ -1,415 +0,0 @@ -import csv -import os -import shutil -import unittest -from pprint import pprint -from shutil import rmtree -from subprocess import Popen -from sys import executable - -from oc_ds_converter.crossref.crossref_processing import CrossrefProcessing -from oc_ds_converter.lib.csvmanager import CSVManager -from oc_ds_converter.lib.file_manager import get_csv_data -from oc_ds_converter.lib.jsonmanager import * -from oc_ds_converter.run.crossref_process import preprocess - -BASE = os.path.join('test', 'crossref_processing') -IOD = os.path.join(BASE, 'iod') -WANTED_DOIS = os.path.join(BASE, 'wanted_dois.csv') -WANTED_DOIS_FOLDER = os.path.join(BASE, 'wanted_dois') -DATA = os.path.join(BASE, '40228.json') -DATA_DIR = BASE -OUTPUT = os.path.join(BASE, 'meta_input') -MULTIPROCESS_OUTPUT = os.path.join(BASE, 'multi_process_test') -GZIP_INPUT = os.path.join(BASE, 'gzip_test') -PUBLISHERS_MAPPING = os.path.join(BASE, 'publishers.csv') - -class TestCrossrefProcessing(unittest.TestCase): - - def test_csv_creator(self): - crossref_processor = CrossrefProcessing(orcid_index=IOD, doi_csv=WANTED_DOIS_FOLDER, publishers_filepath=None) - data = load_json(DATA, None) - output = list() - for item in data['items']: - tabular_data = crossref_processor.csv_creator(item) - if tabular_data: - output.append(tabular_data) - expected_output = [ - {'id': 'doi:10.47886/9789251092637.ch7', 'title': 
'Freshwater, Fish and the Future: Proceedings of the Global Cross-Sectoral Conference', 'author': '', 'pub_date': '2016', 'venue': 'Freshwater, Fish and the Future: Proceedings of the Global Cross-Sectoral Conference', 'volume': '', 'issue': '', 'page': '', 'type': 'book chapter', 'publisher': 'American Fisheries Society [crossref:460]', 'editor': 'Lymer, David; Food and Agriculture Organization of the United Nations Fisheries and Aquaculture Department Viale delle Terme di Caracalla Rome 00153 Italy; Marttin, Felix; Marmulla, Gerd; Bartley, Devin M.'}, - {'id': 'doi:10.9799/ksfan.2012.25.1.069', 'title': 'Nonthermal Sterilization and Shelf-life Extension of Seafood Products by Intense Pulsed Light Treatment', 'author': 'Cheigh, Chan-Ick [orcid:0000-0002-6227-4053]; Mun, Ji-Hye [orcid:0000-0002-6227-4053]; Chung, Myong-Soo', 'pub_date': '2012-3-31', 'venue': 'The Korean Journal of Food And Nutrition [issn:1225-4339]', 'volume': '25', 'issue': '1', 'page': '69-76', 'type': 'journal article', 'publisher': 'The Korean Society of Food and Nutrition [crossref:4768]', 'editor': ''}, - {'id': 'doi:10.9799/ksfan.2012.25.1.105', 'title': 'A Study on Dietary Habit and Eating Snack Behaviors of Middle School Students with Different Obesity Indexes in Chungnam Area', 'author': 'Kim, Myung-Hee; Seo, Jin-Seon; Choi, Mi-Kyeong [orcid:0000-0002-6227-4053]; Kim, Eun-Young', 'pub_date': '2012-3-31', 'venue': 'The Korean Journal of Food And Nutrition [issn:1225-4339]', 'volume': '25', 'issue': '1', 'page': '105-115', 'type': 'journal article', 'publisher': 'The Korean Society of Food and Nutrition [crossref:4768]', 'editor': ''}, - {'id': 'doi:10.9799/ksfan.2012.25.1.123', 'title': 'The Protective Effects of Chrysanthemum cornarium L. var. spatiosum Extract on HIT-T15 Pancreatic β-Cells against Alloxan-induced Oxidative Stress', 'author': 'Kim, In-Hye; Cho, Kang-Jin; Ko, Jeong-Sook; Kim, Jae-Hyun; Om, Ae-Son', 'pub_date': '2012-3-31', 'venue': 'The Korean Journal of Food And Nutrition [issn:1225-4339]', 'volume': '25', 'issue': '1', 'page': '123-131', 'type': 'journal article', 'publisher': 'The Korean Society of Food and Nutrition [crossref:4768]', 'editor': ''} - ] - self.assertEqual(output, expected_output) - - def test_get_ref_dois_not_in_crossref(self): - output_path = os.path.join(BASE, 'not_crossref_ref') - p = Popen([executable, '-m', 'oc_ds_converter.crossref.get_not_crossref_ref', '-c', GZIP_INPUT, '-o', output_path, '-w', WANTED_DOIS_FOLDER]) - p.wait() - output = {doi['id'] for doi in get_csv_data(os.path.join(output_path, 'dois_not_in_crossref', '1-3.csv'))} - expected_output = {'10.1001/jama.299.12.1471', '10.1177/003335490812300219', '10.2105/ajph.2006.101626'} - rmtree(output_path) - self.assertEqual(output, expected_output) - - def test_orcid_finder(self): - crossref_processor = CrossrefProcessing(IOD, WANTED_DOIS) - orcid_found = crossref_processor.orcid_finder('10.9799/ksfan.2012.25.1.105') - expected_output = {'0000-0002-6227-4053': 'choi, mi-kyeong'} - #print(orcid_found) - self.assertEqual(orcid_found, expected_output) - - def test_get_agents_strings_list_overlapping_surnames(self): - # The surname of one author is included in the surname of another. 
- authors_list = [ - { - "given": "Puvaneswari", - "family": "Paravamsivam", - "sequence": "first", - "affiliation": [], - "role": "author" - }, - { - "given": "Chua Kek", - "family": "Heng", - "sequence": "additional", - "affiliation": [], - "role": "author" - }, - { - "given": "Sri Nurestri Abdul", - "family": "Malek", - "sequence": "additional", - "affiliation": [], - "role": "author" - }, - { - "given": "Vikineswary", - "family": "Sabaratnam", - "sequence": "additional", - "affiliation": [], - "role": "author" - }, - { - "given": "Ravishankar Ram", - "family": "M", - "sequence": "additional", - "affiliation": [], - "role": "author" - }, - { - "given": "Sri Nurestri Abdul", - "family": "Malek", - "sequence": "additional", - "affiliation": [], - "role": "editor" - }, - { - "given": "Umah Rani", - "family": "Kuppusamy", - "sequence": "additional", - "affiliation": [], - "role": "author" - } - ] - crossref_processor = CrossrefProcessing(None, None) - csv_manager = CSVManager() - csv_manager.data = {'10.9799/ksfan.2012.25.1.105': {'Malek, Sri Nurestri Abdul [0000-0001-6278-8559]'}} - crossref_processor.orcid_index = csv_manager - authors_strings_list, editors_strings_list = crossref_processor.get_agents_strings_list('10.9799/ksfan.2012.25.1.105', authors_list) - expected_authors_list = ['Paravamsivam, Puvaneswari', 'Heng, Chua Kek', 'Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]', 'Sabaratnam, Vikineswary', 'M, Ravishankar Ram', 'Kuppusamy, Umah Rani'] - expected_editors_list = ['Malek, Sri Nurestri Abdul [orcid:0000-0001-6278-8559]'] - self.assertEqual((authors_strings_list, editors_strings_list), (expected_authors_list, expected_editors_list)) - - def test_get_agents_strings_list(self): - authors_list = [ - { - 'given': 'Myung-Hee', - 'family': 'Kim', - 'affiliation': [], - "role": "author" - }, - { - 'given': 'Jin-Seon', - 'family': 'Seo', - 'affiliation': [], - "role": "author" - }, - { - 'given': 'Mi-Kyeong', - 'family': 'Choi', - 'affiliation': [], - "role": "author" - }, - { - 'given': 'Eun-Young', - 'family': 'Kim', - 'affiliation': [], - "role": "author" - } - ] - crossref_processor = CrossrefProcessing(IOD, WANTED_DOIS) - authors_strings_list, _ = crossref_processor.get_agents_strings_list('10.9799/ksfan.2012.25.1.105', authors_list) - expected_authors_list = ['Kim, Myung-Hee', 'Seo, Jin-Seon', 'Choi, Mi-Kyeong [orcid:0000-0002-6227-4053]', 'Kim, Eun-Young'] - self.assertEqual(authors_strings_list, expected_authors_list) - - def test_get_agents_strings_list_same_family(self): - # Two authors have the same family name and the same given name initials - authors_list = [ - { - 'given': 'Mi-Kyeong', - 'family': 'Choi', - 'affiliation': [], - "role": "author" - }, - { - 'given': 'Mi-Hong', - 'family': 'Choi', - 'affiliation': [], - "role": "author" - } - ] - crossref_processor = CrossrefProcessing(IOD, WANTED_DOIS) - authors_strings_list, _ = crossref_processor.get_agents_strings_list('10.9799/ksfan.2012.25.1.105', authors_list) - expected_authors_list = ['Choi, Mi-Kyeong [orcid:0000-0002-6227-4053]', 'Choi, Mi-Hong'] - self.assertEqual(authors_strings_list, expected_authors_list) - - def test_get_agents_strings_list_homonyms(self): - # Two authors have the same family name and the same given name - authors_list = [ - { - 'given': 'Mi-Kyeong', - 'family': 'Choi', - 'affiliation': [], - "role": "author" - }, - { - 'given': 'Mi-Kyeong', - 'family': 'Choi', - 'affiliation': [], - "role": "author" - } - ] - crossref_processor = CrossrefProcessing(IOD, WANTED_DOIS) - 
authors_strings_list, _ = crossref_processor.get_agents_strings_list('10.9799/ksfan.2012.25.1.105', authors_list) - expected_authors_list = ['Choi, Mi-Kyeong', 'Choi, Mi-Kyeong'] - self.assertEqual(authors_strings_list, expected_authors_list) - - def test_get_agents_strings_list_inverted_names(self): - # One author with an ORCID has as a name the surname of another - authors_list = [ - { - 'given': 'Choi', - 'family': 'Mi-Kyeong', - 'affiliation': [], - "role": "author" - }, - { - 'given': 'Mi-Hong', - 'family': 'Choi', - 'affiliation': [], - "role": "author" - } - ] - crossref_processor = CrossrefProcessing(IOD, WANTED_DOIS) - authors_strings_list, _ = crossref_processor.get_agents_strings_list('10.9799/ksfan.2012.25.1.105', authors_list) - expected_authors_list = ['Mi-Kyeong, Choi', 'Choi, Mi-Hong'] - self.assertEqual(authors_strings_list, expected_authors_list) - - def test_id_worker(self): - field_issn = 'ISSN 1050-124X' - field_isbn = ['978-1-56619-909-4'] - issn_list = list() - isbn_list = list() - CrossrefProcessing.id_worker(field_issn, issn_list, CrossrefProcessing.issn_worker) - CrossrefProcessing.id_worker(field_isbn, isbn_list, CrossrefProcessing.isbn_worker) - expected_issn_list = ['issn:1050-124X'] - expected_isbn_list = ['isbn:9781566199094'] - self.assertEqual((issn_list, isbn_list), (expected_issn_list, expected_isbn_list)) - - def test_issn_worker(self): - input = 'ISSN 1050-124X' - output = list() - CrossrefProcessing.issn_worker(input, output) - expected_output = ['issn:1050-124X'] - self.assertEqual(output, expected_output) - - def test_isbn_worker(self): - input = '978-1-56619-909-4' - output = list() - CrossrefProcessing.isbn_worker(input, output) - expected_output = ['isbn:9781566199094'] - self.assertEqual(output, expected_output) - - def test_preprocess(self): - self.maxDiff = None - if os.path.exists(OUTPUT): - shutil.rmtree(OUTPUT) - preprocess(crossref_json_dir=MULTIPROCESS_OUTPUT, publishers_filepath=None, orcid_doi_filepath=IOD, csv_dir=OUTPUT, wanted_doi_filepath=None) - output = dict() - for file in os.listdir(OUTPUT): - with open(os.path.join(OUTPUT, file), 'r', encoding='utf-8') as f: - output[file] = list(csv.DictReader(f)) - expected_output = { - '40228.csv': [ - {'id': 'doi:10.9799/ksfan.2012.25.1.069', 'title': 'Nonthermal Sterilization and Shelf-life Extension of Seafood Products by Intense Pulsed Light Treatment', 'author': 'Cheigh, Chan-Ick; Mun, Ji-Hye; Chung, Myong-Soo', 'pub_date': '2012-3-31', 'venue': 'The Korean Journal of Food And Nutrition [issn:1225-4339]', 'volume': '25', 'issue': '1', 'page': '69-76', 'type': 'journal article', 'publisher': 'The Korean Society of Food and Nutrition [crossref:4768]', 'editor': ''}, - {'id': 'doi:10.9799/ksfan.2012.25.1.077', 'title': 'Properties of Immature Green Cherry Tomato Pickles', 'author': 'Koh, Jong-Ho; Shin, Hae-Hun; Kim, Young-Shik; Kook, Moo-Chang', 'pub_date': '2012-3-31', 'venue': 'The Korean Journal of Food And Nutrition [issn:1225-4339]', 'volume': '25', 'issue': '1', 'page': '77-82', 'type': 'journal article', 'publisher': 'The Korean Society of Food and Nutrition [crossref:4768]', 'editor': ''} - ], - '30719.csv': [ - {'id': 'doi:10.17117/na.2015.08.1067', 'title': '', 'author': '', 'pub_date': '', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': 'component', 'publisher': 'Consulting Company Ucom [crossref:6623]', 'editor': ''} - ] - } - self.assertEqual(output, expected_output) - - def test_gzip_input(self): - if os.path.exists(OUTPUT): - shutil.rmtree(OUTPUT) - 
preprocess(crossref_json_dir=GZIP_INPUT, publishers_filepath=None, orcid_doi_filepath=IOD, csv_dir=OUTPUT, wanted_doi_filepath=WANTED_DOIS) - output = dict() - for file in os.listdir(OUTPUT): - with open(os.path.join(OUTPUT, file), 'r', encoding='utf-8') as f: - output[file] = list(csv.DictReader(f)) - expected_output = { - '0.csv': [{'id': 'doi:10.1001/.389', 'title': 'Decision Making at the Fringe of Evidence: Take What You Can Get', 'author': 'Col, N. F.', 'pub_date': '2006-2-27', 'venue': 'Archives of Internal Medicine [issn:0003-9926]', 'volume': '166', 'issue': '4', 'page': '389-390', 'type': 'journal article', 'publisher': 'American Medical Association (AMA) [crossref:10]', 'editor': ''}], - '1.csv': [{'id': 'doi:10.1001/archderm.108.4.583b', 'title': 'Letter: Bleaching of hair after use of benzoyl peroxide acne lotions', 'author': 'Bleiberg, J.', 'pub_date': '1973-10-1', 'venue': 'Archives of Dermatology [issn:0003-987X]', 'volume': '108', 'issue': '4', 'page': '583-583', 'type': 'journal article', 'publisher': 'American Medical Association (AMA) [crossref:10]', 'editor': ''}] - } - self.assertEqual(output, expected_output) - - def test_tar_gz_file(self): - tar_gz_file_path = f'{BASE}/tar_gz_test/40228.tar.gz' - result, targz_fd = get_all_files(tar_gz_file_path) - for file in result: - data = load_json(file, targz_fd) - output = data['items'][0]['DOI'] - expected_output = '10.9799/ksfan.2012.25.1.069' - self.assertEqual(output, expected_output) - - def test_load_publishers_mapping(self): - output = CrossrefProcessing.load_publishers_mapping(publishers_filepath=PUBLISHERS_MAPPING) - expected_output = { - '1': {'name': 'Annals of Family Medicine', 'prefixes': {'10.1370'}}, - '2': {'name': 'American Association of Petroleum Geologists AAPG/Datapages', 'prefixes': {'10.15530', '10.1306'}}, - '3': {'name': 'American Association of Physics Teachers (AAPT)','prefixes': {'10.1119'}}, - '6': {'name': 'American College of Medical Physics (ACMP)','prefixes': {'10.1120'}}, - '9': {'name': 'Allen Press', 'prefixes': {'10.1043'}}, - '10': {'name': 'American Medical Association (AMA)', 'prefixes': {'10.1001'}}, - '11': {'name': 'American Economic Association', 'prefixes': {'10.1257'}}, - '460': {'name': 'American Fisheries Society', 'prefixes': {'10.1577', '10.47886'}} - } - self.assertEqual(output, expected_output) - - def test_get_publisher_name(self): - # The item's member is in the publishers' mapping - item = { - 'publisher': 'American Fisheries Society', - 'DOI': '10.47886\/9789251092637.ch7', - 'prefix': '10.47886', - 'member': '460' - } - doi = '10.47886/9789251092637.ch7' - crossref_processor = CrossrefProcessing(orcid_index=None, doi_csv=None, publishers_filepath=PUBLISHERS_MAPPING) - publisher_name = crossref_processor.get_publisher_name(doi, item) - self.assertEqual(publisher_name, 'American Fisheries Society [crossref:460]') - - def test_get_publisher_name_no_member(self): - # The item has no member, but the DOI prefix is the publishers' mapping - item = { - 'publisher': 'American Fisheries Society', - 'DOI': '10.47886/9789251092637.ch7', - 'prefix': '10.47886' - } - doi = '10.47886/9789251092637.ch7' - crossref_processor = CrossrefProcessing(orcid_index=None, doi_csv=None, publishers_filepath=PUBLISHERS_MAPPING) - publisher_name = crossref_processor.get_publisher_name(doi, item) - self.assertEqual(publisher_name, 'American Fisheries Society [crossref:460]') - - def test_get_venue_name(self): - item = { - 'container-title': ['Cerebrospinal Fluid [Working Title]'], - } - row = {'id': 
'', 'title': '', 'author': '', 'pub_date': '', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': 'journal article', 'publisher': '', 'editor': ''} - crossref_processor = CrossrefProcessing(orcid_index=None, doi_csv=None, publishers_filepath=PUBLISHERS_MAPPING) - venue_name = crossref_processor.get_venue_name(item, row) - self.assertEqual(venue_name, 'Cerebrospinal Fluid [Working Title]') - - def test_get_venue_name_with_ISSN(self): - item = { - 'container-title': ["troitel'stvo: nauka i obrazovanie [Construction: Science and Education]"], - 'ISSN': '2305-5502' - } - row = {'id': '', 'title': '', 'author': '', 'pub_date': '', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': 'journal article', 'publisher': '', 'editor': ''} - crossref_processor = CrossrefProcessing(orcid_index=None, doi_csv=None, publishers_filepath=PUBLISHERS_MAPPING) - venue_name = crossref_processor.get_venue_name(item, row) - self.assertEqual(venue_name, "troitel'stvo: nauka i obrazovanie [Construction: Science and Education] [issn:2305-5502]") - - def test_get_pages(self): - item = { - 'page': '469-476' - } - crossref_processor = CrossrefProcessing(orcid_index=None, doi_csv=None, publishers_filepath=PUBLISHERS_MAPPING) - pages = crossref_processor.get_crossref_pages(item) - self.assertEqual(pages, '469-476') - - def test_get_pages_right_letter(self): - item = { - 'page': 'G22' - } - crossref_processor = CrossrefProcessing(orcid_index=None, doi_csv=None, publishers_filepath=PUBLISHERS_MAPPING) - pages = crossref_processor.get_crossref_pages(item) - self.assertEqual(pages, 'G22-G22') - - def test_get_pages_wrong_letter(self): - item = { - 'page': '583b-584' - } - crossref_processor = CrossrefProcessing(orcid_index=None, doi_csv=None, publishers_filepath=PUBLISHERS_MAPPING) - pages = crossref_processor.get_crossref_pages(item) - self.assertEqual(pages, '583-584') - - def test_get_pages_roman_letters(self): - item = { - 'page': 'iv-l' - } - crossref_processor = CrossrefProcessing(orcid_index=None, doi_csv=None, publishers_filepath=PUBLISHERS_MAPPING) - pages = crossref_processor.get_crossref_pages(item) - self.assertEqual(pages, 'iv-l') - - def test_get_pages_non_roman_letters(self): - item = { - 'page': 'kj-hh' - } - crossref_processor = CrossrefProcessing(orcid_index=None, doi_csv=None, publishers_filepath=PUBLISHERS_MAPPING) - pages = crossref_processor.get_crossref_pages(item) - self.assertEqual(pages, '') - - def test_report_series_venue_id(self): - crossref_processor = CrossrefProcessing(orcid_index=IOD, doi_csv=WANTED_DOIS, publishers_filepath=None) - items = {'items': [{ - 'DOI': '10.1007/978-3-030-00668-6_8', - 'container-title': ["troitel'stvo: nauka i obrazovanie [Construction: Science and Education]"], - 'ISSN': '2305-5502', - 'type': 'report-series' - }]} - crossref_processor = CrossrefProcessing(orcid_index=None, doi_csv=None, publishers_filepath=PUBLISHERS_MAPPING) - output = list() - for item in items['items']: - output.append(crossref_processor.csv_creator(item)) - expected_output = [{'id': 'doi:10.1007/978-3-030-00668-6_8', 'title': '', 'author': '', 'pub_date': '', 'venue': "troitel'stvo: nauka i obrazovanie [Construction: Science and Education] [issn:2305-5502]", 'volume': '', 'issue': '', 'page': '', 'type': 'report series', 'publisher': '', 'editor': ''}] - self.assertEqual(output, expected_output) - - def test_report_series_br_id(self): - crossref_processor = CrossrefProcessing(orcid_index=IOD, doi_csv=WANTED_DOIS, publishers_filepath=None) - items = {'items': [{ - 'DOI': 
'10.1007/978-3-030-00668-6_8', - 'container-title': [], - 'ISSN': '2305-5502', - 'type': 'report-series' - }]} - crossref_processor = CrossrefProcessing(orcid_index=None, doi_csv=None, publishers_filepath=PUBLISHERS_MAPPING) - output = list() - for item in items['items']: - output.append(crossref_processor.csv_creator(item)) - expected_output = [{'id': 'doi:10.1007/978-3-030-00668-6_8 issn:2305-5502', 'title': '', 'author': '', 'pub_date': '', 'venue': '', 'volume': '', 'issue': '', 'page': '', 'type': 'report series', 'publisher': '', 'editor': ''}] - self.assertEqual(output, expected_output) - -if __name__ == '__main__': # pragma: no cover - unittest.main() diff --git a/test/processing_oroci_test.py b/test/processing_oroci_test.py index 1189299..31a32cc 100644 --- a/test/processing_oroci_test.py +++ b/test/processing_oroci_test.py @@ -12,7 +12,7 @@ BASE = os.path.join('test', 'openaire_processing') DATA = os.path.join(BASE, 'jSonFile_1.json') DATA_DIR = BASE -TMP_SUPPORT_MATERIAL = os.path.join(BASE,"tmp_support") +TMP_SUPPORT_MATERIAL = os.path.join(BASE, "tmp_support") OUTPUT = os.path.join(BASE, 'meta_input') MULTIPROCESS_OUTPUT = os.path.join(BASE, 'multi_process_test') MEMO_JSON_PATH = "test/openaire_processing/tmp_support/memo.json" @@ -125,7 +125,7 @@ def test_get_reids_validity_dict_w_fakeredis_db_values_redis(self): opp.RA_redis.delete('orcid:0000-0002-8090-6886') def test_validated_as_default(self): - ''' + """ Check that, given an ID dict with keys "schema" (value: string of the schema) and "identifier" (value: string of the identifier, the method "validated_as" returns: - True if the id was already validated as valid @@ -133,7 +133,7 @@ def test_validated_as_default(self): - None if the id was not validated before The procedure is tested - With default storage manager (sqlite) without a pre-existent db associated - ''' + """ opp = OpenaireProcessing() validate_as_none = opp.validated_as({"schema":"pmid", "identifier": "pmid:23483834"}) @@ -471,7 +471,7 @@ def test_dict_to_cache(self): if os.path.exists(MEMO_JSON_PATH): os.remove(MEMO_JSON_PATH) self.assertFalse(os.path.exists(MEMO_JSON_PATH)) - op.dict_to_cache(sample_dict,MEMO_JSON_PATH) + op.dict_to_cache(sample_dict, MEMO_JSON_PATH) self.assertTrue(os.path.exists(MEMO_JSON_PATH)) self.delete_storege(specific_path=MEMO_JSON_PATH) self.assertFalse(os.path.exists(MEMO_JSON_PATH)) From dda1d38e596594fa0d8e2a822058681f50af1d1b Mon Sep 17 00:00:00 2001 From: martasoricetti Date: Thu, 26 Oct 2023 16:24:13 +0200 Subject: [PATCH 2/2] fix jalc_process.py('citation_list' error key) + crossref test + jalc_languages_metadata_count --- .../tar_gz_cited_test/3.json.tar.gz | Bin 0 -> 3002 bytes .../tar_gz_wrong_cited_doi/1.tar.gz | Bin 0 -> 2995 bytes 2 files changed, 0 insertions(+), 0 deletions(-) create mode 100644 test/crossref_processing/tar_gz_cited_test/3.json.tar.gz create mode 100644 test/crossref_processing/tar_gz_wrong_cited_doi/1.tar.gz diff --git a/test/crossref_processing/tar_gz_cited_test/3.json.tar.gz b/test/crossref_processing/tar_gz_cited_test/3.json.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..9f413d128c851ea64a2e148e63df7ada8f62ad68 GIT binary patch literal 3002 zcmV;r3q|xFiwFqb#yDjJ05dLXb8l`gbYXG;?O9uI<3tqRXC(f^C@-{9?AVSS#{nVr z$`)w1G%fcPn%a|OwzkKN?UaJ{zw@1OE=`+l0uj7GRH?S-%(>6GP1k-`s{GYI`g8CL z!w~j$nE27w#;_#Qf5?-g@i{R_Xkthi?E)qnK&M@v>BD_et~@RTRw zH<7dkf3y-_39I0`qLrUK7^&^I-QexpcC%E;Otc0qTCg17a)WE9H*ms15dGL{w`dh= z&5G8*4f?&P>-58@ANAk2TUv~S7I`eJSk-ye8u(tjRn)hsEGI$(-Q;pA 
zbSbL`cBw^Kvc035%YC4iWmyw-7tg7TMP3TS|IO9c_&TYoq8xNO_xJa9JV$0LJ?_Lx zi%unUR$?QWy;>)#+l8q8r;^fllGN#6JI=uTJ(aZE3%X(C8K-`a-6G`=mdve$KLB;u zi`uPN<(0@Q%gCj2;H~qr5V0Id=>7fMB}Ae!E_32<{`3>AYE_mQ*LRi%SuNwrU>WhW z6d+knl&&m7fgM7F_bMgTw2o!TOGwT#k=V?rJ7yEX(`v#BEegB!c4>vgDzwbk>&9(& z-@XMUehczE22ks>cf2YGhScW=hr{cugQM%uI*&d!0LAAgqmcl^Q^7kS!kJG3#X=PZ%!^U3f;sUb`1AQ+0K#1;1-E%_W2{tC zCjzTt3Wq3}%%>`y0=O=Xc@sQofRR`Qh#Md^g~o^@{40c(Drr<%7Cg^MBY={SL5nfC zZAg$6ey#EKF&}2vl2ojePGlpaAqX3&JQpcM6nZLS2s_(S6e&c1!JtbcRto$;3Yb3u z3vjl%B#oxI@%JUOLzRgo$~GSF1q5!hj6blH$9D`~S;sIo>_AQ&lV2D*6Ok4RDQ1>5 zJdJCa;8nw*G1)nVH#A}z+c_7o2>6W2T&1eJ`U&fy= zqS`t3T{c1ObrUTvAJHW&%S|?;F!pIM2z;yO?k`9nm><76qpafCuH(2J*RiI4?pc#^ zgmVgPmno7%X6+Bf6(o($(&vo^-v0R zqb8ped`ue++kk8wf)XY<0L)<$qqWGdiFNxh&#@s8&r`u^6TNsvMtD6BZzpq$t!Qq* z&hkpDq>kt3p0o2!9&b_!*pR=?mFb4y4Z*9d9zmmF) zCogaOg5N6rbe~AYI(kq{CbC;o<^;XgwF#!$unD*2S+DYbMw8qVL#W7!$a1!J-hY3{W898e`|c6-P;0R_!sa6 zTj1;c1$@!o?fw=h{jE{Dc7F?e-RJmxkKGK97HGOrkR)u+!>(-@Tw~*Jc{;&qK2o|+ zy2)ZwHmuWB(!n?=OBKT(DOaE6ahv%*JMAXSje<6!qQ~~|awN5lFHx`!3+|({+&V=- zxPu$x+blB93((dzk{#Z@%vFUEup*JA+gpD&XD?YD8tTMdK9AJbnJERoDQNsN`FGLrKh_A1_ z%6+r(B%g4~iSzPc_(1&wWj8U_{6qe5sHGSYrbDIfz|VjVy{T4~>2fG4%A{yciAU-t z?j;CNz26*+{Wg1pwgge1MQ!rmaI>Ozm+WW<_lY+}u(fL+9i7_* z<_50Q3+=GygudQ~)p>nQ5k{^p3|1RwU($n|{k)BcV;JK;0^9G_p>&Mk7W=(>G3 zbfezwXh$lpw@M}GbuLc6JTEy(;Dozk?{APHN}@1~ccjt{{(^O*J-$UsalB28k6bRi z9T|6nO;mi0tCAh5_|Deq>DulVx^dS8&@a%xQfzDD9D<+W4k2j~Elo>y#B~F5QwoY; zi{5F7glHmQi3rTT78QNG5(P5$x|sMKH^?iMx7qR0fL+YT9tSac_)Inp;4qd{r&7Ll zoy~G}SFF(u`<-}#Hpr&_2-CwI&EWWJ)OI9t2x{8#d0|39d)7vNQqD33>EuYIDOLHV zVqxfgzf_YP0IIl?_)j~(5DNLAQuq0CYAM%OH_Iu|-tZnZKRZq9efG8KhA*FX%U84U z*cQIgTSIGOuf`5VbuUD|Xavy$)tVNL=l%G}vLD}qizi{n?RAC@qNU~Ze19*B_A%J^ zd~4@=$orcG9+eX|`~K|gY$FXP+LR?jR9<1<>)GMX;mBD>>+tAmMk@$B%E+D**ay4P z^4-4@l=enj#B8^_M%PUUVGzdZh9RA!Q!dll81^j3`7r0{=NcixKb6=zi#qc$?NFEj z)1jnh7||++d9zF+J_$^@nII0-v3oMXmVw_!#)twbnG*6znTrY0%q3E(#nY ziPK`uhA*BUt?|6e#qi5eEif1d00}yqa)BQ9v&0;&)$#kvyzB9pq~|YttL5zDL2!Dd zQgB`8^urB1$a8djdddzm_VpGrhHC^e9hyj$EN!WHcRy3b}X`~84GtU zuDtF};pbbUpYPAU-Au??gJme1Y23GGXKiRJw~)#)zRu>;`K33`P7>v}&5qzbb9VHZ z>%1Maa9vT(xW~Pg@o{uV>8RpK~|Dt0R-5(|8c#&5;t z`zzywZ)!PC#lwozhimk4U8g)ZLgKtnw-9t4l8dwRAqEq0hXFIsGAmLu3vZN4Cwgjf zIt7>2YmM1SHa7~TX)L$Ml-|1Ydqy=$W5$)4>^aZ**lTp$9i*?Ja%`QAUQz`B{&veyV-iXrh z{mF&tx(n5*(liOD&k2q;r|T$Or8J8rjtR;dTq#?qFk^CCqp1X$r<5O>;76Fidf}Q0 zendCIUU=pvzzl6ZpnPVP@m-=?0&-u;m;=}BYJ9+62b~z3% zAK#_(<#f%G3AQ|UKr-smZZKQ&n~ivA?DfIh52BYEJrYwfltugyA7{hi1<@fg7U%$$ zp8Hnl``@=3S#o1O<1tg7)_7V0pV_v#cYfbq04?z>flOJTRW4(g&gk(SJ`e*dZrsMu wXDR__4sqJ#q&TjN+3}izwJ?@{|M5q!{{Q{|`~Ua<@4xo@6FdYKAOI);0OQZ*ng9R* literal 0 HcmV?d00001 diff --git a/test/crossref_processing/tar_gz_wrong_cited_doi/1.tar.gz b/test/crossref_processing/tar_gz_wrong_cited_doi/1.tar.gz new file mode 100644 index 0000000000000000000000000000000000000000..2551758b3bdfc91feea4d82a02c6eb248ffedb7a GIT binary patch literal 2995 zcmV;k3rzGMiwFp`IXPtn05L9fVR8WNSzB-8L=@guxFFSJtZ_?9>h2&q@LK)a=B zxv$XFo+PuiJ!WjD6tw@H?~HS4+H4bu;02;ewLNFfea>xM`(3H>SO4hG!7mI${5QWR z|2-#gdavA|>-Ic92z(#kL&xpDV$Q37;a8Ux*NnaTkN*B>$x38pYw#1E@CV`6!>DXRx| zsYO|`y`!7UeV~_RSrc^^<>UJAqi&DGcVI;pCn9CSMO_xE-@M`kNM?!-!qP9=0! 
zVk4QoS|_UOL)89LN$ERD>h!N2XJG!GO6vEzei(Viso!I_Ncn>$b1UHwKppm?b}LqS zCGyHLa;Y46>%1&PEJqT0fB$v~k*JKzocNnR{Y0x;m1V~Douxrm%eXRFMm#MANR|_& zD~nKIhtS}?N=Y@XV_EVNlCw-CHZ$su*+lTPny^BP!fw4?S|PCtE%Wudaohgex1hvt zL7v9|YJK*OSLMKv`uyNd$`21uv5@2{Ln8<`4!wDW3#3&T($9Sgl zF`KBekQJn1JWrUa;1Y8-W3B~wRa2Qjd8{O_Wh1RJmepw`i&U@?7*beEL?QA-0-wqm zaIlZ1pD~%MR5c!EYF(;usiLJl8>T=_+?5S7 zm~{pLf(!kDDXr@91j?WC98~yN3z0Q`loO3uB(-ilEUT-U)FYEd7R2!sr`gcb`HB>y z-| ze<&yyyn`*ENo(d1Scq|&tM!@Fb=qOubJpY&WX{m?oNFf-`297R(+i^>N+CaL@=3wR zw9&8)$i^WkVUh#D940YZi~O2cw-56i8v^k>6`VHFi&tcX*YogpGPl@@<_7F6ue3_) zcz*6VJKyB-CY68<`P*EXZV28Gyz0ydfEQj@NL40jm-VOt{ZeCLED`MJg+mYQ3mczC zJd^1IzP(mRyABL*0DW=e^#obAe6sZ`smpls^2RUt zt-Qv`%NxG}!X zBICROZCxYjv9I~PNdX5bVm6f$Q-pW`&%)^f_(I&XAC$fuuv3Ju#E8*!0H0C~vTH@UtYep!E2AAU(X}& zOq8@Zvc9fNXa?}2>)A}feU_cx@r>JE6xpt4yFuvgNWtx`P;kSrgH_+}29eWuJpco8 zM;cyljaiOOcf$_70l6n^oQYZSc_m_%;ceysBZS!@r5KJ9?)P~ix=2KPedSf|n~f*= zgi}tOmj}ZK>K`b(iLvG%@`pn$#fUH+Ds=~b26X66wX#f?Ls3yCMRQ6#Qa5ogL4fN0 z=3wl%*&DPai25vQlmCXB71doecR%XfmIe7!--aP}G=uxZnIf3lwU3U@Z31(&-jyyZzGyNQ&U)s(sQnB#ez8ktx?{>5! z71vv(((QFFPQE-ZIZ5DzyJGKekReKhW!N3*ESD;^!A=Un#UTVGgm+aEFMr2$rTJJL0+lxhVw&utn=ML_RbD zutWf6UyF)9UWoz;dtFHUjvM5a%G>PtXuvM!Q;&lfEqo@M{%;sdDpM(4yUu30{1t2X zVZRek&;{Al9$|R6qZyoFjoOYx4na*fKCeqCX3x6FPs&-MAe|hkG^HZn6f6v_@0V(l z13(pb68~xE7eXN&RO&uo4lU*S;$}Gn+8f@ZBS+?U_aPcJUxV_HMLA11-UJ&d>(SGQ7USRE94tamG zzN2cwX5XKkoo%GyM4Pf?h{7xEdp$ecIUG6bXdND1&1iMI9%W?D>DmXo(hA(a5|s8v zTf}VFU!&_LgfIwW`C&-s=#`ng7k2u>xo&Z5qIN;?#0zH}(57e=(o zVcsm0h))7TEufeR7<-IvhQMcMXHhCXK0XHib*(iGHU;~NR~q!Tkc$GxNaD0uv)PLm zL~A_naxwcdR13_-0YHMzhFqYB{VXv@Yjyg*GVXdjCFupr)@nKWco3XksS;e*IQ?+L z4)PogpPsUVjD5X@jNuxAOoJwpXBos~j&pb(UdkHmKd=)sn;yVS3VXEDvdoMY-q5Ib zbttzQOP;e&!0bZV%;{Q)+P=-MRD3s;7>Bpn`LSvKx|=2Xa5om&)Qg3?7FV9XQ~3GT z=;!;hZ#NTi)?gWmW*GPF*;yOf$}OaFOs})~aDM5Hvy(*mZL=eI&zv1?<~nc3C|p;R zGwyM%WqcgX5&1Q*=0|J5CUwU1X5K4dK&It*Qk6Ikql(>%iNpdQqwiaB`Toi@;hS2H zQ}M9k^x+zPT-PbjjgUC+(=7yDhveeye2BRO++o1Xv&@RrjKUkG(utm$oKC@I^;Tnc zlFfxeY5K}7GNrff{GL%w(wK2&Bzw+tJ~rG?k1Q#B$Ks{DZqKh0pWg#6C+0H78U%Q0 zbWXEe1>?yDo6S8WraX7sU%Ae2h&o&q)?(c2rdk&FMA@H!00xiDQDY1~;!a+=f?ZC7%g0ygd^udR zWP&Zv9gvK=v>VKp{AME_8hd^4@`LE5Mvuf)3}q2N#K+lictLcCj0HM?rRTmC2Eq5O zMwZ-|&v?v~r!}5dz-P8?uASev7eGrqOCVDgXqC$trZakchY!TSiW|2v^O;J(nM0g5 pIVq0oVs^Y{U@c7L-+%nktN(xh|Nj5||NGDV{sdA!@7e$;004ni=;;6e literal 0 HcmV?d00001