From aa4bff8b8552d436dc6a643b1469a0376833f83c Mon Sep 17 00:00:00 2001 From: Melissa Clegg Date: Thu, 11 Jul 2019 14:55:54 -0500 Subject: [PATCH] RefExtract: Update eprint extraction in engine and regex Signed-off-by: Melissa Clegg --- refextract/references/engine.py | 10 ++++++---- refextract/references/regexs.py | 24 +++++++++++++++--------- tests/test_engine.py | 33 +++++++++++++++++++++++++++++++++ tests/test_tag.py | 24 ++++++++++++++++++++++++ 4 files changed, 78 insertions(+), 13 deletions(-) diff --git a/refextract/references/engine.py b/refextract/references/engine.py index c6b0743..57e46c6 100644 --- a/refextract/references/engine.py +++ b/refextract/references/engine.py @@ -586,11 +586,13 @@ def add_recid_elements(splitted_citations): def arxiv_urls_to_report_numbers(citation_elements): - arxiv_url_prefix = 'http://arxiv.org/abs/' + arxiv_url_prefix = re.compile('^https?:\/\/(?:(?:cn\.|de\.|in\.|lanl\.)?arxiv\.org|xxx\.lanl\.gov)\/(?:abs|pdf)\/(\S+\d{4})(?:v\d)?(?:\.pdf)?', re.UNICODE | re.IGNORECASE) for el in citation_elements: - if el['type'] == 'URL' and el['url_string'].startswith(arxiv_url_prefix): - el['type'] = 'REPORTNUMBER' - el['report_num'] = el['url_string'].replace(arxiv_url_prefix, 'arXiv:') + if el['type'] == 'URL' and el['url_string']: + matchobj = arxiv_url_prefix.match(el['url_string']) + if matchobj: + el['type'] = 'REPORTNUMBER' + el['report_num'] = matchobj.group(1) def look_for_hdl(citation_elements): diff --git a/refextract/references/regexs.py b/refextract/references/regexs.py index a48d830..7acaa0c 100644 --- a/refextract/references/regexs.py +++ b/refextract/references/regexs.py @@ -72,20 +72,26 @@ def compute_pos_patterns(patterns): # Pattern for arxiv numbers # arxiv 9910-1234v9 [physics.ins-det] -re_arxiv = re.compile(ur""" - ARXIV[\s:-]*(?P\d{2})-?(?P\d{2}) +re_arxiv = re.compile(ur"""(?:(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf)/)| + (?:https?://(?:xxx\.)?lanl\.gov/(?:abs|pdf)/)| + (?:ARXIV[\s:-]*))(?P\d{2})-?(?P\d{2}) [\s.-]*(?P\d{4})(?!\d)(?:[\s-]*V(?P\d))? - \s*(?P\[[A-Z.-]+\])? """, re.VERBOSE | re.UNICODE | re.IGNORECASE) + \s*(?P\[[A-Z.-]+\])? + (?:\.pdf)? """, re.VERBOSE | re.UNICODE | re.IGNORECASE) -re_arxiv_5digits = re.compile(ur""" - ARXIV[\s:-]*(?P(1[3-9]|[2-8][0-9]))-?(?P(0[1-9]|1[0-2])) - [\s.-]*(?P\d{5})(?!\d)(?:[\s-]*V(?P\d))? - \s*(?P\[[A-Z.-]+\])? """, re.VERBOSE | re.UNICODE | re.IGNORECASE) +re_arxiv_5digits = re.compile(ur"""(?:(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf)/)| + (?:https?://(?:xxx\.)?lanl\.gov/(?:abs|pdf)/)| + (?:ARXIV[\s:-]*))(?P(1[3-9]|[2-8][0-9]))-?(?P(0[1-9]|1[0-2])) + [\s.-]*(?P\d{5})(?!\d) + (?:[\s-]*V(?P\d))? + \s*(?P\[[A-Z.-]+\])? + (?:\.pdf)? """, re.VERBOSE | re.UNICODE | re.IGNORECASE) # Pattern for arxiv numbers catchup # arxiv:9910-123 [physics.ins-det] -RE_ARXIV_CATCHUP = re.compile(ur""" - ARXIV[\s:-]*(?P\d{2})-?(?P\d{2}) +RE_ARXIV_CATCHUP = re.compile(ur"""(?:(?:https?://(?:www\.)?arxiv\.org/(?:abs|pdf)/)| + (?:https?://(?:xxx\.)?lanl\.gov/(?:abs|pdf)/)| + (?:ARXIV[\s:-]*))(?P\d{2})-?(?P\d{2}) [\s.-]*(?P\d{3}) \s*\[(?P[A-Z.-]+)\]""", re.VERBOSE | re.UNICODE | re.IGNORECASE) diff --git a/tests/test_engine.py b/tests/test_engine.py index e37b5b3..d7d34fe 100644 --- a/tests/test_engine.py +++ b/tests/test_engine.py @@ -266,12 +266,45 @@ def test_doi_subdivisions(): assert references[0]['linemarker'] == [u'10'] +def test_old_arxiv(): + ref_line = u'[20] B. Moore, T. R. Quinn, F. Governato, J. Stadel, and G. Lake, "Cold collapse and the corecatastrophe," Mon. Not. Roy. Astron. Soc.310(1999) 1147–1152, arXiv:astro-ph/9903164 [astro-ph].' + res = get_references(ref_line) + references = res[0] + assert references[0]['reportnumber'] == [u'astro-ph/9903164'] + assert references[0]['linemarker'] == [u'20'] + + +def test_old_lanl_url_version(): + ref_line = u'[44] Navarro, J.F., Frenk, C.S., White, S.D.M. http://xxx.lanl.gov/pdf/astro-ph/9508025v1' + res = get_references(ref_line) + references = res[0] + assert references[0]['reportnumber'] == [u'astro-ph/9508025'] + assert references[0]['linemarker'] == [u'44'] + + +def test_old_arxiv_url(): + ref_line = u'[298] V. Allori, D. Duerr, S. Goldstein, and N. Zanghi. 2002. Seven steps towards the classical world. Journal of Optics B : Quantum and semiclassical Optics, Volume 4, number 4. https://arxiv.org/abs/quant-ph/0112005' + res = get_references(ref_line) + references = res[0] + assert references[0]['reportnumber'] == [u'quant-ph/0112005'] + assert references[0]['linemarker'] == [u'298'] + + +def test_old_arxiv_mirror_url(): + ref_line = u'[13] A. Zupanc, et al, Belle Collaboration, https://cn.arxiv.org/abs/hep-ex/0703040 2007' + res = get_references(ref_line) + references = res[0] + assert references[0]['reportnumber'] == [u'hep-ex/0703040'] + assert references[0]['linemarker'] == [u'13'] + + def test_get_plaintext_document_body(tmpdir): input = [u"Some text\n", u"on multiple lines\n"] f = tmpdir.join("plain.txt") f.write("".join(input)) assert input == get_plaintext_document_body(str(f)) + with pytest.raises(UnknownDocumentTypeError) as excinfo: html = "Some page" f = tmpdir.join("page.html") diff --git a/tests/test_tag.py b/tests/test_tag.py index 76e417c..41006dc 100644 --- a/tests/test_tag.py +++ b/tests/test_tag.py @@ -175,3 +175,27 @@ def test_5_digits_suffix_version_new_2012(): ref_line = u"""{any prefix}1210.12345v9 [physics.ins-det]{any postfix}""" r = tag_arxiv(ref_line) assert r.strip(': ') == u"{any prefix}1210.12345v9 [physics.ins-det]{any postfix}" + + +def test_4_digits_new_url(): + ref_line = u"""{any prefix}https://arxiv.org/abs/1311.2198{any postfix}""" + r = tag_arxiv(ref_line) + assert r.strip(': ') == u"{any prefix}arXiv:1311.2198{any postfix}" + + +def test_5_digits_new_url(): + ref_line = u"""{any prefix}https://arxiv.org/abs/1602.03988{any postfix}""" + r = tag_arxiv(ref_line) + assert r.strip(': ') == u"{any prefix}arXiv:1602.03988{any postfix}" + + +def test_4_digits_version_new_url(): + ref_line = u"""{any prefix}https://arxiv.org/abs/0708.0882v1{any postfix}""" + r = tag_arxiv(ref_line) + assert r.strip(': ') == u"{any prefix}arXiv:0708.0882{any postfix}" + + +def test_5_digits_new_pdf_url(): + ref_line = u"""{any prefix}https://arxiv.org/pdf/1712.03976.pdf{any postfix}""" + r = tag_arxiv(ref_line) + assert r.strip(': ') == u"{any prefix}arXiv:1712.03976{any postfix}"