From aa4bff8b8552d436dc6a643b1469a0376833f83c Mon Sep 17 00:00:00 2001
From: Melissa Clegg <cleggm1@fnal.gov>
Date: Thu, 11 Jul 2019 14:55:54 -0500
Subject: [PATCH] RefExtract: Update eprint extraction in engine and regex

Signed-off-by: Melissa Clegg <cleggm1@fnal.gov>
---
 refextract/references/engine.py | 10 ++++++----
 refextract/references/regexs.py | 24 +++++++++++++++---------
 tests/test_engine.py            | 33 +++++++++++++++++++++++++++++++++
 tests/test_tag.py               | 24 ++++++++++++++++++++++++
 4 files changed, 78 insertions(+), 13 deletions(-)
diff --git a/refextract/references/engine.py b/refextract/references/engine.py
index c6b0743..57e46c6 100644
--- a/refextract/references/engine.py
+++ b/refextract/references/engine.py
@@ -586,11 +586,13 @@ def add_recid_elements(splitted_citations):
 
 
 def arxiv_urls_to_report_numbers(citation_elements):
-    arxiv_url_prefix = 'http://arxiv.org/abs/'
+    arxiv_url_prefix = re.compile(r'^https?://(?:(?:cn\.|de\.|in\.|lanl\.|www\.)?arxiv\.org|xxx\.lanl\.gov)/(?:abs|pdf)/(\S+\d{4})(?:v\d)?(?:\.pdf)?', re.UNICODE | re.IGNORECASE)
     for el in citation_elements:
-        if el['type'] == 'URL' and el['url_string'].startswith(arxiv_url_prefix):
-            el['type'] = 'REPORTNUMBER'
-            el['report_num'] = el['url_string'].replace(arxiv_url_prefix, 'arXiv:')
+        if el['type'] == 'URL' and el['url_string']:  # skip empty/None URLs
+            matchobj = arxiv_url_prefix.match(el['url_string'])  # abs/pdf links on arxiv.org, mirrors, xxx.lanl.gov
+            if matchobj:
+                el['type'] = 'REPORTNUMBER'  # retag the URL element in place
+                el['report_num'] = matchobj.group(1)  # id part only; version/.pdf stay outside the group
 
 
 def look_for_hdl(citation_elements):
diff --git a/refextract/references/regexs.py b/refextract/references/regexs.py
index a48d830..7acaa0c 100644
--- a/refextract/references/regexs.py
+++ b/refextract/references/regexs.py
@@ -72,20 +72,26 @@ def compute_pos_patterns(patterns):
 
 # Pattern for arxiv numbers
 # arxiv 9910-1234v9 [physics.ins-det]
-re_arxiv = re.compile(ur"""
-    ARXIV[\s:-]*(?P<year>\d{2})-?(?P<month>\d{2})
+re_arxiv = re.compile(ur"""(?:(?:https?://(?:www\.|cn\.|de\.|in\.|lanl\.)?arxiv\.org/(?:abs|pdf)/)|
+    (?:https?://(?:xxx\.)?lanl\.gov/(?:abs|pdf)/)|
+    (?:ARXIV[\s:-]*))(?P<year>\d{2})-?(?P<month>\d{2})
     [\s.-]*(?P<num>\d{4})(?!\d)(?:[\s-]*V(?P<version>\d))?
-    \s*(?P<suffix>\[[A-Z.-]+\])? """, re.VERBOSE | re.UNICODE | re.IGNORECASE)
+    \s*(?P<suffix>\[[A-Z.-]+\])?
+    (?:\.pdf)? """, re.VERBOSE | re.UNICODE | re.IGNORECASE)  # id may follow "arXiv" or an abs/pdf URL; trailing ".pdf" is consumed
 
-re_arxiv_5digits = re.compile(ur"""
-    ARXIV[\s:-]*(?P<year>(1[3-9]|[2-8][0-9]))-?(?P<month>(0[1-9]|1[0-2]))
-    [\s.-]*(?P<num>\d{5})(?!\d)(?:[\s-]*V(?P<version>\d))?
-    \s*(?P<suffix>\[[A-Z.-]+\])? """, re.VERBOSE | re.UNICODE | re.IGNORECASE)
+re_arxiv_5digits = re.compile(ur"""(?:(?:https?://(?:www\.|cn\.|de\.|in\.|lanl\.)?arxiv\.org/(?:abs|pdf)/)|
+    (?:https?://(?:xxx\.)?lanl\.gov/(?:abs|pdf)/)|
+    (?:ARXIV[\s:-]*))(?P<year>(1[3-9]|[2-8][0-9]))-?(?P<month>(0[1-9]|1[0-2]))
+    [\s.-]*(?P<num>\d{5})(?!\d)
+    (?:[\s-]*V(?P<version>\d))?
+    \s*(?P<suffix>\[[A-Z.-]+\])?
+    (?:\.pdf)? """, re.VERBOSE | re.UNICODE | re.IGNORECASE)  # 5-digit ids (year 13+), after "arXiv" or an abs/pdf URL
 
 # Pattern for arxiv numbers catchup
 # arxiv:9910-123 [physics.ins-det]
-RE_ARXIV_CATCHUP = re.compile(ur"""
-    ARXIV[\s:-]*(?P<year>\d{2})-?(?P<month>\d{2})
+RE_ARXIV_CATCHUP = re.compile(ur"""(?:(?:https?://(?:www\.|cn\.|de\.|in\.|lanl\.)?arxiv\.org/(?:abs|pdf)/)|
+    (?:https?://(?:xxx\.)?lanl\.gov/(?:abs|pdf)/)|
+    (?:ARXIV[\s:-]*))(?P<year>\d{2})-?(?P<month>\d{2})
     [\s.-]*(?P<num>\d{3})
     \s*\[(?P<suffix>[A-Z.-]+)\]""", re.VERBOSE | re.UNICODE | re.IGNORECASE)
 
diff --git a/tests/test_engine.py b/tests/test_engine.py
index e37b5b3..d7d34fe 100644
--- a/tests/test_engine.py
+++ b/tests/test_engine.py
@@ -266,12 +266,45 @@ def test_doi_subdivisions():
     assert references[0]['linemarker'] == [u'10']
 
 
+def test_old_arxiv():
+    """An old-style id given after an 'arXiv:' prefix is extracted as the report number."""
+    line = u'[20]  B. Moore, T. R. Quinn, F. Governato, J. Stadel, and G. Lake, "Cold collapse and the corecatastrophe," Mon. Not. Roy. Astron. Soc.310(1999) 1147–1152, arXiv:astro-ph/9903164 [astro-ph].'
+    refs = get_references(line)[0]
+    assert refs[0]['reportnumber'] == [u'astro-ph/9903164']
+    assert refs[0]['linemarker'] == [u'20']
+
+
+def test_old_lanl_url_version():
+    """An xxx.lanl.gov pdf URL with a version suffix yields the old-style id without the version."""
+    line = u'[44]  Navarro, J.F., Frenk, C.S., White, S.D.M. http://xxx.lanl.gov/pdf/astro-ph/9508025v1'
+    refs = get_references(line)[0]
+    assert refs[0]['reportnumber'] == [u'astro-ph/9508025']
+    assert refs[0]['linemarker'] == [u'44']
+
+
+def test_old_arxiv_url():
+    """An https arxiv.org abs URL carrying an old-style id yields that id as the report number."""
+    line = u'[298] 	V. Allori, D. Duerr, S. Goldstein, and N. Zanghi. 2002. Seven steps towards the classical world. Journal of Optics B : Quantum and semiclassical Optics, Volume 4, number 4. https://arxiv.org/abs/quant-ph/0112005'
+    refs = get_references(line)[0]
+    assert refs[0]['reportnumber'] == [u'quant-ph/0112005']
+    assert refs[0]['linemarker'] == [u'298']
+
+
+def test_old_arxiv_mirror_url():
+    """A cn.arxiv.org mirror abs URL carrying an old-style id yields that id as the report number."""
+    line = u'[13] A. Zupanc, et al, Belle Collaboration, https://cn.arxiv.org/abs/hep-ex/0703040 2007'
+    refs = get_references(line)[0]
+    assert refs[0]['reportnumber'] == [u'hep-ex/0703040']
+    assert refs[0]['linemarker'] == [u'13']
+
+
 def test_get_plaintext_document_body(tmpdir):
     input = [u"Some text\n", u"on multiple lines\n"]
     f = tmpdir.join("plain.txt")
     f.write("".join(input))
     assert input == get_plaintext_document_body(str(f))
 
+
     with pytest.raises(UnknownDocumentTypeError) as excinfo:
         html = "<html><body>Some page</body></html>"
         f = tmpdir.join("page.html")
diff --git a/tests/test_tag.py b/tests/test_tag.py
index 76e417c..41006dc 100644
--- a/tests/test_tag.py
+++ b/tests/test_tag.py
@@ -175,3 +175,27 @@ def test_5_digits_suffix_version_new_2012():
     ref_line = u"""{any prefix}1210.12345v9 [physics.ins-det]{any postfix}"""
     r = tag_arxiv(ref_line)
     assert r.strip(': ') == u"{any prefix}1210.12345v9 [physics.ins-det]{any postfix}"
+
+
+def test_4_digits_new_url():
+    """A 4-digit new-style id inside an https abs URL is tagged as an arXiv report number."""
+    line = u"""{any prefix}https://arxiv.org/abs/1311.2198{any postfix}"""
+    assert tag_arxiv(line).strip(': ') == u"{any prefix}<cds.REPORTNUMBER>arXiv:1311.2198</cds.REPORTNUMBER>{any postfix}"
+
+
+def test_5_digits_new_url():
+    """A 5-digit new-style id inside an https abs URL is tagged as an arXiv report number."""
+    line = u"""{any prefix}https://arxiv.org/abs/1602.03988{any postfix}"""
+    assert tag_arxiv(line).strip(': ') == u"{any prefix}<cds.REPORTNUMBER>arXiv:1602.03988</cds.REPORTNUMBER>{any postfix}"
+
+
+def test_4_digits_version_new_url():
+    """A versioned 4-digit id inside an https abs URL is tagged with the version dropped."""
+    line = u"""{any prefix}https://arxiv.org/abs/0708.0882v1{any postfix}"""
+    assert tag_arxiv(line).strip(': ') == u"{any prefix}<cds.REPORTNUMBER>arXiv:0708.0882</cds.REPORTNUMBER>{any postfix}"
+
+
+def test_5_digits_new_pdf_url():
+    """A 5-digit id inside an https pdf URL is tagged with the trailing '.pdf' dropped."""
+    line = u"""{any prefix}https://arxiv.org/pdf/1712.03976.pdf{any postfix}"""
+    assert tag_arxiv(line).strip(': ') == u"{any prefix}<cds.REPORTNUMBER>arXiv:1712.03976</cds.REPORTNUMBER>{any postfix}"