This repository has been archived by the owner on Jul 1, 2022. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 2
/
doi-crossref-pdf.py
58 lines (55 loc) · 1.76 KB
/
doi-crossref-pdf.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
#!/usr/bin/python
# -*- coding: utf-8 -*-
""" Bot to download a list of DOIs from the URLs of the TDM API. """
#
# (C) Federico Leva, 2018
#
# Distributed under the terms of the MIT license.
#
__version__ = '0.1.0'
import requests
from requests.adapters import HTTPAdapter
from requests.packages.urllib3.util.retry import Retry
import requests.exceptions
try:
from urllib import quote_plus
except:
from urllib.parse import quote_plus
from time import sleep
# Read the whole DOI list up front so that the input file is closed before
# the (potentially long) download loop starts.
with open('dois.txt', 'rb') as dois:
    doi_lines = dois.readlines()

# One shared session; both URL schemes retry up to 5 times, with a backoff
# between attempts, when the server answers 502, 503 or 504.
s = requests.Session()
retries = Retry(total=5, backoff_factor=1, status_forcelist=[502, 503, 504])
s.mount('http://', HTTPAdapter(max_retries=retries))
s.mount('https://', HTTPAdapter(max_retries=retries))
# Browser-like User-Agent, sent only with the PDF request (not with the
# Crossref API request).
headers = {'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64; rv:56.0) Gecko/20100101 Firefox/56.0'}

for doi in doi_lines:
    doi = doi.strip()
    # The file is read as bytes. On Python 3 the DOI must be decoded, or it
    # would be formatted as "b'...'" in the API URL and in the log output.
    # On Python 2 the line already is a str and is left untouched.
    if not isinstance(doi, str):
        doi = doi.decode('utf-8', 'replace')
    if not doi:
        # Blank line in the input file: nothing to look up.
        continue

    print("Looking up DOI: {}".format(doi))
    # Pause one second before every lookup.
    sleep(1)

    # Fetch the Crossref metadata record of this DOI. Any network-level
    # failure (timeout, exhausted retries, ...) skips the DOI instead of
    # aborting the whole run.
    try:
        response = s.get('https://api.crossref.org/works/{}'.format(doi), timeout=10)
    except requests.exceptions.RequestException as err:
        print("ERROR: {}. Sleeping 10 seconds.".format(type(err).__name__))
        sleep(10)
        continue
    try:
        api = response.json()
    except ValueError:
        # The body is not JSON: skip this DOI.
        continue

    # Only records whose "message" object carries a "link" list are usable.
    message = api.get('message') if isinstance(api, dict) else None
    if not isinstance(message, dict) or 'link' not in message:
        continue

    # Choose the download URL: the last link declared as application/pdf
    # wins; the first "unspecified" link is used only as a fallback.
    url = None
    for link in message['link']:
        content = link.get('content-type')
        if content == 'application/pdf':
            url = link.get('URL')
        if content == 'unspecified' and not url:
            url = link.get('URL')
    if not url:
        continue

    try:
        pdf = s.get(url, headers=headers, timeout=10)
    except requests.exceptions.RequestException as err:
        # Covers ConnectionError as well as read timeouts and exhausted
        # retries, which would otherwise terminate the script.
        print("ERROR: {}. Sleeping 10 seconds.".format(type(err).__name__))
        sleep(10)
        continue

    # Save only successful responses that declare a PDF content type; a
    # missing Content-Type header counts as "not a PDF".
    if pdf.status_code == 200 and 'pdf' in pdf.headers.get('Content-Type', ''):
        print("Saving PDF from {}".format(url))
        # The DOI is percent-encoded so that "/" and other special
        # characters give a valid file name.
        with open('{}.pdf'.format(quote_plus(doi)), 'wb') as out:
            out.write(pdf.content)
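# Optional post-processing of the downloaded PDFs. These are shell commands
# to be run by hand; this script does not execute them.
#
# Remove all metadata in place (append the PDF files to process):
# exiftool -overwrite_original -all=
#
# Linearize every PDF of the current directory into cleanpdf/
# (-maxdepth 1 restricts the search to the current directory itself, without
# descending into cleanpdf/; -maxdepth 0 would only test "." and match nothing):
# find . -maxdepth 1 -name "*pdf" -print0 | xargs -P8 -0 -I§ -n1 qpdf --linearize "§" "cleanpdf/§"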