-
Notifications
You must be signed in to change notification settings - Fork 5
/
bsbang-extract.py
executable file
·68 lines (49 loc) · 1.81 KB
/
bsbang-extract.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
#!/usr/bin/env python3
import argparse
import contextlib
import json
import logging
import os
import sqlite3
import sys

import bioschemas
import bsbang
# Module-wide logging: configure the root logger at INFO once, then use a
# per-module named logger for all output below.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
# FUNCTIONS
def insert_into_db(_conn, _url, jsonlds):
    """Persist extracted JSON-LD documents for a URL and dequeue it.

    Serialises each document in *jsonlds*, inserts one ``jsonld`` row per
    document, removes *_url* from ``extract_queue``, and commits — so a URL
    is only dequeued together with its stored results.

    :param _conn: open sqlite3 connection with ``jsonld`` and
        ``extract_queue`` tables
    :param _url: URL the documents were extracted from
    :param jsonlds: [<jsonld>] — list of JSON-serialisable documents
    :return: None
    """
    rows = [(_url, json.dumps(doc)) for doc in jsonlds]
    with contextlib.closing(_conn.cursor()) as cursor:
        cursor.executemany('INSERT INTO jsonld (url, jsonld) VALUES (?, ?)', rows)
        cursor.execute('DELETE FROM extract_queue WHERE url=?', (_url,))
    _conn.commit()
# MAIN
parser = argparse.ArgumentParser('Extract Bioschemas JSONLD from URLs on the crawl DB extract queue.')
parser.add_argument('path_to_crawl_db', help='Path to the database used to store crawl information.')
args = parser.parse_args()

# Fail fast with a clear message instead of letting sqlite3 silently create
# an empty database file at a mistyped path.
if not os.path.exists(args.path_to_crawl_db):
    logger.error('Crawl database %s does not exist', args.path_to_crawl_db)
    sys.exit(1)

config = bioschemas.DEFAULT_CONFIG.copy()

with sqlite3.connect(args.path_to_crawl_db) as conn:
    # Wait up to 30s on a locked DB so a concurrently-running crawler does
    # not make us abort with "database is locked".
    conn.execute("PRAGMA busy_timeout = 30000")
    conn.row_factory = sqlite3.Row

    with contextlib.closing(conn.cursor()) as curs:
        # Materialise the queue before processing: insert_into_db deletes
        # rows from extract_queue on the same connection, and deleting from
        # a table while a cursor is still iterating a SELECT over it is
        # unreliable in SQLite.
        curs.execute('SELECT url FROM extract_queue')
        queued_urls = [str(row['url']).strip() for row in curs.fetchall()]

    count = len(queued_urls)
    for i, url in enumerate(queued_urls, start=1):
        if url:
            logger.info('Processing %s (%d of %d)', url, i, count)
            insert_into_db(conn, url, bsbang.load_bioschemas_jsonld_from_html(url, config))
        else:
            logger.warning('Skipping (%d of %d) as entry is blank', i, count)