-
Notifications
You must be signed in to change notification settings - Fork 21
/
tokenize-pdfs.py
63 lines (52 loc) · 1.35 KB
/
tokenize-pdfs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# This version outputs a CSV in normal form, one row per token:
# slug, page, x0, y0, x1, y1, token
import pandas as pd
import pdfplumber
import csv
import json
# Run-wide tallies, reported by print_stats().
#   nopened / nopenerror  -- PDFs that pdfplumber.open() did / did not open
#   nparsed / nparseerror -- successful / failed word extractions
nopened = nparsed = nopenerror = nparseerror = 0
def print_stats():
    """Print the current open/parse tallies between two '-----' rules.

    Only reads the module-level counters (nopened, nopenerror, nparsed,
    nparseerror); it never modifies them.
    """
    rule = '-----'
    opened_line = f'Found {nopened} files, could not open {nopenerror}'
    parsed_line = f'Parsed {nparsed}, could not parse {nparseerror}'
    # One call, newline-separated: same four output lines as four prints.
    print(rule, opened_line, parsed_line, rule, sep='\n')
# Main script: for every filing listed in the TSV, open pdfs/<slug>.pdf and
# write one CSV row per extracted word: slug, page, x0, y0, x1, y1, token.
d = pd.read_csv('source/ftf-all-filings.tsv', sep='\t')

# newline='' is what the csv module requires of files it writes to (otherwise
# rows end in '\r\r\n' on Windows). UTF-8 keeps the output independent of the
# platform locale, since tokens are arbitrary PDF text. The with-block makes
# sure buffered rows are flushed and the file closed even if the run aborts.
with open('data/filings-tokens.csv', mode='w', newline='', encoding='utf-8') as f:
    # Named `writer` (not `csv`) so the imported csv module is not shadowed.
    writer = csv.writer(f)
    writer.writerow(['slug', 'page', 'x0', 'y0', 'x1', 'y1', 'token'])

    for index, row in d.iterrows():
        slug = row['dc_slug']
        fname = 'pdfs/' + slug + '.pdf'
        print('Extracting ' + fname)

        try:
            pdf = pdfplumber.open(fname)
        except Exception as e:
            # Best effort: report the failure, count it, move to the next filing.
            print(e)
            nopenerror += 1
            continue
        nopened += 1

        try:
            # `with pdf` closes the PDF when extraction finishes or fails, so
            # a long run does not accumulate one open handle per filing.
            with pdf:
                for page_number, page in enumerate(pdf.pages):
                    for w in page.extract_words():
                        text = w['text']
                        # Some tokens have nulls in them, which are not valid in a csv.
                        if '\0' in text:
                            continue
                        writer.writerow([slug,
                                         page_number,
                                         float(w['x0']),
                                         float(w['top']),
                                         float(w['x1']),
                                         float(w['bottom']),
                                         text])
            # NOTE(review): counted once per PDF, to pair with nparseerror
            # (also once per PDF) -- confirm this matches the intended tally.
            nparsed += 1
        except Exception as e:
            # Rows already written for this PDF stay in the output; the file is
            # only counted as a parse error.
            print(e)
            nparseerror += 1

        # Periodic progress report, keyed on the DataFrame index label.
        if (index % 100) == 0:
            print_stats()

print('-----')
print("Done!")
print(f'{len(d)} rows total')
print_stats()