-
Notifications
You must be signed in to change notification settings - Fork 0
/
spacy_12_efficientPhraseMatching_3.py
63 lines (47 loc) · 1.75 KB
/
spacy_12_efficientPhraseMatching_3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# -*- coding: utf-8 -*-
"""
Created on Tue May 28 2019
@author: Stacy Bridges
This script does a few different things:
    - it runs a country PhraseMatcher over a long text
    - it looks at the syntax (the head of each matched span's root token)
    - it updates doc.ents with the matched countries
"""
import spacy
import json
from spacy.lang.en import English
from spacy.matcher import PhraseMatcher
from spacy.tokens import Span
def main():
    """Match country names in a text and register them as GPE entities.

    Reads a JSON list of country names from ``countries.json`` and a text
    from ``country_text.txt`` (both relative to the current working
    directory), builds a PhraseMatcher from the country names, prints the
    head of each match's root token next to the matched text, and finally
    prints every entity of the document labelled ``GPE``.

    Raises:
        OSError: if either input file cannot be opened.
        json.JSONDecodeError: if ``countries.json`` is not valid JSON.
    """
    # Read both inputs as UTF-8 explicitly: country names contain non-ASCII
    # characters and the platform default encoding is not always UTF-8.
    with open('countries.json', encoding='utf-8') as f:
        countries = json.load(f)
    with open('country_text.txt', encoding='utf-8') as f:
        text = f.read()

    nlp = English()
    matcher = PhraseMatcher(nlp.vocab)
    patterns = list(nlp.pipe(countries))
    # NOTE(review): this is the spaCy v2 call signature; spaCy v3 expects
    # matcher.add('COUNTRY', patterns) -- confirm the installed version.
    matcher.add('COUNTRY', None, *patterns)

    # Create a doc and find matches in it
    doc = nlp(text)

    # test print of ents
    print('test print of ents: -----------------------------')
    print([(ent.text, ent.label_) for ent in doc.ents])
    print('\n')

    # Iterate over the matches
    print('iterate over the matches: -----------------------')
    matched_spans = []
    for match_id, start, end in matcher(doc):
        # Create a Span with the label for 'GPE'
        span = Span(doc, start, end, label='GPE')
        matched_spans.append(span)
        # Print the text of the span root's head token and the span text.
        # NOTE(review): English() is created without a parser here, so the
        # head presumably falls back to the root token itself -- confirm.
        print(span.root.head.text, '-->', span.text)

    # print spacer
    print('\n')

    # Overwrite doc.ents once, after the loop. A token may belong to only
    # one entity, so assigning overlapping spans (e.g. "Guinea" inside
    # "Papua New Guinea") raises ValueError. Keep existing entities, then
    # add matches longest-first, skipping any that reuse a taken token.
    existing_ents = list(doc.ents)
    taken_tokens = set()
    for ent in existing_ents:
        taken_tokens.update(range(ent.start, ent.end))

    accepted = []
    longest_first = sorted(
        matched_spans,
        key=lambda s: (-(s.end - s.start), s.start),
    )
    for span in longest_first:
        span_tokens = range(span.start, span.end)
        if taken_tokens.isdisjoint(span_tokens):
            taken_tokens.update(span_tokens)
            accepted.append(span)

    doc.ents = sorted(existing_ents + accepted, key=lambda s: s.start)

    # Print the entities in the document
    print('fin print of ents: -----------------------------')
    print([(ent.text, ent.label_) for ent in doc.ents if ent.label_ == 'GPE'])
# Run the demo only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()