-
Notifications
You must be signed in to change notification settings - Fork 0
/
spacy_ner.py
109 lines (97 loc) · 4.37 KB
/
spacy_ner.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
#! /usr/bin/env python
#
# spaccy_ner.py: Runs spaCy named entity rercognition (NER) over text
#
# Notes:
# - Based on following:
# https://towardsdatascience.com/named-entity-recognition-with-nltk-and-spacy-8c4a7d88e7da
# https://towardsdatascience.com/a-little-spacy-food-for-thought-easy-to-use-nlp-framework-97cbcc81f977
# - TODO: Mention limitations (e.g., prone to over-generalization).
#
# TODO:
# - Rework in terms of spacy_nlp.py (e.g., via subclassing).
#
"""Performs named entity recognition (NER) via spaCy"""
import spacy
from main import Main
import debug
import system
# Constants (e.g., for arguments)
#
LANG_MODEL = "lang-model"
## TODO: XYZ = "xyz"
# Main class
#
class Script(Main):
"""Input processing class: runs text through spaCy NLP pipeline"""
nlp = None
type_prefix = ":"
entity_delim = ", "
entity_quote = '"'
spacy_model = "en_core_web_lg"
# TODO: add class constructor
## def __init__(self, *args, **kwargs):
## debug.trace_fmtd(5, "Script.__init__({a}): keywords={kw}; self={s}",
## a=",".join(args), kw=kwargs, s=self)
## super(Script, self).__init__(*args, **kwargs)
def setup(self):
"""Check results of command line processing"""
debug.trace_fmtd(5, "Script.setup(): self={s}", s=self)
self.spacy_model = self.get_parsed_option(LANG_MODEL, self.spacy_model)
debug.assertion(self.type_prefix != self.entity_delim)
# Load SpaCy language model (normally large model for English)
debug.trace_fmt(4, "loading SpaCy model {m}", m=self.spacy_model)
try:
self.nlp = spacy.load(self.spacy_model)
except:
system.print_stderr("Problem loading model {m} via spacy: {exc}",
m=self.spacy_model, exc=system.get_exception())
# If model package not properly installed, the model can be
# created by the following workaround (n.b., uses specific model for clarity);
# nlp = spacy.load("en_core_web_sm")
# =>
# import en_core_web_sm
# nlp = en_core_web_sm.load()
# TODO: Figure out step needed for propeer spaCy bookkeeping after a
# new package is added to python (e.g., en_core_web_sm under site-packages).
try:
debug.trace(3, "Warning: Trying eval hack to load model")
# pylint: disable=eval-used, exec-used
exec("import " + self.spacy_model)
debug.trace_fmt(4, "dir({m}): {d}", m=self.spacy_model, d=eval("dir(" + self.spacy_model + ")"))
self.nlp = eval(self.spacy_model + ".load()")
except:
system.print_stderr("Problem with alternative load of model {m}: {exc}",
m=self.spacy_model, exc=system.get_exception())
debug.trace_object(5, self, label="Script instance")
def process_line(self, line):
"""Processes current line from input,producing comma-separate list of entities (with type prefix)"""
# TODO: add entity-type filter
debug.trace_fmtd(6, "Script.process_line({l})", l=line)
type_prefix = self.type_prefix
entity_delim = self.entity_delim
# Invoke NER over text
# TODO: allow for embedded sentences
debug.assertion(self.nlp)
doc = self.nlp(line)
# Format as comma-separated list of typed entities
entity_specs = []
for ent in doc.ents:
ent_text = ent.text
debug.assertion(type_prefix not in ent.label_)
if (type_prefix in ent_text):
ent_text = (self.entity_quote + ent_text + self.entity_quote)
entity_specs.append(ent.label_ + type_prefix + ent_text)
print(entity_delim.join(entity_specs))
#-------------------------------------------------------------------------------
if __name__ == '__main__':
debug.trace_current_context(level=debug.QUITE_DETAILED)
app = Script(
description=__doc__,
# Note: skip_input controls the line-by-line processing, which is inefficient but simple to
# understand; in contrast, manual_input controls iterator-based input (the opposite of both).
skip_input=False,
manual_input=False,
boolean_options=[],
text_options=[(LANG_MODEL, "Language model for NER, etc.")])
app.run()