Merge pull request #1442 from explosion/feature/fix-sp

💫Fix SP tag, tweak Vectors.__init__, fix Morphology
explosion · Oct 24, 2017 · ef3e5a3 · ef3e5a3
2 parents fdf25d1 + 9010a1a
commit ef3e5a3
Show file tree

Hide file tree

Showing 10 changed files with 85 additions and 70 deletions.
diff --git a/spacy/lang/de/tag_map.py b/spacy/lang/de/tag_map.py
@@ -62,5 +62,5 @@
     "VVIZU":    {POS: VERB, "VerbForm": "inf"},
     "VVPP":     {POS: VERB, "Aspect": "perf", "VerbForm": "part"},
     "XY":       {POS: X},
-    "SP":       {POS: SPACE}
+    "_SP":      {POS: SPACE}
 }
diff --git a/spacy/lang/en/tag_map.py b/spacy/lang/en/tag_map.py
@@ -55,11 +55,11 @@
     "WP":       {POS: NOUN, "PronType": "int|rel"},
     "WP$":      {POS: ADJ, "Poss": "yes", "PronType": "int|rel"},
     "WRB":      {POS: ADV, "PronType": "int|rel"},
-    "SP":       {POS: SPACE},
     "ADD":      {POS: X},
     "NFP":      {POS: PUNCT},
     "GW":       {POS: X},
     "XX":       {POS: X},
     "BES":      {POS: VERB},
-    "HVS":      {POS: VERB}
+    "HVS":      {POS: VERB},
+    "_SP":       {POS: SPACE},
 }
diff --git a/spacy/lang/es/tag_map.py b/spacy/lang/es/tag_map.py
@@ -303,5 +303,5 @@
     "VERB__VerbForm=Ger": {"morph": "VerbForm=Ger", "pos": "VERB"},
     "VERB__VerbForm=Inf": {"morph": "VerbForm=Inf", "pos": "VERB"},
     "X___": {"morph": "_", "pos": "X"},
-    "SP": {"morph": "_", "pos": "SPACE"},
+    "_SP": {"morph": "_", "pos": "SPACE"},
 }
diff --git a/spacy/lang/th/tag_map.py b/spacy/lang/th/tag_map.py
@@ -19,63 +19,64 @@
     "NPRP":     {POS: PRON},
     # ADJ
     "ADJ":      {POS: ADJ},
-    "NONM":      {POS: ADJ},
-    "VATT":      {POS: ADJ},
-    "DONM":      {POS: ADJ},
+    "NONM":     {POS: ADJ},
+    "VATT":     {POS: ADJ},
+    "DONM":     {POS: ADJ},
     # ADV
     "ADV":      {POS: ADV},
-    "ADVN":      {POS: ADV},
-    "ADVI":      {POS: ADV},
-    "ADVP":      {POS: ADV},
-    "ADVS":      {POS: ADV},
+    "ADVN":     {POS: ADV},
+    "ADVI":     {POS: ADV},
+    "ADVP":     {POS: ADV},
+    "ADVS":     {POS: ADV},
 	# INT
     "INT":      {POS: INTJ},
     # PRON
     "PROPN":    {POS: PROPN},
-    "PPRS":    {POS: PROPN},
-    "PDMN":    {POS: PROPN},
-    "PNTR":    {POS: PROPN},
+    "PPRS":     {POS: PROPN},
+    "PDMN":     {POS: PROPN},
+    "PNTR":     {POS: PROPN},
     # DET
     "DET":      {POS: DET},
-    "DDAN":      {POS: DET},
-    "DDAC":      {POS: DET},
-    "DDBQ":      {POS: DET},
-    "DDAQ":      {POS: DET},
-    "DIAC":      {POS: DET},
-    "DIBQ":      {POS: DET},
-    "DIAQ":      {POS: DET},
-    "DCNM":      {POS: DET},
+    "DDAN":     {POS: DET},
+    "DDAC":     {POS: DET},
+    "DDBQ":     {POS: DET},
+    "DDAQ":     {POS: DET},
+    "DIAC":     {POS: DET},
+    "DIBQ":     {POS: DET},
+    "DIAQ":     {POS: DET},
+    "DCNM":     {POS: DET},
     # NUM
     "NUM":      {POS: NUM},
-    "NCNM":      {POS: NUM},
-    "NLBL":      {POS: NUM},
-    "DCNM":      {POS: NUM},
+    "NCNM":     {POS: NUM},
+    "NLBL":     {POS: NUM},
+    "DCNM":     {POS: NUM},
 	# AUX
     "AUX":      {POS: AUX},
-    "XVBM":      {POS: AUX},
-    "XVAM":      {POS: AUX},
-    "XVMM":      {POS: AUX},
-    "XVBB":      {POS: AUX},
-    "XVAE":      {POS: AUX},
+    "XVBM":     {POS: AUX},
+    "XVAM":     {POS: AUX},
+    "XVMM":     {POS: AUX},
+    "XVBB":     {POS: AUX},
+    "XVAE":     {POS: AUX},
 	# ADP
     "ADP":      {POS: ADP},
-    "RPRE":      {POS: ADP},
+    "RPRE":     {POS: ADP},
     # CCONJ
     "CCONJ":    {POS: CCONJ},
-    "JCRG":    {POS: CCONJ},
+    "JCRG":     {POS: CCONJ},
 	# SCONJ
     "SCONJ":    {POS: SCONJ},
-    "PREL":    {POS: SCONJ},
-    "JSBR":    {POS: SCONJ},
-    "JCMP":    {POS: SCONJ},
+    "PREL":     {POS: SCONJ},
+    "JSBR":     {POS: SCONJ},
+    "JCMP":     {POS: SCONJ},
     # PART
-    "PART":    {POS: PART},
-    "FIXN":    {POS: PART},
-    "FIXV":    {POS: PART},
-    "EAFF":    {POS: PART},
-    "AITT":    {POS: PART},
-    "NEG":    {POS: PART},
+    "PART":     {POS: PART},
+    "FIXN":     {POS: PART},
+    "FIXV":     {POS: PART},
+    "EAFF":     {POS: PART},
+    "AITT":     {POS: PART},
+    "NEG":      {POS: PART},
     # PUNCT
     "PUNCT":    {POS: PUNCT},
-    "PUNC":    {POS: PUNCT}
+    "PUNC":     {POS: PUNCT},
+    "_SP":      {POS: SPACE}
 }
diff --git a/spacy/morphology.pxd b/spacy/morphology.pxd
@@ -44,7 +44,7 @@ cdef class Morphology:
     cdef int assign_feature(self, uint64_t* morph, univ_morph_t feat_id, bint value) except -1
 
 
-cpdef enum univ_morph_t:
+cdef enum univ_morph_t:
     NIL = 0
     Animacy_anim = symbols.Animacy_anim
     Animacy_inam

diff --git a/spacy/morphology.pyx b/spacy/morphology.pyx
@@ -4,7 +4,7 @@ from __future__ import unicode_literals
 
 from libc.string cimport memset
 
-from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT
+from .parts_of_speech cimport ADJ, VERB, NOUN, PUNCT, SPACE
 from .attrs cimport POS, IS_SPACE
 from .parts_of_speech import IDS as POS_IDS
 from .lexeme cimport Lexeme
@@ -36,14 +36,22 @@ cdef class Morphology:
     def __init__(self, StringStore string_store, tag_map, lemmatizer, exc=None):
         self.mem = Pool()
         self.strings = string_store
+        # Add the special whitespace tag. It is prefixed with an underscore so
+        # that it sorts after every tag name starting with an upper-case letter.
+        space_attrs = tag_map.pop('SP', {POS: SPACE})
+        if '_SP' not in tag_map:
+            self.strings.add('_SP')
+            tag_map = dict(tag_map)
+            tag_map['_SP'] = space_attrs
+        self.tag_names = tuple(sorted(tag_map.keys()))
         self.tag_map = {}
         self.lemmatizer = lemmatizer
         self.n_tags = len(tag_map)
-        self.tag_names = tuple(sorted(tag_map.keys()))
         self.reverse_index = {}
 
         self.rich_tags = <RichTagC*>self.mem.alloc(self.n_tags+1, sizeof(RichTagC))
         for i, (tag_str, attrs) in enumerate(sorted(tag_map.items())):
+            self.strings.add(tag_str)
             self.tag_map[tag_str] = dict(attrs)
             attrs = _normalize_props(attrs)
             attrs = intify_attrs(attrs, self.strings, _do_deprecated=True)
@@ -93,7 +101,7 @@ cdef class Morphology:
         # the statistical model fails.
         # Related to Issue #220
         if Lexeme.c_check_flag(token.lex, IS_SPACE):
-            tag_id = self.reverse_index[self.strings.add('SP')]
+            tag_id = self.reverse_index[self.strings.add('_SP')]
         rich_tag = self.rich_tags[tag_id]
         analysis = <MorphAnalysisC*>self._cache.get(tag_id, token.lex.orth)
         if analysis is NULL:
@@ -426,3 +434,7 @@ IDS = {
 
 
 NAMES = [key for key, value in sorted(IDS.items(), key=lambda item: item[1])]
+# Unfortunate hack: works around a problem with the long cpdef enum, which
+# generates an enormous amount of C++ under Cython 0.24+. We keep the enum
+# as cdef and expose the names to Python by injecting IDS into this namespace.
+locals().update(IDS)
diff --git a/spacy/tests/vectors/test_vectors.py b/spacy/tests/vectors/test_vectors.py
@@ -35,18 +35,18 @@ def vocab(en_vocab, vectors):
 
 
 def test_init_vectors_with_data(strings, data):
-    v = Vectors(strings, data)
+    v = Vectors(strings, data=data)
     assert v.shape == data.shape
 
 def test_init_vectors_with_width(strings):
-    v = Vectors(strings, 3)
+    v = Vectors(strings, width=3)
     for string in strings:
         v.add(string)
     assert v.shape == (len(strings), 3)
 
 
 def test_get_vector(strings, data):
-    v = Vectors(strings, data)
+    v = Vectors(strings, data=data)
     for string in strings:
         v.add(string)
     assert list(v[strings[0]]) == list(data[0])
@@ -56,7 +56,7 @@ def test_get_vector(strings, data):
 
 def test_set_vector(strings, data):
     orig = data.copy()
-    v = Vectors(strings, data)
+    v = Vectors(strings, data=data)
     for string in strings:
         v.add(string)
     assert list(v[strings[0]]) == list(orig[0])

diff --git a/spacy/vectors.pyx b/spacy/vectors.pyx
@@ -32,22 +32,24 @@ cdef class Vectors:
     cdef public object keys
     cdef public int i
 
-    def __init__(self, strings, data_or_width=0):
+    def __init__(self, strings, width=0, data=None):
         if isinstance(strings, StringStore):
             self.strings = strings
         else:
             self.strings = StringStore()
             for string in strings:
                 self.strings.add(string)
-        if isinstance(data_or_width, int):
-            self.data = data = numpy.zeros((len(strings), data_or_width),
-                                           dtype='f')
+        if data is not None:
+            self.data = numpy.asarray(data, dtype='f')
         else:
-            data = data_or_width
+            self.data = numpy.zeros((len(self.strings), width), dtype='f')
         self.i = 0
-        self.data = data
         self.key2row = {}
-        self.keys = np.ndarray((self.data.shape[0],), dtype='uint64')
+        self.keys = numpy.zeros((self.data.shape[0],), dtype='uint64')
+        for i, string in enumerate(self.strings):
+            if i >= self.data.shape[0]:
+                break
+            self.add(self.strings[string], self.data[i])
 
     def __reduce__(self):
         return (Vectors, (self.strings, self.data))

diff --git a/spacy/vocab.pyx b/spacy/vocab.pyx
@@ -62,12 +62,9 @@ cdef class Vocab:
         if strings:
             for string in strings:
                 _ = self[string]
-        for name in tag_map.keys():
-            if name:
-                self.strings.add(name)
         self.lex_attr_getters = lex_attr_getters
         self.morphology = Morphology(self.strings, tag_map, lemmatizer)
-        self.vectors = Vectors(self.strings)
+        self.vectors = Vectors(self.strings, width=0)
 
     property lang:
         def __get__(self):
@@ -255,7 +252,7 @@ cdef class Vocab:
         """
         if new_dim is None:
             new_dim = self.vectors.data.shape[1]
-        self.vectors = Vectors(self.strings, new_dim)
+        self.vectors = Vectors(self.strings, width=new_dim)
 
     def get_vector(self, orth):
         """Retrieve a vector for a word in the vocabulary.
@@ -338,7 +335,7 @@ cdef class Vocab:
             if self.vectors is None:
                 return None
             else:
-                return self.vectors.to_bytes(exclude='strings.json')
+                return self.vectors.to_bytes()
 
         getters = OrderedDict((
             ('strings', lambda: self.strings.to_bytes()),
@@ -358,7 +355,7 @@ cdef class Vocab:
             if self.vectors is None:
                 return None
             else:
-                return self.vectors.from_bytes(b, exclude='strings')
+                return self.vectors.from_bytes(b)
         setters = OrderedDict((
             ('strings', lambda b: self.strings.from_bytes(b)),
             ('lexemes', lambda b: self.lexemes_from_bytes(b)),

diff --git a/website/api/vectors.jade b/website/api/vectors.jade
@@ -12,7 +12,7 @@ p
 
 p
     |  Create a new vector store. To keep the vector table empty, pass
-    |  #[code data_or_width=0]. You can also create the vector table and add
+    |  #[code width=0]. You can also create the vector table and add
     |  vectors one by one, or set the vector values directly on initialisation.
 
 +aside-code("Example").
@@ -21,11 +21,11 @@ p
 
     empty_vectors = Vectors(StringStore())
 
-    vectors = Vectors([u'cat'], 300)
+    vectors = Vectors([u'cat'], width=300)
     vectors[u'cat'] = numpy.random.uniform(-1, 1, (300,))
 
     vector_table = numpy.zeros((3, 300), dtype='f')
-    vectors = Vectors(StringStore(), vector_table)
+    vectors = Vectors(StringStore(), data=vector_table)
 
 +table(["Name", "Type", "Description"])
     +row
@@ -36,9 +36,12 @@ p
             |  that maps strings to hash values, and vice versa.
 
     +row
-        +cell #[code data_or_width]
-        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']] or int
-        +cell Vector data or number of dimensions.
+        +cell #[code data]
+        +cell #[code.u-break numpy.ndarray[ndim=1, dtype='float32']]
+
+    +row
+        +cell #[code width]
+        +cell Number of dimensions.
 
     +row("foot")
         +cell returns