This repository has been archived by the owner on Dec 15, 2022. It is now read-only.

Merge pull request #160 from atom/wl-entities-take-two

Improve entity scopes, take two

Wliu authored Sep 20, 2017
2 parents 457b294 + adef4ad commit 9fde136
Showing 2 changed files with 124 additions and 16 deletions.
grammars/html.cson: 62 changes (51 additions & 11 deletions)
@@ -398,7 +398,7 @@
]
}
{
'include': '#entities'
'include': '#text-entities'
}
{
'match': '<>'
@@ -415,21 +415,61 @@
'include': '#python'
}
]
'entities':
'text-entities':
# https://www.w3.org/TR/html51/syntax.html#consume-a-character-reference
'patterns': [
{
'match': '(&)([a-zA-Z0-9]+|#[0-9]+|#x[0-9a-fA-F]+)(;)'
'captures':
'begin': '(&)([a-zA-Z0-9]+|#\\d+|#[xX][0-9a-fA-F]+)'
'beginCaptures':
'1':
'name': 'punctuation.definition.entity.begin.html'
'2':
'name': 'entity.name.entity.other.html'
'3':
'end': ';'
'endCaptures':
'0':
'name': 'punctuation.definition.entity.end.html'
'name': 'constant.character.entity.html'
}
{
'match': '&(?!\\s|<|&)'
'name': 'invalid.illegal.bad-ampersand.html'
}
]
'attribute-entities':
# https://www.w3.org/TR/html51/syntax.html#consume-a-character-reference
# Because it would be infeasible to include the entire list of allowed entities,
# make sure that an equals sign or the end of a string does not follow a potential reference.
'patterns': [
{
'begin': '(&)(#\\d+|#[xX][0-9a-fA-F]+)'
'beginCaptures':
'1':
'name': 'punctuation.definition.entity.begin.html'
'2':
'name': 'entity.name.entity.other.html'
'end': ';'
'endCaptures':
'0':
'name': 'punctuation.definition.entity.end.html'
'name': 'constant.character.entity.html'
}
{
'begin': '(&)([a-zA-Z0-9]++)(?!["\'=])'
'beginCaptures':
'1':
'name': 'punctuation.definition.entity.begin.html'
'2':
'name': 'entity.name.entity.other.html'
'end': ';'
'endCaptures':
'0':
'name': 'punctuation.definition.entity.end.html'
'name': 'constant.character.entity.html'
}
{
'match': '&'
# In attributes, potential references that end with an equals sign are fine
'match': '&(?!\\s|<|&|[a-zA-Z0-9]+=)'
'name': 'invalid.illegal.bad-ampersand.html'
}
]
@@ -480,7 +520,7 @@
'include': '#embedded-code'
}
{
'include': '#entities'
'include': '#attribute-entities'
}
]
'string-single-quoted':
@@ -498,7 +538,7 @@
'include': '#embedded-code'
}
{
'include': '#entities'
'include': '#attribute-entities'
}
]
'tag-generic-attribute':
@@ -559,7 +599,7 @@
'include': '#embedded-code'
}
{
'include': '#entities'
'include': '#attribute-entities'
}
]
}
@@ -579,7 +619,7 @@
'include': '#embedded-code'
}
{
'include': '#entities'
'include': '#attribute-entities'
}
]
}
@@ -608,7 +648,7 @@
'unquoted-attribute':
'patterns': [
{
'include': '#entities'
'include': '#attribute-entities'
}
{
# https://www.w3.org/TR/html51/syntax.html#attribute-value-unquoted-state
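
The negative lookaheads in the two invalid.illegal.bad-ampersand.html rules carry most of the behavioral change above. As a rough illustration, here is a small CoffeeScript sketch that exercises the same patterns with plain JavaScript regexes rather than the Oniguruma engine Atom compiles them with; the sample strings are invented, and cases that depend on the end of a line (such as a bare '&') also depend on how the tokenizer feeds lines to the grammar, so they are not reproduced here.

# Text content: an ampersand is invalid only when it is not followed by
# whitespace, '<', or another '&' (see the '#text-entities' rules above).
textBadAmpersand = /&(?!\s|<|&)/

# Attribute values: additionally tolerate '&word=', so query parameters such
# as '&type=json' are not flagged (see the '#attribute-entities' rules above).
attrBadAmpersand = /&(?!\s|<|&|[a-zA-Z0-9]+=)/

console.log textBadAmpersand.test 'PSE&>'                  # true  -> flagged as invalid
console.log textBadAmpersand.test '& '                     # false -> followed by whitespace
console.log attrBadAmpersand.test 'example.com?&"'         # true  -> bare '&' before a closing quote
console.log attrBadAmpersand.test 'example.com?&type=json' # false -> looks like a query parameter
console.log attrBadAmpersand.test 'example.com?& '         # false -> followed by whitespace
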
spec/html-spec.coffee: 78 changes (73 additions & 5 deletions)
@@ -376,14 +376,82 @@ describe 'HTML grammar', ->
expect(lines[2][1]).toEqual value: 'disabled', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-without-value.html', 'entity.other.attribute-name.html']
expect(lines[2][2]).toEqual value: '>', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'punctuation.definition.tag.end.html']

describe "entities", ->
describe "entities in text", ->
it "tokenizes & and characters after it", ->
{tokens} = grammar.tokenizeLine '& &amp; &a'

expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic', 'invalid.illegal.bad-ampersand.html']
expect(tokens[3]).toEqual value: 'amp', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
expect(tokens[4]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
expect(tokens[7]).toEqual value: 'a', scopes: ['text.html.basic']
expect(tokens[0]).toEqual value: '& ', scopes: ['text.html.basic']
expect(tokens[1]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
expect(tokens[2]).toEqual value: 'amp', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
expect(tokens[3]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
expect(tokens[5]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
expect(tokens[6]).toEqual value: 'a', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']

lines = grammar.tokenizeLines '&\n'
expect(lines[0][0]).toEqual value: '&', scopes: ['text.html.basic']

it "tokenizes hexadecimal and digit entities", ->
{tokens} = grammar.tokenizeLine '&#x00022; &#X00022; &#34;'

expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
expect(tokens[1]).toEqual value: '#x00022', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
expect(tokens[2]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
expect(tokens[4]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
expect(tokens[5]).toEqual value: '#X00022', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
expect(tokens[6]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']
expect(tokens[8]).toEqual value: '&', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
expect(tokens[9]).toEqual value: '#34', scopes: ['text.html.basic', 'constant.character.entity.html', 'entity.name.entity.other.html']
expect(tokens[10]).toEqual value: ';', scopes: ['text.html.basic', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']

it "tokenizes invalid ampersands", ->
{tokens} = grammar.tokenizeLine 'PSE&>'
expect(tokens[0]).toEqual value: 'PSE', scopes: ['text.html.basic']
expect(tokens[1]).toEqual value: '&', scopes: ['text.html.basic', 'invalid.illegal.bad-ampersand.html']
expect(tokens[2]).toEqual value: '>', scopes: ['text.html.basic']

{tokens} = grammar.tokenizeLine 'PSE&'
expect(tokens[0]).toEqual value: 'PSE&', scopes: ['text.html.basic']

{tokens} = grammar.tokenizeLine '&<'
expect(tokens[0]).toEqual value: '&<', scopes: ['text.html.basic']

{tokens} = grammar.tokenizeLine '& '
expect(tokens[0]).toEqual value: '& ', scopes: ['text.html.basic']

{tokens} = grammar.tokenizeLine '&'
expect(tokens[0]).toEqual value: '&', scopes: ['text.html.basic']

{tokens} = grammar.tokenizeLine '&&'
expect(tokens[0]).toEqual value: '&&', scopes: ['text.html.basic']

describe "entities in attributes", ->
it "tokenizes entities", ->
{tokens} = grammar.tokenizeLine '<a href="http://example.com?&amp;">'
expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.begin.html']
expect(tokens[8]).toEqual value: 'amp', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'entity.name.entity.other.html']
expect(tokens[9]).toEqual value: ';', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'constant.character.entity.html', 'punctuation.definition.entity.end.html']

it "does not tokenize query parameters as entities", ->
{tokens} = grammar.tokenizeLine '<a href="http://example.com?one=1&type=json&topic=css">'
expect(tokens[6]).toEqual value: 'http://example.com?one=1&type=json&topic=css', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']

it "tokenizes invalid ampersands", ->
# Note: in order to replicate the following tests' behaviors, make sure you have language-hyperlink disabled
{tokens} = grammar.tokenizeLine '<a href="http://example.com?&">'
expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']

{tokens} = grammar.tokenizeLine '<a href="http://example.com?&=">'
expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']

{tokens} = grammar.tokenizeLine '<a href="http://example.com?& ">'
expect(tokens[6]).toEqual value: 'http://example.com?& ', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']

lines = grammar.tokenizeLines '<a href="http://example.com?&\n">'
expect(lines[0][6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']

{tokens} = grammar.tokenizeLine '<a href="http://example.com?&&">'
expect(tokens[6]).toEqual value: 'http://example.com?&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html']
expect(tokens[7]).toEqual value: '&', scopes: ['text.html.basic', 'meta.tag.inline.any.html', 'meta.attribute-with-value.html', 'string.quoted.double.html', 'invalid.illegal.bad-ampersand.html']

describe "firstLineMatch", ->
it "recognises HTML5 doctypes", ->
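
For anyone who wants to poke at the new text-entities/attribute-entities split outside the spec suite, a sketch along these lines should work from Atom's developer console, assuming the language-html package is already loaded; the markup is invented for illustration and the printed output is left to the console.

grammar = atom.grammars.grammarForScopeName 'text.html.basic'

# In running text, '&type' is scoped as the start of a character reference,
# much like the '&a' expectation in the specs above.
{tokens} = grammar.tokenizeLine '&type=json'
console.log (t.scopes[t.scopes.length - 1] for t in tokens)

# Inside a quoted attribute value the same characters stay plain string
# content, because '#attribute-entities' skips references followed by '='.
{tokens} = grammar.tokenizeLine '<a href="http://example.com?one=1&type=json">'
console.log (t.value for t in tokens)
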
