Skip to content

Commit

Permalink
Make consecutive hyphens in comments a non-error
Browse files Browse the repository at this point in the history
This is experiment and may be reverted later.
  • Loading branch information
sideshowbarker committed Jun 10, 2016
1 parent 0706524 commit f0b12a2
Show file tree
Hide file tree
Showing 2 changed files with 104 additions and 4 deletions.
4 changes: 2 additions & 2 deletions src/nu/validator/htmlparser/impl/ErrorReportingTokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -413,8 +413,8 @@ private boolean isAstralPrivateUse(int c) {
err("Nameless doctype.");
}

@Override protected void errConsecutiveHyphens() throws SAXException {
err("Consecutive hyphens did not terminate a comment. \u201C--\u201D is not permitted inside a comment, but e.g. \u201C- -\u201D is.");
@Override protected void errNestedComment() throws SAXException {
err("Saw \u201C<!--\u201D within a comment. Probable cause: Nested comment (not allowed).");
}

@Override protected void errPrematureEndOfComment() throws SAXException {
Expand Down
104 changes: 102 additions & 2 deletions src/nu/validator/htmlparser/impl/Tokenizer.java
Original file line number Diff line number Diff line change
Expand Up @@ -222,6 +222,12 @@ public class Tokenizer implements Locator {

public static final int AMBIGUOUS_AMPERSAND = 75;

public static final int COMMENT_LESSTHAN = 76;

public static final int COMMENT_LESSTHAN_BANG = 77;

public static final int COMMENT_LESSTHAN_BANG_DASH = 78;

/**
* Magic value for UTF-16 operations.
*/
Expand Down Expand Up @@ -923,7 +929,6 @@ private void maybeAppendSpaceToBogusComment() throws SAXException {

@Inline private void adjustDoubleHyphenAndAppendToStrBufAndErr(char c)
throws SAXException {
errConsecutiveHyphens();
// [NOCPP[
switch (commentPolicy) {
case ALTER_INFOSET:
Expand Down Expand Up @@ -2446,6 +2451,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '<':
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
Expand Down Expand Up @@ -2491,6 +2500,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
state = transition(state, Tokenizer.COMMENT_END_DASH, reconsume, pos);
break commentloop;
// continue stateloop;
case '<':
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
break stateloop;
Expand Down Expand Up @@ -2592,6 +2605,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
* Stay in the comment end state.
*/
continue;
case '<':
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
continue stateloop;
case '\r':
adjustDoubleHyphenAndAppendToStrBufCarriageReturn();
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
Expand Down Expand Up @@ -2683,6 +2700,85 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
continue stateloop;
}
}
case COMMENT_LESSTHAN:
for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
switch (c) {
case '!':
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
break stateloop;
case '\n':
appendStrBufLineFeed();
continue;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
continue stateloop;
}
}
case COMMENT_LESSTHAN_BANG:
for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
switch (c) {
case '-':
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT_LESSTHAN_BANG_DASH, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
break stateloop;
case '\n':
appendStrBufLineFeed();
continue;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
continue stateloop;
}
}
case COMMENT_LESSTHAN_BANG_DASH:
for (;;) {
if (++pos == endPos) {
break stateloop;
}
c = checkChar(buf, pos);
switch (c) {
case '-':
errNestedComment();
adjustDoubleHyphenAndAppendToStrBufAndErr(c);
state = transition(state, Tokenizer.COMMENT_END, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
break stateloop;
case '\n':
appendStrBufLineFeed();
continue;
case '\u0000':
c = '\uFFFD';
// fall thru
default:
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
continue stateloop;
}
}
// XXX reorder point
case COMMENT_START_DASH:
if (++pos == endPos) {
Expand Down Expand Up @@ -2712,6 +2808,10 @@ private void ensureBufferSpace(int inputLength) throws SAXException {
*/
state = transition(state, Tokenizer.DATA, reconsume, pos);
continue stateloop;
case '<':
appendStrBuf(c);
state = transition(state, Tokenizer.COMMENT_LESSTHAN, reconsume, pos);
continue stateloop;
case '\r':
appendStrBufCarriageReturn();
state = transition(state, Tokenizer.COMMENT, reconsume, pos);
Expand Down Expand Up @@ -6777,7 +6877,7 @@ protected void errGtInPublicId() throws SAXException {
protected void errNamelessDoctype() throws SAXException {
}

protected void errConsecutiveHyphens() throws SAXException {
protected void errNestedComment() throws SAXException {
}

protected void errPrematureEndOfComment() throws SAXException {
Expand Down

2 comments on commit f0b12a2

@zcorpan
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Nice work :-) I have not looked at the diff yet but I tested the cases in whatwg/html#1356 (comment) in https://checker.html5.org/#textarea and found the following bugs:

Handling of --!

<!----!

Gives 2 errors, should be 1 error (End of file inside comment.)

<!----!-->

Gives 1 error, should give no errors.

Number of warnings for mappability to XML

<!------------->

Gives many warnings about mappability to XML. Not a regression, but seems annoying.

<!--<!--x-->

Has two warnings about mappability to XML. Previously only 1 warning.

Handling of EOF in the new states

<!--<

No error. Should have 1 error (End of file inside comment.)

<!--<!

No error. Should have 1 error (End of file inside comment.)

<!--<!-

No error. Should have 1 error (End of file inside comment.)

Handling of < in the new states

<!--<<!--x-->

No error. Should have 1 error about nested comments.

<!--<!<!--x-->

No error. Should have 1 error about nested comments.

<!--<!-<!--x-->

No error. Should have 1 error about nested comments.

Handling of - in the comment less-than state

<!--<-->

Gives 1 error. Should have no error.

HTH

@sideshowbarker
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Handling of --!

Gives 1 error, should give no errors.

Both fixed.

Number of warnings for mappability to XML

Gives many warnings about mappability to XML. Not a regression, but seems annoying.

The checker now emits only one two-consecutive-hyphens warning per comment.

Has two warnings about mappability to XML. Previously only 1 warning.

Just one warning there now.

Handling of EOF in the new states

All fixed. Errors now reported as expected.

Handling of - in the comment less-than state

Gives 1 error. Should have no error.

Fixed.

Thanks for all the cases and for testing.

Please sign in to comment.