Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Emit a single error for contiguous sequences of unknown tokens #106566

Merged
merged 1 commit into from
Jan 14, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
28 changes: 24 additions & 4 deletions compiler/rustc_parse/src/lexer/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ impl<'a> StringReader<'a> {
/// preceded by whitespace.
fn next_token(&mut self) -> (Token, bool) {
let mut preceded_by_whitespace = false;

let mut swallow_next_invalid = 0;
// Skip trivial (whitespace & comments) tokens
loop {
let token = self.cursor.advance_token();
Expand Down Expand Up @@ -232,19 +232,34 @@ impl<'a> StringReader<'a> {
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),

rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
let c = self.str_from(start).chars().next().unwrap();
// Don't emit diagnostics for sequences of the same invalid token
if swallow_next_invalid > 0 {
swallow_next_invalid -= 1;
continue;
}
let mut it = self.str_from_to_end(start).chars();
let c = it.next().unwrap();
let repeats = it.take_while(|c1| *c1 == c).count();
let mut err =
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
self.struct_err_span_char(start, self.pos + Pos::from_usize(repeats * c.len_utf8()), "unknown start of token", c);
// FIXME: the lexer could be used to turn the ASCII version of unicode
// homoglyphs, instead of keeping a table in `check_for_substitution`into the
// token. Ideally, this should be inside `rustc_lexer`. However, we should
// first remove compound tokens like `<<` from `rustc_lexer`, and then add
// fancier error recovery to it, as there will be less overall work to do this
// way.
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
let token = unicode_chars::check_for_substitution(self, start, c, &mut err, repeats+1);
if c == '\x00' {
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
}
if repeats > 0 {
if repeats == 1 {
err.note(format!("character appears once more"));
} else {
err.note(format!("character appears {repeats} more times"));
}
swallow_next_invalid = repeats;
}
err.emit();
if let Some(token) = token {
token
Expand Down Expand Up @@ -486,6 +501,11 @@ impl<'a> StringReader<'a> {
&self.src[self.src_index(start)..self.src_index(end)]
}

/// Slice of the source text spanning from `start` until the end
fn str_from_to_end(&self, start: BytePos) -> &str {
&self.src[self.src_index(start)..]
}

fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
Err(RawStrError::InvalidStarter { bad_char }) => {
Expand Down
10 changes: 8 additions & 2 deletions compiler/rustc_parse/src/lexer/unicode_chars.rs
Original file line number Diff line number Diff line change
Expand Up @@ -337,10 +337,11 @@ pub(super) fn check_for_substitution<'a>(
pos: BytePos,
ch: char,
err: &mut Diagnostic,
count: usize,
) -> Option<token::TokenKind> {
let &(_u_char, u_name, ascii_char) = UNICODE_ARRAY.iter().find(|&&(c, _, _)| c == ch)?;

let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8()));
let span = Span::with_root_ctxt(pos, pos + Pos::from_usize(ch.len_utf8() * count));

let Some((_ascii_char, ascii_name, token)) = ASCII_ARRAY.iter().find(|&&(c, _, _)| c == ascii_char) else {
let msg = format!("substitution character not found for '{}'", ch);
Expand Down Expand Up @@ -369,7 +370,12 @@ pub(super) fn check_for_substitution<'a>(
"Unicode character '{}' ({}) looks like '{}' ({}), but it is not",
ch, u_name, ascii_char, ascii_name
);
err.span_suggestion(span, &msg, ascii_char, Applicability::MaybeIncorrect);
err.span_suggestion(
span,
&msg,
ascii_char.to_string().repeat(count),
Applicability::MaybeIncorrect,
);
}
token.clone()
}
Expand Down
2 changes: 0 additions & 2 deletions tests/rustdoc-ui/invalid-syntax.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -77,8 +77,6 @@ LL | /// ```
| ^^^
|
= note: error from rustc: unknown start of token: `
= note: error from rustc: unknown start of token: `
= note: error from rustc: unknown start of token: `

warning: could not parse code block as Rust code
--> $DIR/invalid-syntax.rs:64:5
Expand Down
Binary file modified tests/ui/parser/issues/issue-66473.stderr
Binary file not shown.
Binary file modified tests/ui/parser/issues/issue-68629.stderr
Binary file not shown.
Binary file modified tests/ui/parser/issues/issue-68730.stderr
Binary file not shown.
4 changes: 4 additions & 0 deletions tests/ui/parser/unicode-chars.rs
Original file line number Diff line number Diff line change
Expand Up @@ -2,4 +2,8 @@ fn main() {
let y = 0;
//~^ ERROR unknown start of token: \u{37e}
//~^^ HELP Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), but it is not
    let x = 0;
//~^ ERROR unknown start of token: \u{a0}
//~^^ NOTE character appears 3 more times
//~^^^ HELP Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
}
14 changes: 13 additions & 1 deletion tests/ui/parser/unicode-chars.stderr
Original file line number Diff line number Diff line change
Expand Up @@ -9,5 +9,17 @@ help: Unicode character ';' (Greek Question Mark) looks like ';' (Semicolon), b
LL | let y = 0;
| ~

error: aborting due to previous error
error: unknown start of token: \u{a0}
--> $DIR/unicode-chars.rs:5:5
|
LL |     let x = 0;
| ^^^^
|
= note: character appears 3 more times
help: Unicode character ' ' (No-Break Space) looks like ' ' (Space), but it is not
|
LL | let x = 0;
| ++++

error: aborting due to 2 previous errors