Emit a single error for contiguous sequences of Unicode homoglyphs

This commit is contained in:
clubby789
2023-01-07 16:33:05 +00:00
parent ef4046e4f3
commit a3d6bc3468
8 changed files with 49 additions and 9 deletions

View File

@@ -79,7 +79,7 @@ impl<'a> StringReader<'a> {
/// preceded by whitespace.
fn next_token(&mut self) -> (Token, bool) {
let mut preceded_by_whitespace = false;
let mut swallow_next_invalid = 0;
// Skip trivial (whitespace & comments) tokens
loop {
let token = self.cursor.advance_token();
@@ -232,19 +232,34 @@ impl<'a> StringReader<'a> {
rustc_lexer::TokenKind::Percent => token::BinOp(token::Percent),
rustc_lexer::TokenKind::Unknown | rustc_lexer::TokenKind::InvalidIdent => {
let c = self.str_from(start).chars().next().unwrap();
// Don't emit diagnostics for sequences of the same invalid token
if swallow_next_invalid > 0 {
swallow_next_invalid -= 1;
continue;
}
let mut it = self.str_from_to_end(start).chars();
let c = it.next().unwrap();
let repeats = it.take_while(|c1| *c1 == c).count();
let mut err =
self.struct_err_span_char(start, self.pos, "unknown start of token", c);
self.struct_err_span_char(start, self.pos + Pos::from_usize(repeats * c.len_utf8()), "unknown start of token", c);
// FIXME: the lexer could be used to turn the ASCII version of unicode
// homoglyphs, instead of keeping a table in `check_for_substitution`into the
// token. Ideally, this should be inside `rustc_lexer`. However, we should
// first remove compound tokens like `<<` from `rustc_lexer`, and then add
// fancier error recovery to it, as there will be less overall work to do this
// way.
let token = unicode_chars::check_for_substitution(self, start, c, &mut err);
let token = unicode_chars::check_for_substitution(self, start, c, &mut err, repeats+1);
if c == '\x00' {
err.help("source files must contain UTF-8 encoded text, unexpected null bytes might occur when a different encoding is used");
}
if repeats > 0 {
if repeats == 1 {
err.note(format!("character appears once more"));
} else {
err.note(format!("character appears {repeats} more times"));
}
swallow_next_invalid = repeats;
}
err.emit();
if let Some(token) = token {
token
@@ -486,6 +501,11 @@ impl<'a> StringReader<'a> {
&self.src[self.src_index(start)..self.src_index(end)]
}
/// Slice of the source text spanning from `start` until the end
fn str_from_to_end(&self, start: BytePos) -> &str {
&self.src[self.src_index(start)..]
}
fn report_raw_str_error(&self, start: BytePos, prefix_len: u32) -> ! {
match rustc_lexer::validate_raw_str(self.str_from(start), prefix_len) {
Err(RawStrError::InvalidStarter { bad_char }) => {