mirror of https://github.com/astral-sh/ruff
Apply NFKC normalization to unicode identifiers in the lexer (#10412)
This commit is contained in:
parent
bb540718c2
commit
92e6026446
@@ -2374,6 +2374,7 @@ dependencies = [
  "static_assertions",
  "tiny-keccak",
  "unicode-ident",
+ "unicode-normalization",
  "unicode_names2",
 ]
 
@@ -108,6 +108,7 @@ unic-ucd-category = { version = "0.9" }
 unicode-ident = { version = "1.0.12" }
 unicode-width = { version = "0.1.11" }
 unicode_names2 = { version = "1.2.2" }
+unicode-normalization = { version = "0.1.23" }
 ureq = { version = "2.9.6" }
 url = { version = "2.5.0" }
 uuid = { version = "1.6.1", features = ["v4", "fast-rng", "macro-diagnostics", "js"] }
@@ -0,0 +1,9 @@
+"""Test that unicode identifiers are NFKC-normalised"""
+
+𝒞 = 500
+print(𝒞)
+print(C + 𝒞)  # 2 references to the same variable due to NFKC normalization
+print(C / 𝒞)
+print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
+
+print(𝒟)  # F821
@@ -156,6 +156,7 @@ mod tests {
     #[test_case(Rule::UndefinedName, Path::new("F821_26.py"))]
     #[test_case(Rule::UndefinedName, Path::new("F821_26.pyi"))]
     #[test_case(Rule::UndefinedName, Path::new("F821_27.py"))]
+    #[test_case(Rule::UndefinedName, Path::new("F821_28.py"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_0.py"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_0.pyi"))]
     #[test_case(Rule::UndefinedExport, Path::new("F822_1.py"))]
@@ -0,0 +1,10 @@
+---
+source: crates/ruff_linter/src/rules/pyflakes/mod.rs
+---
+F821_28.py:9:7: F821 Undefined name `𝒟`
+  |
+7 | print(C == 𝑪 == 𝒞 == 𝓒 == 𝕮)
+8 |
+9 | print(𝒟)  # F821
+  |       ^ F821
+  |
@@ -1,4 +1,4 @@
-use ruff_formatter::{write, FormatContext};
+use ruff_formatter::write;
 use ruff_python_ast::AnyNodeRef;
 use ruff_python_ast::ExprName;
 
@@ -11,16 +11,11 @@ pub struct FormatExprName;
 
 impl FormatNodeRule<ExprName> for FormatExprName {
     fn fmt_fields(&self, item: &ExprName, f: &mut PyFormatter) -> FormatResult<()> {
-        let ExprName { id, range, ctx: _ } = item;
-
-        debug_assert_eq!(
-            id.as_str(),
-            f.context()
-                .source_code()
-                .slice(*range)
-                .text(f.context().source_code())
-        );
-
+        let ExprName {
+            id: _,
+            range,
+            ctx: _,
+        } = item;
         write!(f, [source_text_slice(*range)])
     }
 
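The debug_assert_eq! above is removed because, once the lexer NFKC-normalizes names, the stored id can legitimately differ from the raw source text; the formatter keeps emitting the user's original spelling by slicing the source range instead. A minimal sketch of that mismatch, assuming only the unicode-normalization crate added elsewhere in this commit (the variable names are illustrative, not ruff's):

use unicode_normalization::UnicodeNormalization;

fn main() {
    let source_text = "𝒞";                         // what the user wrote
    let id: String = source_text.nfkc().collect(); // what the lexer now stores for the name
    assert_eq!(id, "C");         // NFKC folds U+1D49E (MATHEMATICAL SCRIPT CAPITAL C) to plain 'C'
    assert_ne!(id, source_text); // so asserting `id == source slice` would no longer hold
}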
@@ -28,6 +28,7 @@ rustc-hash = { workspace = true }
 static_assertions = { workspace = true }
 unicode-ident = { workspace = true }
 unicode_names2 = { workspace = true }
+unicode-normalization = { workspace = true }
 
 [dev-dependencies]
 insta = { workspace = true }
@@ -32,6 +32,7 @@ use std::iter::FusedIterator;
 use std::{char, cmp::Ordering, str::FromStr};
 
 use unicode_ident::{is_xid_continue, is_xid_start};
+use unicode_normalization::UnicodeNormalization;
 
 use ruff_python_ast::{Int, IpyEscapeKind};
 use ruff_text_size::{TextLen, TextRange, TextSize};
@@ -197,10 +198,25 @@ impl<'source> Lexer<'source> {
             _ => {}
         }
 
-        self.cursor.eat_while(is_identifier_continuation);
+        // Keep track of whether the identifier is ASCII-only or not.
+        //
+        // This is important because Python applies NFKC normalization to
+        // identifiers: https://docs.python.org/3/reference/lexical_analysis.html#identifiers.
+        // We need to therefore do the same in our lexer, but applying NFKC normalization
+        // unconditionally is extremely expensive. If we know an identifier is ASCII-only,
+        // (by far the most common case), we can skip NFKC normalization of the identifier.
+        let mut is_ascii = first.is_ascii();
+        self.cursor
+            .eat_while(|c| is_identifier_continuation(c, &mut is_ascii));
 
         let text = self.token_text();
 
+        if !is_ascii {
+            return Ok(Tok::Name {
+                name: text.nfkc().collect::<String>().into_boxed_str(),
+            });
+        }
+
         let keyword = match text {
             "False" => Tok::False,
             "None" => Tok::None,
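The comment block added here spells out the trade-off: NFKC normalization only matters for non-ASCII identifiers, and ASCII text is left unchanged by NFKC, so the lexer tracks an is_ascii flag and only pays for nfkc() on the rare non-ASCII path. A small stand-alone illustration of that behaviour, assuming only the unicode-normalization crate from this commit (normalize_identifier is a hypothetical helper, not lexer code):

use unicode_normalization::UnicodeNormalization;

// Mirror of the lexer's decision: normalize only when something non-ASCII was seen.
fn normalize_identifier(text: &str) -> String {
    if text.is_ascii() {
        text.to_string() // ASCII is unaffected by NFKC, so skip the pass entirely
    } else {
        text.nfkc().collect()
    }
}

fn main() {
    // These compatibility variants all collapse to the same name, which is why
    // the F821_28.py fixture treats them as references to one variable.
    for variant in ["C", "𝑪", "𝒞", "𝓒", "𝕮"] {
        assert_eq!(normalize_identifier(variant), "C");
    }
}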
@@ -1583,14 +1599,19 @@ fn is_unicode_identifier_start(c: char) -> bool {
     is_xid_start(c)
 }
 
-// Checks if the character c is a valid continuation character as described
-// in https://docs.python.org/3/reference/lexical_analysis.html#identifiers
-fn is_identifier_continuation(c: char) -> bool {
+/// Checks if the character c is a valid continuation character as described
+/// in <https://docs.python.org/3/reference/lexical_analysis.html#identifiers>.
+///
+/// Additionally, this function also keeps track of whether or not the total
+/// identifier is ASCII-only or not by mutably altering a reference to a
+/// boolean value passed in.
+fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
     // Arrange things such that ASCII codepoints never
     // result in the slower `is_xid_continue` getting called.
     if c.is_ascii() {
         matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
     } else {
+        *identifier_is_ascii_only = false;
         is_xid_continue(c)
     }
 }
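Passing a &mut bool into the continuation predicate lets a single pass over the identifier both consume characters and record whether anything non-ASCII showed up. A self-contained sketch of that pattern; eat_while below is a hypothetical stand-in for the lexer's Cursor::eat_while, and is_alphanumeric approximates is_xid_continue so the example needs no extra crates:

use std::iter::Peekable;
use std::str::Chars;

// Hypothetical helper: advance the iterator while the predicate holds.
fn eat_while(chars: &mut Peekable<Chars<'_>>, mut pred: impl FnMut(char) -> bool) {
    while chars.peek().copied().is_some_and(&mut pred) {
        chars.next();
    }
}

// Same shape as the lexer's predicate; the flag is flipped on the non-ASCII path.
fn is_identifier_continuation(c: char, identifier_is_ascii_only: &mut bool) -> bool {
    if c.is_ascii() {
        matches!(c, 'a'..='z' | 'A'..='Z' | '_' | '0'..='9')
    } else {
        *identifier_is_ascii_only = false;
        c.is_alphanumeric() // rough stand-in for is_xid_continue
    }
}

fn main() {
    let mut chars = "café = 1".chars().peekable();
    let first = chars.next().unwrap();
    let mut is_ascii = first.is_ascii();
    eat_while(&mut chars, |c| is_identifier_continuation(c, &mut is_ascii));
    assert!(!is_ascii); // 'é' flipped the flag, so this name would be NFKC-normalized
}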
@@ -2042,6 +2063,17 @@ def f(arg=%timeit a = b):
         assert_debug_snapshot!(lex_source(source));
     }
 
+    fn get_tokens_only(source: &str) -> Vec<Tok> {
+        lex_source(source).into_iter().map(|(tok, _)| tok).collect()
+    }
+
+    #[test]
+    fn test_nfkc_normalization() {
+        let source1 = "𝒞 = 500";
+        let source2 = "C = 500";
+        assert_eq!(get_tokens_only(source1), get_tokens_only(source2));
+    }
+
     fn triple_quoted_eol(eol: &str) -> Vec<Spanned> {
         let source = format!("\"\"\"{eol} test string{eol} \"\"\"");
         lex_source(&source)
@@ -16,6 +16,9 @@ pub enum Tok {
     /// Token value for a name, commonly known as an identifier.
     Name {
         /// The name value.
+        ///
+        /// Unicode names are NFKC-normalized by the lexer,
+        /// matching [the behaviour of Python's lexer](https://docs.python.org/3/reference/lexical_analysis.html#identifiers)
         name: Box<str>,
     },
     /// Token value for an integer.