[refurb] Count codepoints not bytes for `slice-to-remove-prefix-or-suffix (FURB188)` (#13631)

2024-10-07 09:13:28 -05:00 · 2024-10-07 09:13:28 -05:00 · 14ee5dbfde
parent 27ac34d683
commit 14ee5dbfde
4 changed files with 114 additions and 7 deletions
--- a/crates/ruff_linter/resources/test/fixtures/refurb/FURB188.py
+++ b/crates/ruff_linter/resources/test/fixtures/refurb/FURB188.py
@ -169,4 +169,32 @@ def ignore_step():
    text = "!x!y!z"
    if text.startswith("!"):
        text = text[1::2]
-    print(text)
+    print(text)
+
+def handle_unicode():
+    # should be skipped!
+    text = "řetězec"
+    if text.startswith("ř"): 
+        text = text[2:]
+
+    # should be linted
+    # with fix `text = text.removeprefix("ř")`
+    text = "řetězec"
+    if text.startswith("ř"): 
+        text = text[1:]
+
+
+def handle_surrogates():
+    # should be linted
+    text = "\ud800\udc00heythere"
+    if text.startswith("\ud800\udc00"):
+        text = text[2:]
+    text = "\U00010000heythere"
+    if text.startswith("\U00010000"):
+        text = text[1:]
+    
+    # should not be linted
+    text = "\ud800\udc00heythere"
+    if text.startswith("\ud800\udc00"):
+        text = text[1:]
+    
--- a/crates/ruff_linter/src/rules/refurb/rules/slice_to_remove_prefix_or_suffix.rs
+++ b/crates/ruff_linter/src/rules/refurb/rules/slice_to_remove_prefix_or_suffix.rs
@ -4,7 +4,7 @@ use ruff_macros::{derive_message_formats, violation};
 use ruff_python_ast as ast;
 use ruff_python_semantic::SemanticModel;
 use ruff_source_file::Locator;
-use ruff_text_size::{Ranged, TextLen};
+use ruff_text_size::Ranged;

 /// ## What it does
 /// Checks for the removal of a prefix or suffix from a string by assigning
@ -334,8 +334,9 @@ fn affix_matches_slice_bound(data: &RemoveAffixData, semantic: &SemanticModel) -
            }),
        ) => num
            .as_int()
-            .and_then(ast::Int::as_u32) // Only support prefix removal for size at most `u32::MAX`
-            .is_some_and(|x| x == string_val.to_str().text_len().to_u32()),
+            // Only support prefix removal for size at most `usize::MAX`
+            .and_then(ast::Int::as_usize)
+            .is_some_and(|x| x == string_val.chars().count()),
        (
            AffixKind::StartsWith,
            ast::Expr::Call(ast::ExprCall {
@ -369,8 +370,8 @@ fn affix_matches_slice_bound(data: &RemoveAffixData, semantic: &SemanticModel) -
                 // Only support prefix removal for size at most `usize::MAX`
                value
                    .as_int()
-                    .and_then(ast::Int::as_u32)
-                    .is_some_and(|x| x == string_val.to_str().text_len().to_u32())
+                    .and_then(ast::Int::as_usize)
+                    .is_some_and(|x| x == string_val.chars().count())
            },
        ),
        (
--- a/crates/ruff_linter/src/rules/refurb/snapshots/ruff_linterrulesrefurbtestsFURB188_FURB188.py.snap
+++ b/crates/ruff_linter/src/rules/refurb/snapshots/ruff_linterrulesrefurbtestsFURB188_FURB188.py.snap
@ -250,4 +250,74 @@ FURB188.py:162:5: FURB188 [*] Prefer `removeprefix` over conditionally replacing
    162 |+    text = text.removeprefix("!")
 164 163 |     print(text)
 165 164 | 
-166 165 |
+166 165 | 
+
+FURB188.py:183:5: FURB188 [*] Prefer `removeprefix` over conditionally replacing with slice.
+    |
+181 |       # with fix `text = text.removeprefix("ř")`
+182 |       text = "řetězec"
+183 |       if text.startswith("ř"): 
+    |  _____^
+184 | |         text = text[1:]
+    | |_______________________^ FURB188
+    |
+    = help: Use removeprefix instead of assignment conditional upon startswith.
+
+ℹ Safe fix
+180 180 |     # should be linted
+181 181 |     # with fix `text = text.removeprefix("ř")`
+182 182 |     text = "řetězec"
+183     |-    if text.startswith("ř"): 
+184     |-        text = text[1:]
+    183 |+    text = text.removeprefix("ř")
+185 184 | 
+186 185 | 
+187 186 | def handle_surrogates():
+
+FURB188.py:190:5: FURB188 [*] Prefer `removeprefix` over conditionally replacing with slice.
+    |
+188 |       # should be linted
+189 |       text = "\ud800\udc00heythere"
+190 |       if text.startswith("\ud800\udc00"):
+    |  _____^
+191 | |         text = text[2:]
+    | |_______________________^ FURB188
+192 |       text = "\U00010000heythere"
+193 |       if text.startswith("\U00010000"):
+    |
+    = help: Use removeprefix instead of assignment conditional upon startswith.
+
+ℹ Safe fix
+187 187 | def handle_surrogates():
+188 188 |     # should be linted
+189 189 |     text = "\ud800\udc00heythere"
+190     |-    if text.startswith("\ud800\udc00"):
+191     |-        text = text[2:]
+    190 |+    text = text.removeprefix("\ud800\udc00")
+192 191 |     text = "\U00010000heythere"
+193 192 |     if text.startswith("\U00010000"):
+194 193 |         text = text[1:]
+
+FURB188.py:193:5: FURB188 [*] Prefer `removeprefix` over conditionally replacing with slice.
+    |
+191 |           text = text[2:]
+192 |       text = "\U00010000heythere"
+193 |       if text.startswith("\U00010000"):
+    |  _____^
+194 | |         text = text[1:]
+    | |_______________________^ FURB188
+195 |       
+196 |       # should not be linted
+    |
+    = help: Use removeprefix instead of assignment conditional upon startswith.
+
+ℹ Safe fix
+190 190 |     if text.startswith("\ud800\udc00"):
+191 191 |         text = text[2:]
+192 192 |     text = "\U00010000heythere"
+193     |-    if text.startswith("\U00010000"):
+194     |-        text = text[1:]
+    193 |+    text = text.removeprefix("\U00010000")
+195 194 |     
+196 195 |     # should not be linted
+197 196 |     text = "\ud800\udc00heythere"
--- a/crates/ruff_python_ast/src/int.rs
+++ b/crates/ruff_python_ast/src/int.rs
@ -96,6 +96,14 @@ impl Int {
        }
    }

+    /// Return the [`Int`] as a usize, if it can be represented as that data type.
+    pub fn as_usize(&self) -> Option<usize> {
+        match &self.0 {
+            Number::Small(small) => usize::try_from(*small).ok(),
+            Number::Big(_) => None,
+        }
+    }
+
    /// Return the [`Int`] as an i8, if it can be represented as that data type.
    pub fn as_i8(&self) -> Option<i8> {
        match &self.0 {