Update the `invalid-escape-sequence` rule (#5359)

Just a couple of small tweaks based on reading the rule with fresh eyes and
newer best practices.
This commit is contained in:
Charlie Marsh 2023-06-25 18:20:31 -04:00 committed by GitHub
parent b233763156
commit 1fe4073b56
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
1 changed file with 100 additions and 88 deletions

View File

@ -1,10 +1,9 @@
use anyhow::{bail, Result};
use log::error;
use ruff_text_size::{TextLen, TextRange, TextSize};
use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix};
use ruff_macros::{derive_message_formats, violation};
use ruff_python_ast::source_code::Locator;
use ruff_python_ast::str::{leading_quote, trailing_quote};
/// ## What it does
/// Checks for invalid escape sequences.
@ -21,6 +20,9 @@ use ruff_python_ast::source_code::Locator;
/// ```python
/// regex = r"\.png$"
/// ```
///
/// ## References
/// - [Python documentation: String and Bytes literals](https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals)
#[violation]
pub struct InvalidEscapeSequence(char);
@ -36,24 +38,6 @@ impl AlwaysAutofixableViolation for InvalidEscapeSequence {
}
}
/// Characters that form a valid escape sequence when they directly follow a
/// backslash in a Python string literal.
///
/// See: https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals
const VALID_ESCAPE_SEQUENCES: &[char; 23] = &[
    // Line continuation, backslash, and the two quote characters.
    '\n', '\\', '\'', '"',
    // Single-letter escapes.
    'a', 'b', 'f', 'n', 'r', 't', 'v',
    // Leading digit of an octal escape.
    '0', '1', '2', '3', '4', '5', '6', '7',
    // Hex escape prefix.
    'x',
    // Escape sequences only recognized in string literals
    'N', 'u', 'U',
];
/// Identify the quote delimiter that terminates a string token.
///
/// Candidates are checked in a fixed order with the triple-quote forms first,
/// so a triple-quoted token yields its three-character delimiter rather than
/// a single quote character.
///
/// # Errors
/// Returns an error when `text` ends with none of `'''`, `"""`, `'`, or `"`.
fn extract_quote(text: &str) -> Result<&str> {
    const DELIMITERS: [&str; 4] = ["'''", "\"\"\"", "'", "\""];

    let matched = DELIMITERS
        .iter()
        .copied()
        .find(|delimiter| text.ends_with(*delimiter));

    match matched {
        Some(delimiter) => Ok(delimiter),
        None => bail!("Unable to find quotation mark for String token"),
    }
}
/// W605
pub(crate) fn invalid_escape_sequence(
locator: &Locator,
@ -65,17 +49,19 @@ pub(crate) fn invalid_escape_sequence(
let text = locator.slice(range);
// Determine whether the string is single- or triple-quoted.
let Ok(quote) = extract_quote(text) else {
error!("Unable to find quotation mark for string token");
let Some(leading_quote) = leading_quote(text) else {
return diagnostics;
};
let quote_pos = text.find(quote).unwrap();
let prefix = &text[..quote_pos];
let body = &text[quote_pos + quote.len()..text.len() - quote.len()];
let Some(trailing_quote) = trailing_quote(text) else {
return diagnostics;
};
let body = &text[leading_quote.len()..text.len() - trailing_quote.len()];
if !prefix.contains(['r', 'R']) {
let start_offset =
range.start() + TextSize::try_from(quote_pos).unwrap() + quote.text_len();
if leading_quote.contains(['r', 'R']) {
return diagnostics;
}
let start_offset = range.start() + TextSize::try_from(leading_quote.len()).unwrap();
let mut chars_iter = body.char_indices().peekable();
@ -102,7 +88,34 @@ pub(crate) fn invalid_escape_sequence(
}
// If the next character is a valid escape sequence, skip.
if VALID_ESCAPE_SEQUENCES.contains(next_char) {
// See: https://docs.python.org/3/reference/lexical_analysis.html#string-and-bytes-literals.
if matches!(
next_char,
'\n'
| '\\'
| '\''
| '"'
| 'a'
| 'b'
| 'f'
| 'n'
| 'r'
| 't'
| 'v'
| '0'
| '1'
| '2'
| '3'
| '4'
| '5'
| '6'
| '7'
| 'x'
// Escape sequences only recognized in string literals
| 'N'
| 'u'
| 'U'
) {
contains_valid_escape_sequence = true;
continue;
}
@ -140,12 +153,11 @@ pub(crate) fn invalid_escape_sequence(
} else {
"r".to_string()
},
range.start() + TextSize::try_from(quote_pos).unwrap(),
range.start(),
)));
}
}
}
}
diagnostics
}