From f401050878a3ca2506c65c29968a943557f1fdd8 Mon Sep 17 00:00:00 2001 From: Charlie Marsh Date: Fri, 9 Jun 2023 21:44:50 -0400 Subject: [PATCH] Introduce `PythonWhitespace` to confine trim operations to Python whitespace (#4994) ## Summary We use `.trim()` and friends in a bunch of places, to strip whitespace from source code. However, not all Unicode whitespace characters are considered "whitespace" in Python, which only supports the standard space, tab, and form-feed characters. This PR audits our usages of `.trim()`, `.trim_start()`, `.trim_end()`, and `char::is_whitespace`, and replaces them as appropriate with a new `.trim_whitespace()` analogues, powered by a `PythonWhitespace` trait. In general, the only place that should continue to use `.trim()` is content within docstrings, which don't need to adhere to Python's semantic definitions of whitespace. Closes #4991. --- crates/ruff/src/autofix/edits.rs | 8 +++--- crates/ruff/src/importer/insertion.rs | 4 +-- crates/ruff/src/noqa.rs | 4 +-- .../flake8_pytest_style/rules/helpers.rs | 3 +- .../src/rules/flake8_simplify/rules/fix_if.rs | 3 +- crates/ruff/src/rules/isort/helpers.rs | 4 +-- .../src/rules/isort/rules/organize_imports.rs | 6 ++-- .../pycodestyle/rules/logical_lines/mod.rs | 5 ++-- .../whitespace_before_comment.rs | 3 +- .../rules/blank_before_after_class.rs | 9 +++--- .../rules/blank_before_after_function.rs | 11 ++++---- crates/ruff_python_ast/src/helpers.rs | 4 +-- .../src/comments/placement.rs | 4 +-- .../ruff_python_whitespace/src/whitespace.rs | 28 +++++++++++++++++++ 14 files changed, 64 insertions(+), 32 deletions(-) diff --git a/crates/ruff/src/autofix/edits.rs b/crates/ruff/src/autofix/edits.rs index 00a86bbb5c..623e1876ed 100644 --- a/crates/ruff/src/autofix/edits.rs +++ b/crates/ruff/src/autofix/edits.rs @@ -7,7 +7,7 @@ use rustpython_parser::{lexer, Mode, Tok}; use ruff_diagnostics::Edit; use ruff_python_ast::helpers; use ruff_python_ast::source_code::{Indexer, Locator, Stylist}; -use ruff_python_whitespace::NewlineWithTrailingNewline; +use ruff_python_whitespace::{is_python_whitespace, NewlineWithTrailingNewline, PythonWhitespace}; use crate::autofix::codemods; @@ -242,7 +242,7 @@ fn trailing_semicolon(stmt: &Stmt, locator: &Locator) -> Option { let contents = locator.after(stmt.end()); for line in NewlineWithTrailingNewline::from(contents) { - let trimmed = line.trim_start(); + let trimmed = line.trim_whitespace_start(); if trimmed.starts_with(';') { let colon_offset = line.text_len() - trimmed.text_len(); @@ -262,7 +262,7 @@ fn next_stmt_break(semicolon: TextSize, locator: &Locator) -> TextSize { let contents = &locator.contents()[usize::from(start_location)..]; for line in NewlineWithTrailingNewline::from(contents) { - let trimmed = line.trim(); + let trimmed = line.trim_whitespace(); // Skip past any continuations. if trimmed.starts_with('\\') { continue; @@ -276,7 +276,7 @@ fn next_stmt_break(semicolon: TextSize, locator: &Locator) -> TextSize { } else { // Otherwise, find the start of the next statement. (Or, anything that isn't // whitespace.) - let relative_offset = line.find(|c: char| !c.is_whitespace()).unwrap(); + let relative_offset = line.find(|c: char| !is_python_whitespace(c)).unwrap(); line.start() + TextSize::try_from(relative_offset).unwrap() }; } diff --git a/crates/ruff/src/importer/insertion.rs b/crates/ruff/src/importer/insertion.rs index a347f795fa..469f7a1d28 100644 --- a/crates/ruff/src/importer/insertion.rs +++ b/crates/ruff/src/importer/insertion.rs @@ -8,7 +8,7 @@ use rustpython_parser::{lexer, Mode, Tok}; use ruff_diagnostics::Edit; use ruff_python_ast::helpers::is_docstring_stmt; use ruff_python_ast::source_code::{Locator, Stylist}; -use ruff_python_whitespace::UniversalNewlineIterator; +use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator}; use ruff_textwrap::indent; #[derive(Debug, Clone, PartialEq, Eq)] @@ -69,7 +69,7 @@ impl<'a> Insertion<'a> { // Skip over commented lines. for line in UniversalNewlineIterator::with_offset(locator.after(location), location) { - if line.trim_start().starts_with('#') { + if line.trim_whitespace_start().starts_with('#') { location = line.full_end(); } else { break; diff --git a/crates/ruff/src/noqa.rs b/crates/ruff/src/noqa.rs index 4dba6aee9d..1d8aa49dae 100644 --- a/crates/ruff/src/noqa.rs +++ b/crates/ruff/src/noqa.rs @@ -12,7 +12,7 @@ use ruff_text_size::{TextLen, TextRange, TextSize}; use ruff_diagnostics::Diagnostic; use ruff_python_ast::source_code::Locator; -use ruff_python_whitespace::LineEnding; +use ruff_python_whitespace::{LineEnding, PythonWhitespace}; use crate::codes::NoqaCode; use crate::registry::{AsRule, Rule, RuleSet}; @@ -87,7 +87,7 @@ enum ParsedExemption<'a> { /// Return a [`ParsedExemption`] for a given comment line. fn parse_file_exemption(line: &str) -> ParsedExemption { - let line = line.trim_start(); + let line = line.trim_whitespace_start(); if line.starts_with("# flake8: noqa") || line.starts_with("# flake8: NOQA") diff --git a/crates/ruff/src/rules/flake8_pytest_style/rules/helpers.rs b/crates/ruff/src/rules/flake8_pytest_style/rules/helpers.rs index 248005a0c7..f753163036 100644 --- a/crates/ruff/src/rules/flake8_pytest_style/rules/helpers.rs +++ b/crates/ruff/src/rules/flake8_pytest_style/rules/helpers.rs @@ -3,6 +3,7 @@ use rustpython_parser::ast::{self, Constant, Decorator, Expr, Keyword}; use ruff_python_ast::call_path::{collect_call_path, CallPath}; use ruff_python_ast::helpers::map_callable; use ruff_python_semantic::model::SemanticModel; +use ruff_python_whitespace::PythonWhitespace; pub(super) fn get_mark_decorators( decorators: &[Decorator], @@ -81,7 +82,7 @@ pub(super) fn split_names(names: &str) -> Vec<&str> { names .split(',') .filter_map(|s| { - let trimmed = s.trim(); + let trimmed = s.trim_whitespace(); if trimmed.is_empty() { None } else { diff --git a/crates/ruff/src/rules/flake8_simplify/rules/fix_if.rs b/crates/ruff/src/rules/flake8_simplify/rules/fix_if.rs index a36364dedf..0988aaf3b3 100644 --- a/crates/ruff/src/rules/flake8_simplify/rules/fix_if.rs +++ b/crates/ruff/src/rules/flake8_simplify/rules/fix_if.rs @@ -11,6 +11,7 @@ use crate::autofix::codemods::CodegenStylist; use ruff_diagnostics::Edit; use ruff_python_ast::source_code::{Locator, Stylist}; use ruff_python_ast::whitespace; +use ruff_python_whitespace::PythonWhitespace; use crate::cst::matchers::{match_function_def, match_if, match_indented_block, match_statement}; @@ -44,7 +45,7 @@ pub(crate) fn fix_nested_if_statements( let contents = locator.lines(stmt.range()); // Handle `elif` blocks differently; detect them upfront. - let is_elif = contents.trim_start().starts_with("elif"); + let is_elif = contents.trim_whitespace_start().starts_with("elif"); // If this is an `elif`, we have to remove the `elif` keyword for now. (We'll // restore the `el` later on.) diff --git a/crates/ruff/src/rules/isort/helpers.rs b/crates/ruff/src/rules/isort/helpers.rs index 4bcf005aa4..e37cbb389e 100644 --- a/crates/ruff/src/rules/isort/helpers.rs +++ b/crates/ruff/src/rules/isort/helpers.rs @@ -2,7 +2,7 @@ use rustpython_parser::ast::{Ranged, Stmt}; use rustpython_parser::{lexer, Mode, Tok}; use ruff_python_ast::source_code::Locator; -use ruff_python_whitespace::UniversalNewlines; +use ruff_python_whitespace::{PythonWhitespace, UniversalNewlines}; use crate::rules::isort::types::TrailingComma; @@ -63,7 +63,7 @@ pub(super) fn has_comment_break(stmt: &Stmt, locator: &Locator) -> bool { // def f(): pass let mut seen_blank = false; for line in locator.up_to(stmt.start()).universal_newlines().rev() { - let line = line.trim(); + let line = line.trim_whitespace(); if seen_blank { if line.starts_with('#') { return true; diff --git a/crates/ruff/src/rules/isort/rules/organize_imports.rs b/crates/ruff/src/rules/isort/rules/organize_imports.rs index e1acc53c49..d7a3dbe549 100644 --- a/crates/ruff/src/rules/isort/rules/organize_imports.rs +++ b/crates/ruff/src/rules/isort/rules/organize_imports.rs @@ -10,7 +10,7 @@ use ruff_python_ast::helpers::{ followed_by_multi_statement_line, preceded_by_multi_statement_line, trailing_lines_end, }; use ruff_python_ast::source_code::{Indexer, Locator, Stylist}; -use ruff_python_whitespace::{leading_indentation, UniversalNewlines}; +use ruff_python_whitespace::{leading_indentation, PythonWhitespace, UniversalNewlines}; use ruff_textwrap::indent; use crate::line_width::LineWidth; @@ -72,7 +72,9 @@ fn matches_ignoring_indentation(val1: &str, val2: &str) -> bool { val1.universal_newlines() .zip_longest(val2.universal_newlines()) .all(|pair| match pair { - EitherOrBoth::Both(line1, line2) => line1.trim_start() == line2.trim_start(), + EitherOrBoth::Both(line1, line2) => { + line1.trim_whitespace_start() == line2.trim_whitespace_start() + } _ => false, }) } diff --git a/crates/ruff/src/rules/pycodestyle/rules/logical_lines/mod.rs b/crates/ruff/src/rules/pycodestyle/rules/logical_lines/mod.rs index 27086f77a8..ca87e1264a 100644 --- a/crates/ruff/src/rules/pycodestyle/rules/logical_lines/mod.rs +++ b/crates/ruff/src/rules/pycodestyle/rules/logical_lines/mod.rs @@ -25,6 +25,7 @@ pub(crate) use missing_whitespace_around_operator::{ MissingWhitespaceAroundBitwiseOrShiftOperator, MissingWhitespaceAroundModuloOperator, MissingWhitespaceAroundOperator, }; +use ruff_python_whitespace::is_python_whitespace; pub(crate) use space_around_operator::{ space_around_operator, MultipleSpacesAfterOperator, MultipleSpacesBeforeOperator, TabAfterOperator, TabBeforeOperator, @@ -378,7 +379,7 @@ impl Whitespace { len += c.text_len(); } else if matches!(c, '\n' | '\r') { break; - } else if c.is_whitespace() { + } else if is_python_whitespace(c) { count += 1; len += c.text_len(); } else { @@ -409,7 +410,7 @@ impl Whitespace { } else if matches!(c, '\n' | '\r') { // Indent return (Self::None, TextSize::default()); - } else if c.is_whitespace() { + } else if is_python_whitespace(c) { count += 1; len += c.text_len(); } else { diff --git a/crates/ruff/src/rules/pycodestyle/rules/logical_lines/whitespace_before_comment.rs b/crates/ruff/src/rules/pycodestyle/rules/logical_lines/whitespace_before_comment.rs index 455bd5f0f3..e9ee52f52d 100644 --- a/crates/ruff/src/rules/pycodestyle/rules/logical_lines/whitespace_before_comment.rs +++ b/crates/ruff/src/rules/pycodestyle/rules/logical_lines/whitespace_before_comment.rs @@ -4,6 +4,7 @@ use ruff_diagnostics::Violation; use ruff_macros::{derive_message_formats, violation}; use ruff_python_ast::source_code::Locator; use ruff_python_ast::token_kind::TokenKind; +use ruff_python_whitespace::PythonWhitespace; use crate::checkers::logical_lines::LogicalLinesContext; use crate::rules::pycodestyle::rules::logical_lines::LogicalLine; @@ -156,7 +157,7 @@ pub(crate) fn whitespace_before_comment( )); let token_text = locator.slice(range); - let is_inline_comment = !line_text.trim().is_empty(); + let is_inline_comment = !line_text.trim_whitespace().is_empty(); if is_inline_comment { if range.start() - prev_end < " ".text_len() { context.push( diff --git a/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_class.rs b/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_class.rs index 54e5c887e7..17d106d550 100644 --- a/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_class.rs +++ b/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_class.rs @@ -4,7 +4,7 @@ use rustpython_parser::ast::Ranged; use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix}; use ruff_macros::{derive_message_formats, violation}; use ruff_python_semantic::definition::{Definition, Member, MemberKind}; -use ruff_python_whitespace::{UniversalNewlineIterator, UniversalNewlines}; +use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator, UniversalNewlines}; use crate::checkers::ast::Checker; use crate::docstrings::Docstring; @@ -133,10 +133,9 @@ pub(crate) fn blank_before_after_class(checker: &mut Checker, docstring: &Docstr .locator .slice(TextRange::new(docstring.end(), stmt.end())); - let all_blank_after = after - .universal_newlines() - .skip(1) - .all(|line| line.trim().is_empty() || line.trim_start().starts_with('#')); + let all_blank_after = after.universal_newlines().skip(1).all(|line| { + line.trim_whitespace().is_empty() || line.trim_whitespace_start().starts_with('#') + }); if all_blank_after { return; } diff --git a/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_function.rs b/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_function.rs index f7b4a5182a..d8c52c5b66 100644 --- a/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_function.rs +++ b/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_function.rs @@ -6,7 +6,7 @@ use rustpython_parser::ast::Ranged; use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix}; use ruff_macros::{derive_message_formats, violation}; use ruff_python_semantic::definition::{Definition, Member, MemberKind}; -use ruff_python_whitespace::{UniversalNewlineIterator, UniversalNewlines}; +use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator, UniversalNewlines}; use crate::checkers::ast::Checker; use crate::docstrings::Docstring; @@ -102,10 +102,9 @@ pub(crate) fn blank_before_after_function(checker: &mut Checker, docstring: &Doc .slice(TextRange::new(docstring.end(), stmt.end())); // If the docstring is only followed by blank and commented lines, abort. - let all_blank_after = after - .universal_newlines() - .skip(1) - .all(|line| line.trim().is_empty() || line.trim_start().starts_with('#')); + let all_blank_after = after.universal_newlines().skip(1).all(|line| { + line.trim_whitespace().is_empty() || line.trim_whitespace_start().starts_with('#') + }); if all_blank_after { return; } @@ -129,7 +128,7 @@ pub(crate) fn blank_before_after_function(checker: &mut Checker, docstring: &Doc // Avoid violations for blank lines followed by inner functions or classes. if blank_lines_after == 1 && lines - .find(|line| !line.trim_start().starts_with('#')) + .find(|line| !line.trim_whitespace_start().starts_with('#')) .map_or(false, |line| INNER_FUNCTION_OR_CLASS_REGEX.is_match(&line)) { return; diff --git a/crates/ruff_python_ast/src/helpers.rs b/crates/ruff_python_ast/src/helpers.rs index 627b9c2f09..6b4a2ddbff 100644 --- a/crates/ruff_python_ast/src/helpers.rs +++ b/crates/ruff_python_ast/src/helpers.rs @@ -4,7 +4,7 @@ use std::path::Path; use itertools::Itertools; use log::error; use num_traits::Zero; -use ruff_python_whitespace::UniversalNewlineIterator; +use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator}; use ruff_text_size::{TextRange, TextSize}; use rustc_hash::{FxHashMap, FxHashSet}; use rustpython_parser::ast::{ @@ -1031,7 +1031,7 @@ pub fn trailing_lines_end(stmt: &Stmt, locator: &Locator) -> TextSize { let rest = &locator.contents()[usize::from(line_end)..]; UniversalNewlineIterator::with_offset(rest, line_end) - .take_while(|line| line.trim().is_empty()) + .take_while(|line| line.trim_whitespace().is_empty()) .last() .map_or(line_end, |l| l.full_end()) } diff --git a/crates/ruff_python_formatter/src/comments/placement.rs b/crates/ruff_python_formatter/src/comments/placement.rs index b5e99495a1..c1c2b8df1f 100644 --- a/crates/ruff_python_formatter/src/comments/placement.rs +++ b/crates/ruff_python_formatter/src/comments/placement.rs @@ -4,7 +4,7 @@ use crate::trivia::{SimpleTokenizer, TokenKind}; use ruff_python_ast::node::AnyNodeRef; use ruff_python_ast::source_code::Locator; use ruff_python_ast::whitespace; -use ruff_python_whitespace::UniversalNewlines; +use ruff_python_whitespace::{PythonWhitespace, UniversalNewlines}; use ruff_text_size::{TextRange, TextSize}; use rustpython_parser::ast::Ranged; use std::cmp::Ordering; @@ -986,7 +986,7 @@ fn max_empty_lines(contents: &str) -> usize { let mut max_empty_lines = 0; for line in contents.universal_newlines().skip(1) { - if line.trim().is_empty() { + if line.trim_whitespace().is_empty() { empty_lines += 1; } else { max_empty_lines = max_empty_lines.max(empty_lines); diff --git a/crates/ruff_python_whitespace/src/whitespace.rs b/crates/ruff_python_whitespace/src/whitespace.rs index 00139d3640..8bb161aba3 100644 --- a/crates/ruff_python_whitespace/src/whitespace.rs +++ b/crates/ruff_python_whitespace/src/whitespace.rs @@ -13,3 +13,31 @@ pub fn leading_indentation(line: &str) -> &str { line.find(|char: char| !is_python_whitespace(char)) .map_or(line, |index| &line[..index]) } + +pub trait PythonWhitespace { + /// Like `str::trim()`, but only removes whitespace characters that Python considers + /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens). + fn trim_whitespace(&self) -> &Self; + + /// Like `str::trim_start()`, but only removes whitespace characters that Python considers + /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens). + fn trim_whitespace_start(&self) -> &Self; + + /// Like `str::trim_end()`, but only removes whitespace characters that Python considers + /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens). + fn trim_whitespace_end(&self) -> &Self; +} + +impl PythonWhitespace for str { + fn trim_whitespace(&self) -> &Self { + self.trim_matches(is_python_whitespace) + } + + fn trim_whitespace_start(&self) -> &Self { + self.trim_start_matches(is_python_whitespace) + } + + fn trim_whitespace_end(&self) -> &Self { + self.trim_end_matches(is_python_whitespace) + } +}