From f401050878a3ca2506c65c29968a943557f1fdd8 Mon Sep 17 00:00:00 2001
From: Charlie Marsh <charlie.r.marsh@gmail.com>
Date: Fri, 9 Jun 2023 21:44:50 -0400
Subject: [PATCH] Introduce `PythonWhitespace` to confine trim operations to
 Python whitespace (#4994)

## Summary

We use `.trim()` and friends in a bunch of places, to strip whitespace
from source code. However, not all Unicode whitespace characters are
considered "whitespace" in Python, which only supports the standard
space, tab, and form-feed characters.

This PR audits our usages of `.trim()`, `.trim_start()`, `.trim_end()`,
and `char::is_whitespace`, and replaces them as appropriate with a new
`.trim_whitespace()` analogues, powered by a `PythonWhitespace` trait.

In general, the only place that should continue to use `.trim()` is
content within docstrings, which don't need to adhere to Python's
semantic definitions of whitespace.

Closes #4991.
---
 crates/ruff/src/autofix/edits.rs              |  8 +++---
 crates/ruff/src/importer/insertion.rs         |  4 +--
 crates/ruff/src/noqa.rs                       |  4 +--
 .../flake8_pytest_style/rules/helpers.rs      |  3 +-
 .../src/rules/flake8_simplify/rules/fix_if.rs |  3 +-
 crates/ruff/src/rules/isort/helpers.rs        |  4 +--
 .../src/rules/isort/rules/organize_imports.rs |  6 ++--
 .../pycodestyle/rules/logical_lines/mod.rs    |  5 ++--
 .../whitespace_before_comment.rs              |  3 +-
 .../rules/blank_before_after_class.rs         |  9 +++---
 .../rules/blank_before_after_function.rs      | 11 ++++----
 crates/ruff_python_ast/src/helpers.rs         |  4 +--
 .../src/comments/placement.rs                 |  4 +--
 .../ruff_python_whitespace/src/whitespace.rs  | 28 +++++++++++++++++++
 14 files changed, 64 insertions(+), 32 deletions(-)
diff --git a/crates/ruff/src/autofix/edits.rs b/crates/ruff/src/autofix/edits.rs
index 00a86bbb5c..623e1876ed 100644
--- a/crates/ruff/src/autofix/edits.rs
+++ b/crates/ruff/src/autofix/edits.rs
@@ -7,7 +7,7 @@ use rustpython_parser::{lexer, Mode, Tok};
 use ruff_diagnostics::Edit;
 use ruff_python_ast::helpers;
 use ruff_python_ast::source_code::{Indexer, Locator, Stylist};
-use ruff_python_whitespace::NewlineWithTrailingNewline;
+use ruff_python_whitespace::{is_python_whitespace, NewlineWithTrailingNewline, PythonWhitespace};
 
 use crate::autofix::codemods;
 
@@ -242,7 +242,7 @@ fn trailing_semicolon(stmt: &Stmt, locator: &Locator) -> Option<TextSize> {
     let contents = locator.after(stmt.end());
 
     for line in NewlineWithTrailingNewline::from(contents) {
-        let trimmed = line.trim_start();
+        let trimmed = line.trim_whitespace_start();
 
         if trimmed.starts_with(';') {
             let colon_offset = line.text_len() - trimmed.text_len();
@@ -262,7 +262,7 @@ fn next_stmt_break(semicolon: TextSize, locator: &Locator) -> TextSize {
 
     let contents = &locator.contents()[usize::from(start_location)..];
     for line in NewlineWithTrailingNewline::from(contents) {
-        let trimmed = line.trim();
+        let trimmed = line.trim_whitespace();
         // Skip past any continuations.
         if trimmed.starts_with('\\') {
             continue;
@@ -276,7 +276,7 @@ fn next_stmt_break(semicolon: TextSize, locator: &Locator) -> TextSize {
             } else {
                 // Otherwise, find the start of the next statement. (Or, anything that isn't
                 // whitespace.)
-                let relative_offset = line.find(|c: char| !c.is_whitespace()).unwrap();
+                let relative_offset = line.find(|c: char| !is_python_whitespace(c)).unwrap();
                 line.start() + TextSize::try_from(relative_offset).unwrap()
             };
     }
diff --git a/crates/ruff/src/importer/insertion.rs b/crates/ruff/src/importer/insertion.rs
index a347f795fa..469f7a1d28 100644
--- a/crates/ruff/src/importer/insertion.rs
+++ b/crates/ruff/src/importer/insertion.rs
@@ -8,7 +8,7 @@ use rustpython_parser::{lexer, Mode, Tok};
 use ruff_diagnostics::Edit;
 use ruff_python_ast::helpers::is_docstring_stmt;
 use ruff_python_ast::source_code::{Locator, Stylist};
-use ruff_python_whitespace::UniversalNewlineIterator;
+use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator};
 use ruff_textwrap::indent;
 
 #[derive(Debug, Clone, PartialEq, Eq)]
@@ -69,7 +69,7 @@ impl<'a> Insertion<'a> {
 
         // Skip over commented lines.
         for line in UniversalNewlineIterator::with_offset(locator.after(location), location) {
-            if line.trim_start().starts_with('#') {
+            if line.trim_whitespace_start().starts_with('#') {
                 location = line.full_end();
             } else {
                 break;
diff --git a/crates/ruff/src/noqa.rs b/crates/ruff/src/noqa.rs
index 4dba6aee9d..1d8aa49dae 100644
--- a/crates/ruff/src/noqa.rs
+++ b/crates/ruff/src/noqa.rs
@@ -12,7 +12,7 @@ use ruff_text_size::{TextLen, TextRange, TextSize};
 
 use ruff_diagnostics::Diagnostic;
 use ruff_python_ast::source_code::Locator;
-use ruff_python_whitespace::LineEnding;
+use ruff_python_whitespace::{LineEnding, PythonWhitespace};
 
 use crate::codes::NoqaCode;
 use crate::registry::{AsRule, Rule, RuleSet};
@@ -87,7 +87,7 @@ enum ParsedExemption<'a> {
 
 /// Return a [`ParsedExemption`] for a given comment line.
 fn parse_file_exemption(line: &str) -> ParsedExemption {
-    let line = line.trim_start();
+    let line = line.trim_whitespace_start();
 
     if line.starts_with("# flake8: noqa")
         || line.starts_with("# flake8: NOQA")
diff --git a/crates/ruff/src/rules/flake8_pytest_style/rules/helpers.rs b/crates/ruff/src/rules/flake8_pytest_style/rules/helpers.rs
index 248005a0c7..f753163036 100644
--- a/crates/ruff/src/rules/flake8_pytest_style/rules/helpers.rs
+++ b/crates/ruff/src/rules/flake8_pytest_style/rules/helpers.rs
@@ -3,6 +3,7 @@ use rustpython_parser::ast::{self, Constant, Decorator, Expr, Keyword};
 use ruff_python_ast::call_path::{collect_call_path, CallPath};
 use ruff_python_ast::helpers::map_callable;
 use ruff_python_semantic::model::SemanticModel;
+use ruff_python_whitespace::PythonWhitespace;
 
 pub(super) fn get_mark_decorators(
     decorators: &[Decorator],
@@ -81,7 +82,7 @@ pub(super) fn split_names(names: &str) -> Vec<&str> {
     names
         .split(',')
         .filter_map(|s| {
-            let trimmed = s.trim();
+            let trimmed = s.trim_whitespace();
             if trimmed.is_empty() {
                 None
             } else {
diff --git a/crates/ruff/src/rules/flake8_simplify/rules/fix_if.rs b/crates/ruff/src/rules/flake8_simplify/rules/fix_if.rs
index a36364dedf..0988aaf3b3 100644
--- a/crates/ruff/src/rules/flake8_simplify/rules/fix_if.rs
+++ b/crates/ruff/src/rules/flake8_simplify/rules/fix_if.rs
@@ -11,6 +11,7 @@ use crate::autofix::codemods::CodegenStylist;
 use ruff_diagnostics::Edit;
 use ruff_python_ast::source_code::{Locator, Stylist};
 use ruff_python_ast::whitespace;
+use ruff_python_whitespace::PythonWhitespace;
 
 use crate::cst::matchers::{match_function_def, match_if, match_indented_block, match_statement};
 
@@ -44,7 +45,7 @@ pub(crate) fn fix_nested_if_statements(
     let contents = locator.lines(stmt.range());
 
     // Handle `elif` blocks differently; detect them upfront.
-    let is_elif = contents.trim_start().starts_with("elif");
+    let is_elif = contents.trim_whitespace_start().starts_with("elif");
 
     // If this is an `elif`, we have to remove the `elif` keyword for now. (We'll
     // restore the `el` later on.)
diff --git a/crates/ruff/src/rules/isort/helpers.rs b/crates/ruff/src/rules/isort/helpers.rs
index 4bcf005aa4..e37cbb389e 100644
--- a/crates/ruff/src/rules/isort/helpers.rs
+++ b/crates/ruff/src/rules/isort/helpers.rs
@@ -2,7 +2,7 @@ use rustpython_parser::ast::{Ranged, Stmt};
 use rustpython_parser::{lexer, Mode, Tok};
 
 use ruff_python_ast::source_code::Locator;
-use ruff_python_whitespace::UniversalNewlines;
+use ruff_python_whitespace::{PythonWhitespace, UniversalNewlines};
 
 use crate::rules::isort::types::TrailingComma;
 
@@ -63,7 +63,7 @@ pub(super) fn has_comment_break(stmt: &Stmt, locator: &Locator) -> bool {
     //   def f(): pass
     let mut seen_blank = false;
     for line in locator.up_to(stmt.start()).universal_newlines().rev() {
-        let line = line.trim();
+        let line = line.trim_whitespace();
         if seen_blank {
             if line.starts_with('#') {
                 return true;
diff --git a/crates/ruff/src/rules/isort/rules/organize_imports.rs b/crates/ruff/src/rules/isort/rules/organize_imports.rs
index e1acc53c49..d7a3dbe549 100644
--- a/crates/ruff/src/rules/isort/rules/organize_imports.rs
+++ b/crates/ruff/src/rules/isort/rules/organize_imports.rs
@@ -10,7 +10,7 @@ use ruff_python_ast::helpers::{
     followed_by_multi_statement_line, preceded_by_multi_statement_line, trailing_lines_end,
 };
 use ruff_python_ast::source_code::{Indexer, Locator, Stylist};
-use ruff_python_whitespace::{leading_indentation, UniversalNewlines};
+use ruff_python_whitespace::{leading_indentation, PythonWhitespace, UniversalNewlines};
 use ruff_textwrap::indent;
 
 use crate::line_width::LineWidth;
@@ -72,7 +72,9 @@ fn matches_ignoring_indentation(val1: &str, val2: &str) -> bool {
     val1.universal_newlines()
         .zip_longest(val2.universal_newlines())
         .all(|pair| match pair {
-            EitherOrBoth::Both(line1, line2) => line1.trim_start() == line2.trim_start(),
+            EitherOrBoth::Both(line1, line2) => {
+                line1.trim_whitespace_start() == line2.trim_whitespace_start()
+            }
             _ => false,
         })
 }
diff --git a/crates/ruff/src/rules/pycodestyle/rules/logical_lines/mod.rs b/crates/ruff/src/rules/pycodestyle/rules/logical_lines/mod.rs
index 27086f77a8..ca87e1264a 100644
--- a/crates/ruff/src/rules/pycodestyle/rules/logical_lines/mod.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/logical_lines/mod.rs
@@ -25,6 +25,7 @@ pub(crate) use missing_whitespace_around_operator::{
     MissingWhitespaceAroundBitwiseOrShiftOperator, MissingWhitespaceAroundModuloOperator,
     MissingWhitespaceAroundOperator,
 };
+use ruff_python_whitespace::is_python_whitespace;
 pub(crate) use space_around_operator::{
     space_around_operator, MultipleSpacesAfterOperator, MultipleSpacesBeforeOperator,
     TabAfterOperator, TabBeforeOperator,
@@ -378,7 +379,7 @@ impl Whitespace {
                 len += c.text_len();
             } else if matches!(c, '\n' | '\r') {
                 break;
-            } else if c.is_whitespace() {
+            } else if is_python_whitespace(c) {
                 count += 1;
                 len += c.text_len();
             } else {
@@ -409,7 +410,7 @@ impl Whitespace {
             } else if matches!(c, '\n' | '\r') {
                 // Indent
                 return (Self::None, TextSize::default());
-            } else if c.is_whitespace() {
+            } else if is_python_whitespace(c) {
                 count += 1;
                 len += c.text_len();
             } else {
diff --git a/crates/ruff/src/rules/pycodestyle/rules/logical_lines/whitespace_before_comment.rs b/crates/ruff/src/rules/pycodestyle/rules/logical_lines/whitespace_before_comment.rs
index 455bd5f0f3..e9ee52f52d 100644
--- a/crates/ruff/src/rules/pycodestyle/rules/logical_lines/whitespace_before_comment.rs
+++ b/crates/ruff/src/rules/pycodestyle/rules/logical_lines/whitespace_before_comment.rs
@@ -4,6 +4,7 @@ use ruff_diagnostics::Violation;
 use ruff_macros::{derive_message_formats, violation};
 use ruff_python_ast::source_code::Locator;
 use ruff_python_ast::token_kind::TokenKind;
+use ruff_python_whitespace::PythonWhitespace;
 
 use crate::checkers::logical_lines::LogicalLinesContext;
 use crate::rules::pycodestyle::rules::logical_lines::LogicalLine;
@@ -156,7 +157,7 @@ pub(crate) fn whitespace_before_comment(
             ));
             let token_text = locator.slice(range);
 
-            let is_inline_comment = !line_text.trim().is_empty();
+            let is_inline_comment = !line_text.trim_whitespace().is_empty();
             if is_inline_comment {
                 if range.start() - prev_end < "  ".text_len() {
                     context.push(
diff --git a/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_class.rs b/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_class.rs
index 54e5c887e7..17d106d550 100644
--- a/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_class.rs
+++ b/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_class.rs
@@ -4,7 +4,7 @@ use rustpython_parser::ast::Ranged;
 use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix};
 use ruff_macros::{derive_message_formats, violation};
 use ruff_python_semantic::definition::{Definition, Member, MemberKind};
-use ruff_python_whitespace::{UniversalNewlineIterator, UniversalNewlines};
+use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator, UniversalNewlines};
 
 use crate::checkers::ast::Checker;
 use crate::docstrings::Docstring;
@@ -133,10 +133,9 @@ pub(crate) fn blank_before_after_class(checker: &mut Checker, docstring: &Docstr
             .locator
             .slice(TextRange::new(docstring.end(), stmt.end()));
 
-        let all_blank_after = after
-            .universal_newlines()
-            .skip(1)
-            .all(|line| line.trim().is_empty() || line.trim_start().starts_with('#'));
+        let all_blank_after = after.universal_newlines().skip(1).all(|line| {
+            line.trim_whitespace().is_empty() || line.trim_whitespace_start().starts_with('#')
+        });
         if all_blank_after {
             return;
         }
diff --git a/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_function.rs b/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_function.rs
index f7b4a5182a..d8c52c5b66 100644
--- a/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_function.rs
+++ b/crates/ruff/src/rules/pydocstyle/rules/blank_before_after_function.rs
@@ -6,7 +6,7 @@ use rustpython_parser::ast::Ranged;
 use ruff_diagnostics::{AlwaysAutofixableViolation, Diagnostic, Edit, Fix};
 use ruff_macros::{derive_message_formats, violation};
 use ruff_python_semantic::definition::{Definition, Member, MemberKind};
-use ruff_python_whitespace::{UniversalNewlineIterator, UniversalNewlines};
+use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator, UniversalNewlines};
 
 use crate::checkers::ast::Checker;
 use crate::docstrings::Docstring;
@@ -102,10 +102,9 @@ pub(crate) fn blank_before_after_function(checker: &mut Checker, docstring: &Doc
             .slice(TextRange::new(docstring.end(), stmt.end()));
 
         // If the docstring is only followed by blank and commented lines, abort.
-        let all_blank_after = after
-            .universal_newlines()
-            .skip(1)
-            .all(|line| line.trim().is_empty() || line.trim_start().starts_with('#'));
+        let all_blank_after = after.universal_newlines().skip(1).all(|line| {
+            line.trim_whitespace().is_empty() || line.trim_whitespace_start().starts_with('#')
+        });
         if all_blank_after {
             return;
         }
@@ -129,7 +128,7 @@ pub(crate) fn blank_before_after_function(checker: &mut Checker, docstring: &Doc
         // Avoid violations for blank lines followed by inner functions or classes.
         if blank_lines_after == 1
             && lines
-                .find(|line| !line.trim_start().starts_with('#'))
+                .find(|line| !line.trim_whitespace_start().starts_with('#'))
                 .map_or(false, |line| INNER_FUNCTION_OR_CLASS_REGEX.is_match(&line))
         {
             return;
diff --git a/crates/ruff_python_ast/src/helpers.rs b/crates/ruff_python_ast/src/helpers.rs
index 627b9c2f09..6b4a2ddbff 100644
--- a/crates/ruff_python_ast/src/helpers.rs
+++ b/crates/ruff_python_ast/src/helpers.rs
@@ -4,7 +4,7 @@ use std::path::Path;
 use itertools::Itertools;
 use log::error;
 use num_traits::Zero;
-use ruff_python_whitespace::UniversalNewlineIterator;
+use ruff_python_whitespace::{PythonWhitespace, UniversalNewlineIterator};
 use ruff_text_size::{TextRange, TextSize};
 use rustc_hash::{FxHashMap, FxHashSet};
 use rustpython_parser::ast::{
@@ -1031,7 +1031,7 @@ pub fn trailing_lines_end(stmt: &Stmt, locator: &Locator) -> TextSize {
     let rest = &locator.contents()[usize::from(line_end)..];
 
     UniversalNewlineIterator::with_offset(rest, line_end)
-        .take_while(|line| line.trim().is_empty())
+        .take_while(|line| line.trim_whitespace().is_empty())
         .last()
         .map_or(line_end, |l| l.full_end())
 }
diff --git a/crates/ruff_python_formatter/src/comments/placement.rs b/crates/ruff_python_formatter/src/comments/placement.rs
index b5e99495a1..c1c2b8df1f 100644
--- a/crates/ruff_python_formatter/src/comments/placement.rs
+++ b/crates/ruff_python_formatter/src/comments/placement.rs
@@ -4,7 +4,7 @@ use crate::trivia::{SimpleTokenizer, TokenKind};
 use ruff_python_ast::node::AnyNodeRef;
 use ruff_python_ast::source_code::Locator;
 use ruff_python_ast::whitespace;
-use ruff_python_whitespace::UniversalNewlines;
+use ruff_python_whitespace::{PythonWhitespace, UniversalNewlines};
 use ruff_text_size::{TextRange, TextSize};
 use rustpython_parser::ast::Ranged;
 use std::cmp::Ordering;
@@ -986,7 +986,7 @@ fn max_empty_lines(contents: &str) -> usize {
     let mut max_empty_lines = 0;
 
     for line in contents.universal_newlines().skip(1) {
-        if line.trim().is_empty() {
+        if line.trim_whitespace().is_empty() {
             empty_lines += 1;
         } else {
             max_empty_lines = max_empty_lines.max(empty_lines);
diff --git a/crates/ruff_python_whitespace/src/whitespace.rs b/crates/ruff_python_whitespace/src/whitespace.rs
index 00139d3640..8bb161aba3 100644
--- a/crates/ruff_python_whitespace/src/whitespace.rs
+++ b/crates/ruff_python_whitespace/src/whitespace.rs
@@ -13,3 +13,31 @@ pub fn leading_indentation(line: &str) -> &str {
     line.find(|char: char| !is_python_whitespace(char))
         .map_or(line, |index| &line[..index])
 }
+
+pub trait PythonWhitespace {
+    /// Like `str::trim()`, but only removes whitespace characters that Python considers
+    /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
+    fn trim_whitespace(&self) -> &Self;
+
+    /// Like `str::trim_start()`, but only removes whitespace characters that Python considers
+    /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
+    fn trim_whitespace_start(&self) -> &Self;
+
+    /// Like `str::trim_end()`, but only removes whitespace characters that Python considers
+    /// to be [whitespace](https://docs.python.org/3/reference/lexical_analysis.html#whitespace-between-tokens).
+    fn trim_whitespace_end(&self) -> &Self;
+}
+
+impl PythonWhitespace for str {
+    fn trim_whitespace(&self) -> &Self {
+        self.trim_matches(is_python_whitespace)
+    }
+
+    fn trim_whitespace_start(&self) -> &Self {
+        self.trim_start_matches(is_python_whitespace)
+    }
+
+    fn trim_whitespace_end(&self) -> &Self {
+        self.trim_end_matches(is_python_whitespace)
+    }
+}