From 23084c57864d4577ea655637ae13d2919fc5d0b7 Mon Sep 17 00:00:00 2001 From: magic-akari Date: Sun, 7 Dec 2025 05:47:35 +0800 Subject: [PATCH 1/4] Remove `regex` dependency from `ruff_python_formatter` --- crates/ruff_python_formatter/Cargo.toml | 1 - .../src/string/docstring.rs | 254 ++++++++++++++---- 2 files changed, 206 insertions(+), 49 deletions(-) diff --git a/crates/ruff_python_formatter/Cargo.toml b/crates/ruff_python_formatter/Cargo.toml index 95ca60ee10..56b9792c6e 100644 --- a/crates/ruff_python_formatter/Cargo.toml +++ b/crates/ruff_python_formatter/Cargo.toml @@ -29,7 +29,6 @@ clap = { workspace = true } countme = { workspace = true } itertools = { workspace = true } memchr = { workspace = true } -regex = { workspace = true } rustc-hash = { workspace = true } salsa = { workspace = true } serde = { workspace = true, optional = true } diff --git a/crates/ruff_python_formatter/src/string/docstring.rs b/crates/ruff_python_formatter/src/string/docstring.rs index ad357fc65e..01317cc798 100644 --- a/crates/ruff_python_formatter/src/string/docstring.rs +++ b/crates/ruff_python_formatter/src/string/docstring.rs @@ -3,11 +3,9 @@ #![allow(clippy::doc_markdown)] use std::cmp::Ordering; -use std::sync::LazyLock; use std::{borrow::Cow, collections::VecDeque}; use itertools::Itertools; -use regex::Regex; use ruff_formatter::printer::SourceMapGeneration; use ruff_python_ast::{AnyStringFlags, StringFlags, str::Quote}; @@ -1073,13 +1071,38 @@ impl<'src> CodeExampleRst<'src> { // [directives]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#directives // [Pygments lexer names]: https://pygments.org/docs/lexers/ // [code-block]: https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#directive-code-block - static DIRECTIVE_START: LazyLock<Regex> = LazyLock::new(|| { - Regex::new( - r"(?m)^\s*\.\. 
\s*(?i:code-block|sourcecode)::\s*(?i:python|py|python3|py3)$", ) .unwrap() }); - if !DIRECTIVE_START.is_match(original.line) { + fn is_rst_directive_start(line: &str) -> bool { + let trimmed = line.trim_start(); + + // Must start with ".. " + let Some(rest) = trimmed.strip_prefix(".. ") else { + return false; + }; + let rest = rest.trim_start(); + + // Match "code-block" or "sourcecode" (case-insensitive) + let Some(rest) = strip_prefix_ignore_ascii_case(rest, "code-block") + .or_else(|| strip_prefix_ignore_ascii_case(rest, "sourcecode")) + else { + return false; + }; + + // Must be followed by "::" + let Some(rest) = rest.strip_prefix("::") else { + return false; + }; + let rest = rest.trim_start(); + + // Match Python language identifier (case-insensitive) + let Some(rest) = strip_python_lang_prefix(rest) else { + return false; + }; + + // Line must end with only whitespace after the language identifier + rest.trim().is_empty() + } + + if !is_rst_directive_start(original.line) { return None; } Some(CodeExampleRst { @@ -1318,50 +1341,13 @@ impl<'src> CodeExampleMarkdown<'src> { /// /// [fenced code block]: https://spec.commonmark.org/0.30/#fenced-code-blocks fn new(original: InputDocstringLine<'src>) -> Option<CodeExampleMarkdown<'src>> { - static FENCE_START: LazyLock<Regex> = LazyLock::new(|| { - Regex::new( - r"(?xm) - ^ - (?: - # In the backtick case, info strings (following the fence) - # cannot contain backticks themselves, since it would - # introduce ambiguity with parsing inline code. In other - # words, if we didn't specifically exclude matching ` - # in the info string for backtick fences, then we might - # erroneously consider something to be a code fence block - # that is actually inline code. - # - # NOTE: The `ticklang` and `tildlang` capture groups are - # currently unused, but there was some discussion about not - # assuming unlabeled blocks were Python. 
At the time of - # writing, we do assume unlabeled blocks are Python, but - # one could inspect the `ticklang` and `tildlang` capture - # groups to determine whether the block is labeled or not. - (?<ticks>```+)(?:\s*(?<ticklang>(?i:python|py|python3|py3))[^`]*)? - | - (?<tilds>~~~+)(?:\s*(?<tildlang>(?i:python|py|python3|py3))\p{any}*)? - ) - $ - ", - ) - .unwrap() - }); - let (opening_fence_indent, rest) = indent_with_suffix(original.line); // Quit quickly in the vast majority of cases. if !rest.starts_with("```") && !rest.starts_with("~~~") { return None; } - let caps = FENCE_START.captures(rest)?; - let (fence_kind, fence_len) = if let Some(ticks) = caps.name("ticks") { - (MarkdownFenceKind::Backtick, ticks.as_str().chars().count()) - } else { - let tildes = caps - .name("tilds") - .expect("no ticks means it must be tildes"); - (MarkdownFenceKind::Tilde, tildes.as_str().chars().count()) - }; + let (fence_kind, fence_len) = Self::parse_markdown_fence_start(rest)?; Some(CodeExampleMarkdown { lines: vec![], opening_fence_indent: Indentation::from_str(opening_fence_indent), @@ -1481,6 +1467,58 @@ impl<'src> CodeExampleMarkdown<'src> { fn into_reset_action(self) -> CodeExampleAddAction<'src> { CodeExampleAddAction::Reset { code: self.lines } } + + /// Parses a Markdown fenced code block opening line. + /// + /// Returns the fence type and length if the line is a valid Python code fence. + /// Returns `None` if not a valid Python code fence. + /// + /// In the backtick case, info strings (following the fence) cannot contain + /// backticks themselves, since it would introduce ambiguity with parsing + /// inline code. In other words, if we didn't specifically exclude matching + /// backticks in the info string for backtick fences, then we might + /// erroneously consider something to be a code fence block that is actually + /// inline code. 
+ fn parse_markdown_fence_start(line: &str) -> Option<(MarkdownFenceKind, usize)> { + // Check if it's backticks or tildes + let (fence_char, kind) = if line.starts_with('`') { + ('`', MarkdownFenceKind::Backtick) + } else if line.starts_with('~') { + ('~', MarkdownFenceKind::Tilde) + } else { + return None; + }; + + // Count consecutive fence characters (need at least 3) + let fence_len = line.bytes().take_while(|&b| b == fence_char as u8).count(); + if fence_len < 3 { + return None; + } + + // Get content after the fence + let rest = &line[fence_len..]; + + // For backtick fences, info string cannot contain backticks + if fence_char == '`' && rest.contains('`') { + return None; + } + + // Check the language identifier + let info_string = rest.trim(); + + // Empty info string is treated as Python (matches original implementation) + if info_string.is_empty() { + return Some((kind, fence_len)); + } + + // Check if it starts with a Python language identifier using state machine + // The state machine only matches if followed by whitespace or end of string + if strip_python_lang_prefix(info_string).is_some() { + return Some((kind, fence_len)); + } + + None + } } /// The kind of fence used in a Markdown code block. @@ -1897,9 +1935,90 @@ fn is_rst_option(line: &str) -> bool { .any(|ch| ch == ':') } +/// Case-insensitive ASCII prefix stripping. +/// +/// If `s` starts with `prefix` (case-insensitive), returns the remainder of `s` +/// after the prefix. Otherwise, returns `None`. +fn strip_prefix_ignore_ascii_case<'a>(s: &'a str, prefix: &str) -> Option<&'a str> { + if s.len() < prefix.len() { + return None; + } + if s[..prefix.len()].eq_ignore_ascii_case(prefix) { + Some(&s[prefix.len()..]) + } else { + None + } +} + +/// Matches a Python language identifier using a state machine. +/// +/// Matches "py", "py3", "python", or "python3" (case-insensitive) and returns +/// the remainder of the string after the match. 
This is more efficient than +/// multiple `strip_prefix_ignore_ascii_case` calls as it traverses the input +/// only once. +/// +/// State machine structure: +/// ```text +/// Start -> 'p' -> 'y' -> (accept "py") +/// -> '3' -> (accept "py3") +/// -> 't' -> 'h' -> 'o' -> 'n' -> (accept "python") +/// -> '3' -> (accept "python3") +/// ``` +fn strip_python_lang_prefix(s: &str) -> Option<&str> { + let bytes = s.as_bytes(); + + // State 0-1: expect "py" + if !bytes.get(..2)?.eq_ignore_ascii_case(b"py") { + return None; + } + + // State 2: "py" matched - check what's next + match bytes.get(2).map(u8::to_ascii_lowercase) { + // "py" followed by end or whitespace -> accept "py" + None => return Some(&s[2..]), + Some(b) if b.is_ascii_whitespace() => return Some(&s[2..]), + + // "py3" -> accept "py3" + Some(b'3') => { + return match bytes.get(3) { + None => Some(&s[3..]), + Some(b) if b.is_ascii_whitespace() => Some(&s[3..]), + Some(_) => None, + }; + } + + // Continue to "python" - check "thon" suffix + Some(b't') => { + if !bytes.get(3..6)?.eq_ignore_ascii_case(b"hon") { + return None; + } + } + + // Invalid + Some(_) => return None, + } + + // State 6: "python" matched - check what's next + match bytes.get(6).map(u8::to_ascii_lowercase) { + // "python" followed by end or whitespace -> accept "python" + None => Some(&s[6..]), + Some(b) if b.is_ascii_whitespace() => Some(&s[6..]), + + // "python3" -> accept "python3" + Some(b'3') => match bytes.get(7) { + None => Some(&s[7..]), + Some(b) if b.is_ascii_whitespace() => Some(&s[7..]), + Some(_) => None, + }, + + // Invalid (e.g., "pythonx") + Some(_) => None, + } +} + #[cfg(test)] mod tests { - use crate::string::docstring::Indentation; + use crate::string::docstring::{Indentation, strip_python_lang_prefix}; #[test] fn indentation_like_black() { @@ -1908,4 +2027,43 @@ mod tests { assert_eq!(Indentation::from_str("\t\t\t").columns(), 24); assert_eq!(Indentation::from_str(" ").columns(), 4); } + + #[test] + fn 
python_lang_state_machine() { + // Valid matches - exact + assert_eq!(strip_python_lang_prefix("py"), Some("")); + assert_eq!(strip_python_lang_prefix("py3"), Some("")); + assert_eq!(strip_python_lang_prefix("python"), Some("")); + assert_eq!(strip_python_lang_prefix("python3"), Some("")); + + // Valid matches - case insensitive + assert_eq!(strip_python_lang_prefix("PY"), Some("")); + assert_eq!(strip_python_lang_prefix("Py3"), Some("")); + assert_eq!(strip_python_lang_prefix("Python"), Some("")); + assert_eq!(strip_python_lang_prefix("PYTHON3"), Some("")); + assert_eq!(strip_python_lang_prefix("PyThOn"), Some("")); + + // Valid matches - with trailing whitespace + assert_eq!(strip_python_lang_prefix("py "), Some(" ")); + assert_eq!(strip_python_lang_prefix("python\t"), Some("\t")); + assert_eq!(strip_python_lang_prefix("python3 extra"), Some(" extra")); + + // Invalid - prefix only + assert_eq!(strip_python_lang_prefix("p"), None); + assert_eq!(strip_python_lang_prefix("pyt"), None); + assert_eq!(strip_python_lang_prefix("pyth"), None); + assert_eq!(strip_python_lang_prefix("pytho"), None); + + // Invalid - no word boundary + assert_eq!(strip_python_lang_prefix("pyx"), None); + assert_eq!(strip_python_lang_prefix("py33"), None); + assert_eq!(strip_python_lang_prefix("pythonx"), None); + assert_eq!(strip_python_lang_prefix("python33"), None); + assert_eq!(strip_python_lang_prefix("python3x"), None); + + // Invalid - completely different + assert_eq!(strip_python_lang_prefix("rust"), None); + assert_eq!(strip_python_lang_prefix(""), None); + assert_eq!(strip_python_lang_prefix("javascript"), None); + } } From 59ccc11e610eb80a4fde2b20b64839f1686645db Mon Sep 17 00:00:00 2001 From: magic-akari Date: Sun, 7 Dec 2025 06:01:46 +0800 Subject: [PATCH 2/4] fix: typos issue --- crates/ruff_python_formatter/src/string/docstring.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/ruff_python_formatter/src/string/docstring.rs 
b/crates/ruff_python_formatter/src/string/docstring.rs index 01317cc798..2f377f5510 100644 --- a/crates/ruff_python_formatter/src/string/docstring.rs +++ b/crates/ruff_python_formatter/src/string/docstring.rs @@ -2052,7 +2052,7 @@ mod tests { assert_eq!(strip_python_lang_prefix("p"), None); assert_eq!(strip_python_lang_prefix("pyt"), None); assert_eq!(strip_python_lang_prefix("pyth"), None); - assert_eq!(strip_python_lang_prefix("pytho"), None); + assert_eq!(strip_python_lang_prefix("pytho"), None); // # spellchecker:disable-line // Invalid - no word boundary assert_eq!(strip_python_lang_prefix("pyx"), None); From 3d93408526c9ad1c22c2dc0d2513df9e65d6eaea Mon Sep 17 00:00:00 2001 From: magic-akari Date: Sun, 7 Dec 2025 06:49:23 +0800 Subject: [PATCH 3/4] Address review feedback --- crates/ruff_python_formatter/src/string/docstring.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/crates/ruff_python_formatter/src/string/docstring.rs b/crates/ruff_python_formatter/src/string/docstring.rs index 2f377f5510..6d002dbebd 100644 --- a/crates/ruff_python_formatter/src/string/docstring.rs +++ b/crates/ruff_python_formatter/src/string/docstring.rs @@ -1098,8 +1098,8 @@ impl<'src> CodeExampleRst<'src> { return false; }; - // Line must end with only whitespace after the language identifier - rest.trim().is_empty() + // Line must end immediately after the language identifier (no trailing whitespace) + rest.is_empty() } if !is_rst_directive_start(original.line) { @@ -1511,8 +1511,11 @@ impl<'src> CodeExampleMarkdown<'src> { return Some((kind, fence_len)); } - // Check if it starts with a Python language identifier using state machine - // The state machine only matches if followed by whitespace or end of string + // Check if it starts with a Python language identifier using state machine. + // NOTE: This is stricter than the original regex which matched any info string + // starting with py/python (e.g., "python-repl" would have matched). 
We now require + // an exact language identifier followed by whitespace or end of string, which is + // more conservative and avoids matching non-Python formats like "pycon" or "python-repl". if strip_python_lang_prefix(info_string).is_some() { return Some((kind, fence_len)); } From 4e88adbdc46239ed236c9368f59fb9be2f4791f5 Mon Sep 17 00:00:00 2001 From: magic-akari Date: Mon, 8 Dec 2025 19:44:54 +0800 Subject: [PATCH 4/4] Address review feedback --- .../src/string/docstring.rs | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/crates/ruff_python_formatter/src/string/docstring.rs b/crates/ruff_python_formatter/src/string/docstring.rs index 6d002dbebd..3162374be0 100644 --- a/crates/ruff_python_formatter/src/string/docstring.rs +++ b/crates/ruff_python_formatter/src/string/docstring.rs @@ -1498,14 +1498,13 @@ impl<'src> CodeExampleMarkdown<'src> { // Get content after the fence let rest = &line[fence_len..]; + let info_string = rest.trim(); + // For backtick fences, info string cannot contain backticks - if fence_char == '`' && rest.contains('`') { + if fence_char == '`' && info_string.contains('`') { return None; } - // Check the language identifier - let info_string = rest.trim(); - // Empty info string is treated as Python (matches original implementation) if info_string.is_empty() { return Some((kind, fence_len)); @@ -1943,11 +1942,16 @@ fn is_rst_option(line: &str) -> bool { /// If `s` starts with `prefix` (case-insensitive), returns the remainder of `s` /// after the prefix. Otherwise, returns `None`. fn strip_prefix_ignore_ascii_case<'a>(s: &'a str, prefix: &str) -> Option<&'a str> { - if s.len() < prefix.len() { - return None; - } - if s[..prefix.len()].eq_ignore_ascii_case(prefix) { - Some(&s[prefix.len()..]) + let prefix_len = prefix.len(); + // Use byte slicing to avoid panic on non-ASCII strings. + // This is safe because `prefix` is always ASCII in our usage. + if s.as_bytes() + .get(..prefix_len)? 
+ .eq_ignore_ascii_case(prefix.as_bytes()) + { + // SAFETY: prefix_len is guaranteed to be on a valid UTF-8 boundary + // because we only call this function with ASCII prefixes. + Some(&s[prefix_len..]) } else { None } @@ -1975,17 +1979,22 @@ fn strip_python_lang_prefix(s: &str) -> Option<&str> { return None; } + // SAFETY for all `s.get(n..)` calls below: + // We only slice after verifying that the preceding bytes are ASCII characters. + // Since ASCII characters are single-byte in UTF-8, slicing at these indices + // is guaranteed to be on valid UTF-8 boundaries. + // State 2: "py" matched - check what's next match bytes.get(2).map(u8::to_ascii_lowercase) { // "py" followed by end or whitespace -> accept "py" - None => return Some(&s[2..]), - Some(b) if b.is_ascii_whitespace() => return Some(&s[2..]), + None => return s.get(2..), + Some(b) if b.is_ascii_whitespace() => return s.get(2..), // "py3" -> accept "py3" Some(b'3') => { return match bytes.get(3) { - None => Some(&s[3..]), - Some(b) if b.is_ascii_whitespace() => Some(&s[3..]), + None => s.get(3..), + Some(b) if b.is_ascii_whitespace() => s.get(3..), Some(_) => None, }; } @@ -2004,13 +2013,13 @@ fn strip_python_lang_prefix(s: &str) -> Option<&str> { // State 6: "python" matched - check what's next match bytes.get(6).map(u8::to_ascii_lowercase) { // "python" followed by end or whitespace -> accept "python" - None => Some(&s[6..]), - Some(b) if b.is_ascii_whitespace() => Some(&s[6..]), + None => s.get(6..), + Some(b) if b.is_ascii_whitespace() => s.get(6..), // "python3" -> accept "python3" Some(b'3') => match bytes.get(7) { - None => Some(&s[7..]), - Some(b) if b.is_ascii_whitespace() => Some(&s[7..]), + None => s.get(7..), + Some(b) if b.is_ascii_whitespace() => s.get(7..), Some(_) => None, },