diff --git a/crates/ruff_python_formatter/Cargo.toml b/crates/ruff_python_formatter/Cargo.toml
index d285f88d22..443fe72ce8 100644
--- a/crates/ruff_python_formatter/Cargo.toml
+++ b/crates/ruff_python_formatter/Cargo.toml
@@ -29,7 +29,6 @@
 clap = { workspace = true }
 countme = { workspace = true }
 itertools = { workspace = true }
 memchr = { workspace = true }
-regex = { workspace = true }
 rustc-hash = { workspace = true }
 salsa = { workspace = true }
 serde = { workspace = true, optional = true }
diff --git a/crates/ruff_python_formatter/src/string/docstring.rs b/crates/ruff_python_formatter/src/string/docstring.rs
index c4554175d5..0a3e5dd8a3 100644
--- a/crates/ruff_python_formatter/src/string/docstring.rs
+++ b/crates/ruff_python_formatter/src/string/docstring.rs
@@ -3,11 +3,9 @@
 #![allow(clippy::doc_markdown)]
 
 use std::cmp::Ordering;
-use std::sync::LazyLock;
 use std::{borrow::Cow, collections::VecDeque};
 
 use itertools::Itertools;
-use regex::Regex;
 
 use ruff_formatter::printer::SourceMapGeneration;
 use ruff_python_ast::{AnyStringFlags, StringFlags, str::Quote};
@@ -1073,13 +1071,38 @@
         // [directives]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#directives
         // [Pygments lexer names]: https://pygments.org/docs/lexers/
         // [code-block]: https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#directive-code-block
-        static DIRECTIVE_START: LazyLock<Regex> = LazyLock::new(|| {
-            Regex::new(
-                r"(?m)^\s*\.\. \s*(?i:code-block|sourcecode)::\s*(?i:python|py|python3|py3)$",
-            )
-            .unwrap()
-        });
-        if !DIRECTIVE_START.is_match(original.line) {
+        fn is_rst_directive_start(line: &str) -> bool {
+            let trimmed = line.trim_start();
+
+            // Must start with ".. "
+            let Some(rest) = trimmed.strip_prefix(".. ") else {
+                return false;
+            };
+            let rest = rest.trim_start();
+
+            // Match "code-block" or "sourcecode" (case-insensitive)
+            let Some(rest) = strip_prefix_ignore_ascii_case(rest, "code-block")
+                .or_else(|| strip_prefix_ignore_ascii_case(rest, "sourcecode"))
+            else {
+                return false;
+            };
+
+            // Must be followed by "::"
+            let Some(rest) = rest.strip_prefix("::") else {
+                return false;
+            };
+            let rest = rest.trim_start();
+
+            // Match a Python language identifier (case-insensitive)
+            let Some(rest) = strip_python_lang_prefix(rest) else {
+                return false;
+            };
+
+            // The line must end immediately after the language identifier (no trailing whitespace)
+            rest.is_empty()
+        }
+
+        if !is_rst_directive_start(original.line) {
             return None;
         }
         Some(CodeExampleRst {
@@ -1318,50 +1341,13 @@ impl<'src> CodeExampleMarkdown<'src> {
     ///
     /// [fenced code block]: https://spec.commonmark.org/0.30/#fenced-code-blocks
     fn new(original: InputDocstringLine<'src>) -> Option<CodeExampleMarkdown<'src>> {
-        static FENCE_START: LazyLock<Regex> = LazyLock::new(|| {
-            Regex::new(
-                r"(?xm)
-                ^
-                (?:
-                    # In the backtick case, info strings (following the fence)
-                    # cannot contain backticks themselves, since it would
-                    # introduce ambiguity with parsing inline code. In other
-                    # words, if we didn't specifically exclude matching `
-                    # in the info string for backtick fences, then we might
-                    # erroneously consider something to be a code fence block
-                    # that is actually inline code.
-                    #
-                    # NOTE: The `ticklang` and `tildlang` capture groups are
-                    # currently unused, but there was some discussion about not
-                    # assuming unlabeled blocks were Python. At the time of
-                    # writing, we do assume unlabeled blocks are Python, but
-                    # one could inspect the `ticklang` and `tildlang` capture
-                    # groups to determine whether the block is labeled or not.
-                    (?<ticks>```+)(?:\s*(?<ticklang>(?i:python|py|python3|py3))[^`]*)?
-                    |
-                    (?<tilds>~~~+)(?:\s*(?<tildlang>(?i:python|py|python3|py3))\p{any}*)?
-                )
-                $
-                ",
-            )
-            .unwrap()
-        });
-
         let (opening_fence_indent, rest) = indent_with_suffix(original.line);
         // Quit quickly in the vast majority of cases.
         if !rest.starts_with("```") && !rest.starts_with("~~~") {
             return None;
         }
-        let caps = FENCE_START.captures(rest)?;
-        let (fence_kind, fence_len) = if let Some(ticks) = caps.name("ticks") {
-            (MarkdownFenceKind::Backtick, ticks.as_str().chars().count())
-        } else {
-            let tildes = caps
-                .name("tilds")
-                .expect("no ticks means it must be tildes");
-            (MarkdownFenceKind::Tilde, tildes.as_str().chars().count())
-        };
+        let (fence_kind, fence_len) = Self::parse_markdown_fence_start(rest)?;
         Some(CodeExampleMarkdown {
             lines: vec![],
             opening_fence_indent: Indentation::from_str(opening_fence_indent),
@@ -1481,6 +1467,60 @@
     fn into_reset_action(self) -> CodeExampleAddAction<'src> {
         CodeExampleAddAction::Reset { code: self.lines }
     }
+
+    /// Parses the opening line of a Markdown fenced code block.
+    ///
+    /// Returns the fence kind and length if the line is a valid Python code
+    /// fence, and `None` otherwise.
+    ///
+    /// In the backtick case, info strings (following the fence) cannot contain
+    /// backticks themselves, since it would introduce ambiguity with parsing
+    /// inline code. In other words, if we didn't specifically exclude matching
+    /// backticks in the info string for backtick fences, then we might
+    /// erroneously consider something to be a code fence block that is actually
+    /// inline code.
+    fn parse_markdown_fence_start(line: &str) -> Option<(MarkdownFenceKind, usize)> {
+        // Determine whether the fence uses backticks or tildes.
+        let (fence_char, kind) = if line.starts_with('`') {
+            ('`', MarkdownFenceKind::Backtick)
+        } else if line.starts_with('~') {
+            ('~', MarkdownFenceKind::Tilde)
+        } else {
+            return None;
+        };
+
+        // Count consecutive fence characters (need at least 3).
+        let fence_len = line.bytes().take_while(|&b| b == fence_char as u8).count();
+        if fence_len < 3 {
+            return None;
+        }
+
+        // Get the content after the fence.
+        let rest = &line[fence_len..];
+
+        let info_string = rest.trim();
+
+        // For backtick fences, the info string cannot contain backticks.
+        if fence_char == '`' && info_string.contains('`') {
+            return None;
+        }
+
+        // An empty info string is treated as Python (matching the original
+        // implementation).
+        if info_string.is_empty() {
+            return Some((kind, fence_len));
+        }
+
+        // Check whether the info string starts with a Python language
+        // identifier, using the state machine.
+        // NOTE: This is stricter than the original regex, which matched any
+        // info string starting with py/python (e.g., "python-repl" would have
+        // matched). We now require an exact language identifier followed by
+        // whitespace or the end of the string, which is more conservative and
+        // avoids matching non-Python formats like "pycon" or "python-repl".
+        if strip_python_lang_prefix(info_string).is_some() {
+            return Some((kind, fence_len));
+        }
+
+        None
+    }
 }
 
 /// The kind of fence used in a Markdown code block.
@@ -1897,9 +1937,100 @@ fn is_rst_option(line: &str) -> bool {
         .any(|ch| ch == ':')
 }
 
+/// Case-insensitive ASCII prefix stripping.
+///
+/// If `s` starts with `prefix` (case-insensitive), returns the remainder of `s`
+/// after the prefix. Otherwise, returns `None`.
+fn strip_prefix_ignore_ascii_case<'a>(s: &'a str, prefix: &str) -> Option<&'a str> {
+    let prefix_len = prefix.len();
+    // Use checked byte slicing so that an `s` shorter than `prefix` yields
+    // `None` instead of panicking.
+    if s.as_bytes()
+        .get(..prefix_len)?
+        .eq_ignore_ascii_case(prefix.as_bytes())
+    {
+        // `prefix_len` is guaranteed to be on a valid UTF-8 boundary: `prefix`
+        // is always ASCII in our usage, and bytes that compare equal to ASCII
+        // bytes (ignoring ASCII case) are themselves ASCII.
+        Some(&s[prefix_len..])
+    } else {
+        None
+    }
+}
+
+/// Matches a Python language identifier using a state machine.
+///
+/// Matches "py", "py3", "python", or "python3" (case-insensitive) and returns
+/// the remainder of the string after the match. The identifier must be
+/// followed by ASCII whitespace or the end of the string; otherwise `None` is
+/// returned. This is more efficient than multiple
+/// `strip_prefix_ignore_ascii_case` calls because it traverses the input only
+/// once.
+///
+/// State machine structure:
+/// ```text
+/// Start -> 'p' -> 'y' -> (accept "py")
+///                     -> '3' -> (accept "py3")
+///                     -> 't' -> 'h' -> 'o' -> 'n' -> (accept "python")
+///                                                 -> '3' -> (accept "python3")
+/// ```
+fn strip_python_lang_prefix(s: &str) -> Option<&str> {
+    let bytes = s.as_bytes();
+
+    // States 0-1: expect "py"
+    if !bytes.get(..2)?.eq_ignore_ascii_case(b"py") {
+        return None;
+    }
+
+    // For all `s.get(n..)` calls below: we only slice after verifying that
+    // the preceding bytes are ASCII characters. Since ASCII characters are
+    // single-byte in UTF-8, slicing at these indices is guaranteed to be on
+    // valid UTF-8 boundaries.
+
+    // State 2: "py" matched - check what's next
+    match bytes.get(2).map(u8::to_ascii_lowercase) {
+        // "py" followed by end of input or whitespace -> accept "py"
+        None => return s.get(2..),
+        Some(b) if b.is_ascii_whitespace() => return s.get(2..),
+
+        // "py3" -> accept "py3"
+        Some(b'3') => {
+            return match bytes.get(3) {
+                None => s.get(3..),
+                Some(b) if b.is_ascii_whitespace() => s.get(3..),
+                Some(_) => None,
+            };
+        }
+
+        // Continue to "python" - check the "hon" suffix
+        Some(b't') => {
+            if !bytes.get(3..6)?.eq_ignore_ascii_case(b"hon") {
+                return None;
+            }
+        }
+
+        // Invalid
+        Some(_) => return None,
+    }
+
+    // State 6: "python" matched - check what's next
+    match bytes.get(6).map(u8::to_ascii_lowercase) {
+        // "python" followed by end of input or whitespace -> accept "python"
+        None => s.get(6..),
+        Some(b) if b.is_ascii_whitespace() => s.get(6..),
+
+        // "python3" -> accept "python3"
+        Some(b'3') => match bytes.get(7) {
+            None => s.get(7..),
+            Some(b) if b.is_ascii_whitespace() => s.get(7..),
+            Some(_) => None,
+        },
+
+        // Invalid (e.g., "pythonx")
+        Some(_) => None,
+    }
+}
+
 #[cfg(test)]
 mod tests {
-    use crate::string::docstring::Indentation;
+    use crate::string::docstring::{Indentation, strip_python_lang_prefix};
 
     #[test]
     fn indentation_like_black() {
@@ -1908,4 +2039,43 @@ mod tests {
         assert_eq!(Indentation::from_str("\t\t\t").columns(), 24);
         assert_eq!(Indentation::from_str("    ").columns(), 4);
     }
+
+    #[test]
+    fn python_lang_state_machine() {
+        // Valid matches - exact
+        assert_eq!(strip_python_lang_prefix("py"), Some(""));
+        assert_eq!(strip_python_lang_prefix("py3"), Some(""));
+        assert_eq!(strip_python_lang_prefix("python"), Some(""));
+        assert_eq!(strip_python_lang_prefix("python3"), Some(""));
+
+        // Valid matches - case-insensitive
+        assert_eq!(strip_python_lang_prefix("PY"), Some(""));
+        assert_eq!(strip_python_lang_prefix("Py3"), Some(""));
+        assert_eq!(strip_python_lang_prefix("Python"), Some(""));
+        assert_eq!(strip_python_lang_prefix("PYTHON3"), Some(""));
assert_eq!(strip_python_lang_prefix("PyThOn"), Some("")); + + // Valid matches - with trailing whitespace + assert_eq!(strip_python_lang_prefix("py "), Some(" ")); + assert_eq!(strip_python_lang_prefix("python\t"), Some("\t")); + assert_eq!(strip_python_lang_prefix("python3 extra"), Some(" extra")); + + // Invalid - prefix only + assert_eq!(strip_python_lang_prefix("p"), None); + assert_eq!(strip_python_lang_prefix("pyt"), None); + assert_eq!(strip_python_lang_prefix("pyth"), None); + assert_eq!(strip_python_lang_prefix("pytho"), None); // # spellchecker:disable-line + + // Invalid - no word boundary + assert_eq!(strip_python_lang_prefix("pyx"), None); + assert_eq!(strip_python_lang_prefix("py33"), None); + assert_eq!(strip_python_lang_prefix("pythonx"), None); + assert_eq!(strip_python_lang_prefix("python33"), None); + assert_eq!(strip_python_lang_prefix("python3x"), None); + + // Invalid - completely different + assert_eq!(strip_python_lang_prefix("rust"), None); + assert_eq!(strip_python_lang_prefix(""), None); + assert_eq!(strip_python_lang_prefix("javascript"), None); + } }