From 23084c57864d4577ea655637ae13d2919fc5d0b7 Mon Sep 17 00:00:00 2001 From: magic-akari Date: Sun, 7 Dec 2025 05:47:35 +0800 Subject: [PATCH 1/4] Remove `regex` dependency from `ruff_python_formatter` --- crates/ruff_python_formatter/Cargo.toml | 1 - .../src/string/docstring.rs | 254 ++++++++++++++---- 2 files changed, 206 insertions(+), 49 deletions(-) diff --git a/crates/ruff_python_formatter/Cargo.toml b/crates/ruff_python_formatter/Cargo.toml index 95ca60ee10..56b9792c6e 100644 --- a/crates/ruff_python_formatter/Cargo.toml +++ b/crates/ruff_python_formatter/Cargo.toml @@ -29,7 +29,6 @@ clap = { workspace = true } countme = { workspace = true } itertools = { workspace = true } memchr = { workspace = true } -regex = { workspace = true } rustc-hash = { workspace = true } salsa = { workspace = true } serde = { workspace = true, optional = true } diff --git a/crates/ruff_python_formatter/src/string/docstring.rs b/crates/ruff_python_formatter/src/string/docstring.rs index ad357fc65e..01317cc798 100644 --- a/crates/ruff_python_formatter/src/string/docstring.rs +++ b/crates/ruff_python_formatter/src/string/docstring.rs @@ -3,11 +3,9 @@ #![allow(clippy::doc_markdown)] use std::cmp::Ordering; -use std::sync::LazyLock; use std::{borrow::Cow, collections::VecDeque}; use itertools::Itertools; -use regex::Regex; use ruff_formatter::printer::SourceMapGeneration; use ruff_python_ast::{AnyStringFlags, StringFlags, str::Quote}; @@ -1073,13 +1071,38 @@ impl<'src> CodeExampleRst<'src> { // [directives]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#directives // [Pygments lexer names]: https://pygments.org/docs/lexers/ // [code-block]: https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#directive-code-block - static DIRECTIVE_START: LazyLock<Regex> = LazyLock::new(|| { - Regex::new( - r"(?m)^\s*\.\. 
\s*(?i:code-block|sourcecode)::\s*(?i:python|py|python3|py3)$", ) .unwrap() }); - if !DIRECTIVE_START.is_match(original.line) { + fn is_rst_directive_start(line: &str) -> bool { + let trimmed = line.trim_start(); + + // Must start with ".. " + let Some(rest) = trimmed.strip_prefix(".. ") else { + return false; + }; + let rest = rest.trim_start(); + + // Match "code-block" or "sourcecode" (case-insensitive) + let Some(rest) = strip_prefix_ignore_ascii_case(rest, "code-block") + .or_else(|| strip_prefix_ignore_ascii_case(rest, "sourcecode")) + else { + return false; + }; + + // Must be followed by "::" + let Some(rest) = rest.strip_prefix("::") else { + return false; + }; + let rest = rest.trim_start(); + + // Match Python language identifier (case-insensitive) + let Some(rest) = strip_python_lang_prefix(rest) else { + return false; + }; + + // Line must end with only whitespace after the language identifier + rest.trim().is_empty() + } + + if !is_rst_directive_start(original.line) { return None; } Some(CodeExampleRst { @@ -1318,50 +1341,13 @@ impl<'src> CodeExampleMarkdown<'src> { /// /// [fenced code block]: https://spec.commonmark.org/0.30/#fenced-code-blocks fn new(original: InputDocstringLine<'src>) -> Option<CodeExampleMarkdown<'src>> { - static FENCE_START: LazyLock<Regex> = LazyLock::new(|| { - Regex::new( - r"(?xm) - ^ - (?: - # In the backtick case, info strings (following the fence) - # cannot contain backticks themselves, since it would - # introduce ambiguity with parsing inline code. In other - # words, if we didn't specifically exclude matching ` - # in the info string for backtick fences, then we might - # erroneously consider something to be a code fence block - # that is actually inline code. - # - # NOTE: The `ticklang` and `tildlang` capture groups are - # currently unused, but there was some discussion about not - # assuming unlabeled blocks were Python. 
At the time of - # writing, we do assume unlabeled blocks are Python, but - # one could inspect the `ticklang` and `tildlang` capture - # groups to determine whether the block is labeled or not. - (?<ticks>```+)(?:\s*(?<ticklang>(?i:python|py|python3|py3))[^`]*)? - | - (?<tilds>~~~+)(?:\s*(?<tildlang>(?i:python|py|python3|py3))\p{any}*)? - ) - $ - ", - ) - .unwrap() - }); - let (opening_fence_indent, rest) = indent_with_suffix(original.line); // Quit quickly in the vast majority of cases. if !rest.starts_with("```") && !rest.starts_with("~~~") { return None; } - let caps = FENCE_START.captures(rest)?; - let (fence_kind, fence_len) = if let Some(ticks) = caps.name("ticks") { - (MarkdownFenceKind::Backtick, ticks.as_str().chars().count()) - } else { - let tildes = caps - .name("tilds") - .expect("no ticks means it must be tildes"); - (MarkdownFenceKind::Tilde, tildes.as_str().chars().count()) - }; + let (fence_kind, fence_len) = Self::parse_markdown_fence_start(rest)?; Some(CodeExampleMarkdown { lines: vec![], opening_fence_indent: Indentation::from_str(opening_fence_indent), @@ -1481,6 +1467,58 @@ impl<'src> CodeExampleMarkdown<'src> { fn into_reset_action(self) -> CodeExampleAddAction<'src> { CodeExampleAddAction::Reset { code: self.lines } } + + /// Parses a Markdown fenced code block opening line. + /// + /// Returns the fence type and length if the line is a valid Python code fence. + /// Returns `None` if not a valid Python code fence. + /// + /// In the backtick case, info strings (following the fence) cannot contain + /// backticks themselves, since it would introduce ambiguity with parsing + /// inline code. In other words, if we didn't specifically exclude matching + /// backticks in the info string for backtick fences, then we might + /// erroneously consider something to be a code fence block that is actually + /// inline code. 
+ fn parse_markdown_fence_start(line: &str) -> Option<(MarkdownFenceKind, usize)> { + // Check if it's backticks or tildes + let (fence_char, kind) = if line.starts_with('`') { + ('`', MarkdownFenceKind::Backtick) + } else if line.starts_with('~') { + ('~', MarkdownFenceKind::Tilde) + } else { + return None; + }; + + // Count consecutive fence characters (need at least 3) + let fence_len = line.bytes().take_while(|&b| b == fence_char as u8).count(); + if fence_len < 3 { + return None; + } + + // Get content after the fence + let rest = &line[fence_len..]; + + // For backtick fences, info string cannot contain backticks + if fence_char == '`' && rest.contains('`') { + return None; + } + + // Check the language identifier + let info_string = rest.trim(); + + // Empty info string is treated as Python (matches original implementation) + if info_string.is_empty() { + return Some((kind, fence_len)); + } + + // Check if it starts with a Python language identifier using state machine + // The state machine only matches if followed by whitespace or end of string + if strip_python_lang_prefix(info_string).is_some() { + return Some((kind, fence_len)); + } + + None + } } /// The kind of fence used in a Markdown code block. @@ -1897,9 +1935,90 @@ fn is_rst_option(line: &str) -> bool { .any(|ch| ch == ':') } +/// Case-insensitive ASCII prefix stripping. +/// +/// If `s` starts with `prefix` (case-insensitive), returns the remainder of `s` +/// after the prefix. Otherwise, returns `None`. +fn strip_prefix_ignore_ascii_case<'a>(s: &'a str, prefix: &str) -> Option<&'a str> { + if s.len() < prefix.len() { + return None; + } + if s[..prefix.len()].eq_ignore_ascii_case(prefix) { + Some(&s[prefix.len()..]) + } else { + None + } +} + +/// Matches a Python language identifier using a state machine. +/// +/// Matches "py", "py3", "python", or "python3" (case-insensitive) and returns +/// the remainder of the string after the match. 
This is more efficient than +/// multiple `strip_prefix_ignore_ascii_case` calls as it traverses the input +/// only once. +/// +/// State machine structure: +/// ```text +/// Start -> 'p' -> 'y' -> (accept "py") +/// -> '3' -> (accept "py3") +/// -> 't' -> 'h' -> 'o' -> 'n' -> (accept "python") +/// -> '3' -> (accept "python3") +/// ``` +fn strip_python_lang_prefix(s: &str) -> Option<&str> { + let bytes = s.as_bytes(); + + // State 0-1: expect "py" + if !bytes.get(..2)?.eq_ignore_ascii_case(b"py") { + return None; + } + + // State 2: "py" matched - check what's next + match bytes.get(2).map(u8::to_ascii_lowercase) { + // "py" followed by end or whitespace -> accept "py" + None => return Some(&s[2..]), + Some(b) if b.is_ascii_whitespace() => return Some(&s[2..]), + + // "py3" -> accept "py3" + Some(b'3') => { + return match bytes.get(3) { + None => Some(&s[3..]), + Some(b) if b.is_ascii_whitespace() => Some(&s[3..]), + Some(_) => None, + }; + } + + // Continue to "python" - check "thon" suffix + Some(b't') => { + if !bytes.get(3..6)?.eq_ignore_ascii_case(b"hon") { + return None; + } + } + + // Invalid + Some(_) => return None, + } + + // State 6: "python" matched - check what's next + match bytes.get(6).map(u8::to_ascii_lowercase) { + // "python" followed by end or whitespace -> accept "python" + None => Some(&s[6..]), + Some(b) if b.is_ascii_whitespace() => Some(&s[6..]), + + // "python3" -> accept "python3" + Some(b'3') => match bytes.get(7) { + None => Some(&s[7..]), + Some(b) if b.is_ascii_whitespace() => Some(&s[7..]), + Some(_) => None, + }, + + // Invalid (e.g., "pythonx") + Some(_) => None, + } +} + #[cfg(test)] mod tests { - use crate::string::docstring::Indentation; + use crate::string::docstring::{Indentation, strip_python_lang_prefix}; #[test] fn indentation_like_black() { @@ -1908,4 +2027,43 @@ mod tests { assert_eq!(Indentation::from_str("\t\t\t").columns(), 24); assert_eq!(Indentation::from_str(" ").columns(), 4); } + + #[test] + fn 
python_lang_state_machine() { + // Valid matches - exact + assert_eq!(strip_python_lang_prefix("py"), Some("")); + assert_eq!(strip_python_lang_prefix("py3"), Some("")); + assert_eq!(strip_python_lang_prefix("python"), Some("")); + assert_eq!(strip_python_lang_prefix("python3"), Some("")); + + // Valid matches - case insensitive + assert_eq!(strip_python_lang_prefix("PY"), Some("")); + assert_eq!(strip_python_lang_prefix("Py3"), Some("")); + assert_eq!(strip_python_lang_prefix("Python"), Some("")); + assert_eq!(strip_python_lang_prefix("PYTHON3"), Some("")); + assert_eq!(strip_python_lang_prefix("PyThOn"), Some("")); + + // Valid matches - with trailing whitespace + assert_eq!(strip_python_lang_prefix("py "), Some(" ")); + assert_eq!(strip_python_lang_prefix("python\t"), Some("\t")); + assert_eq!(strip_python_lang_prefix("python3 extra"), Some(" extra")); + + // Invalid - prefix only + assert_eq!(strip_python_lang_prefix("p"), None); + assert_eq!(strip_python_lang_prefix("pyt"), None); + assert_eq!(strip_python_lang_prefix("pyth"), None); + assert_eq!(strip_python_lang_prefix("pytho"), None); + + // Invalid - no word boundary + assert_eq!(strip_python_lang_prefix("pyx"), None); + assert_eq!(strip_python_lang_prefix("py33"), None); + assert_eq!(strip_python_lang_prefix("pythonx"), None); + assert_eq!(strip_python_lang_prefix("python33"), None); + assert_eq!(strip_python_lang_prefix("python3x"), None); + + // Invalid - completely different + assert_eq!(strip_python_lang_prefix("rust"), None); + assert_eq!(strip_python_lang_prefix(""), None); + assert_eq!(strip_python_lang_prefix("javascript"), None); + } } From 59ccc11e610eb80a4fde2b20b64839f1686645db Mon Sep 17 00:00:00 2001 From: magic-akari Date: Sun, 7 Dec 2025 06:01:46 +0800 Subject: [PATCH 2/4] fix: typos issue --- crates/ruff_python_formatter/src/string/docstring.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/ruff_python_formatter/src/string/docstring.rs 
b/crates/ruff_python_formatter/src/string/docstring.rs index 01317cc798..2f377f5510 100644 --- a/crates/ruff_python_formatter/src/string/docstring.rs +++ b/crates/ruff_python_formatter/src/string/docstring.rs @@ -2052,7 +2052,7 @@ mod tests { assert_eq!(strip_python_lang_prefix("p"), None); assert_eq!(strip_python_lang_prefix("pyt"), None); assert_eq!(strip_python_lang_prefix("pyth"), None); - assert_eq!(strip_python_lang_prefix("pytho"), None); + assert_eq!(strip_python_lang_prefix("pytho"), None); // # spellchecker:disable-line // Invalid - no word boundary assert_eq!(strip_python_lang_prefix("pyx"), None); From 3d93408526c9ad1c22c2dc0d2513df9e65d6eaea Mon Sep 17 00:00:00 2001 From: magic-akari Date: Sun, 7 Dec 2025 06:49:23 +0800 Subject: [PATCH 3/4] Address review feedback --- crates/ruff_python_formatter/src/string/docstring.rs | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/crates/ruff_python_formatter/src/string/docstring.rs b/crates/ruff_python_formatter/src/string/docstring.rs index 2f377f5510..6d002dbebd 100644 --- a/crates/ruff_python_formatter/src/string/docstring.rs +++ b/crates/ruff_python_formatter/src/string/docstring.rs @@ -1098,8 +1098,8 @@ impl<'src> CodeExampleRst<'src> { return false; }; - // Line must end with only whitespace after the language identifier - rest.trim().is_empty() + // Line must end immediately after the language identifier (no trailing whitespace) + rest.is_empty() } if !is_rst_directive_start(original.line) { @@ -1511,8 +1511,11 @@ impl<'src> CodeExampleMarkdown<'src> { return Some((kind, fence_len)); } - // Check if it starts with a Python language identifier using state machine - // The state machine only matches if followed by whitespace or end of string + // Check if it starts with a Python language identifier using state machine. + // NOTE: This is stricter than the original regex which matched any info string + // starting with py/python (e.g., "python-repl" would have matched). 
We now require + // an exact language identifier followed by whitespace or end of string, which is + // more conservative and avoids matching non-Python formats like "pycon" or "python-repl". if strip_python_lang_prefix(info_string).is_some() { return Some((kind, fence_len)); } From 4e88adbdc46239ed236c9368f59fb9be2f4791f5 Mon Sep 17 00:00:00 2001 From: magic-akari Date: Mon, 8 Dec 2025 19:44:54 +0800 Subject: [PATCH 4/4] Address review feedback --- .../src/string/docstring.rs | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/crates/ruff_python_formatter/src/string/docstring.rs b/crates/ruff_python_formatter/src/string/docstring.rs index 6d002dbebd..3162374be0 100644 --- a/crates/ruff_python_formatter/src/string/docstring.rs +++ b/crates/ruff_python_formatter/src/string/docstring.rs @@ -1498,14 +1498,13 @@ impl<'src> CodeExampleMarkdown<'src> { // Get content after the fence let rest = &line[fence_len..]; + let info_string = rest.trim(); + // For backtick fences, info string cannot contain backticks - if fence_char == '`' && rest.contains('`') { + if fence_char == '`' && info_string.contains('`') { return None; } - // Check the language identifier - let info_string = rest.trim(); - // Empty info string is treated as Python (matches original implementation) if info_string.is_empty() { return Some((kind, fence_len)); @@ -1943,11 +1942,16 @@ fn is_rst_option(line: &str) -> bool { /// If `s` starts with `prefix` (case-insensitive), returns the remainder of `s` /// after the prefix. Otherwise, returns `None`. fn strip_prefix_ignore_ascii_case<'a>(s: &'a str, prefix: &str) -> Option<&'a str> { - if s.len() < prefix.len() { - return None; - } - if s[..prefix.len()].eq_ignore_ascii_case(prefix) { - Some(&s[prefix.len()..]) + let prefix_len = prefix.len(); + // Use byte slicing to avoid panic on non-ASCII strings. + // This is safe because `prefix` is always ASCII in our usage. + if s.as_bytes() + .get(..prefix_len)? 
+ .eq_ignore_ascii_case(prefix.as_bytes()) + { + // SAFETY: prefix_len is guaranteed to be on a valid UTF-8 boundary + // because we only call this function with ASCII prefixes. + Some(&s[prefix_len..]) } else { None } @@ -1975,17 +1979,22 @@ fn strip_python_lang_prefix(s: &str) -> Option<&str> { return None; } + // SAFETY for all `s.get(n..)` calls below: + // We only slice after verifying that the preceding bytes are ASCII characters. + // Since ASCII characters are single-byte in UTF-8, slicing at these indices + // is guaranteed to be on valid UTF-8 boundaries. + // State 2: "py" matched - check what's next match bytes.get(2).map(u8::to_ascii_lowercase) { // "py" followed by end or whitespace -> accept "py" - None => return Some(&s[2..]), - Some(b) if b.is_ascii_whitespace() => return Some(&s[2..]), + None => return s.get(2..), + Some(b) if b.is_ascii_whitespace() => return s.get(2..), // "py3" -> accept "py3" Some(b'3') => { return match bytes.get(3) { - None => Some(&s[3..]), - Some(b) if b.is_ascii_whitespace() => Some(&s[3..]), + None => s.get(3..), + Some(b) if b.is_ascii_whitespace() => s.get(3..), Some(_) => None, }; } @@ -2004,13 +2013,13 @@ fn strip_python_lang_prefix(s: &str) -> Option<&str> { // State 6: "python" matched - check what's next match bytes.get(6).map(u8::to_ascii_lowercase) { // "python" followed by end or whitespace -> accept "python" - None => Some(&s[6..]), - Some(b) if b.is_ascii_whitespace() => Some(&s[6..]), + None => s.get(6..), + Some(b) if b.is_ascii_whitespace() => s.get(6..), // "python3" -> accept "python3" Some(b'3') => match bytes.get(7) { - None => Some(&s[7..]), - Some(b) if b.is_ascii_whitespace() => Some(&s[7..]), + None => s.get(7..), + Some(b) if b.is_ascii_whitespace() => s.get(7..), Some(_) => None, },