This commit is contained in:
magic-akari 2025-12-16 16:37:08 -05:00 committed by GitHub
commit f7313a199a
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
2 changed files with 218 additions and 49 deletions

View File

@ -29,7 +29,6 @@ clap = { workspace = true }
countme = { workspace = true } countme = { workspace = true }
itertools = { workspace = true } itertools = { workspace = true }
memchr = { workspace = true } memchr = { workspace = true }
regex = { workspace = true }
rustc-hash = { workspace = true } rustc-hash = { workspace = true }
salsa = { workspace = true } salsa = { workspace = true }
serde = { workspace = true, optional = true } serde = { workspace = true, optional = true }

View File

@ -3,11 +3,9 @@
#![allow(clippy::doc_markdown)] #![allow(clippy::doc_markdown)]
use std::cmp::Ordering; use std::cmp::Ordering;
use std::sync::LazyLock;
use std::{borrow::Cow, collections::VecDeque}; use std::{borrow::Cow, collections::VecDeque};
use itertools::Itertools; use itertools::Itertools;
use regex::Regex;
use ruff_formatter::printer::SourceMapGeneration; use ruff_formatter::printer::SourceMapGeneration;
use ruff_python_ast::{AnyStringFlags, StringFlags, str::Quote}; use ruff_python_ast::{AnyStringFlags, StringFlags, str::Quote};
@ -1073,13 +1071,38 @@ impl<'src> CodeExampleRst<'src> {
// [directives]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#directives // [directives]: https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#directives
// [Pygments lexer names]: https://pygments.org/docs/lexers/ // [Pygments lexer names]: https://pygments.org/docs/lexers/
// [code-block]: https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#directive-code-block // [code-block]: https://www.sphinx-doc.org/en/master/usage/restructuredtext/directives.html#directive-code-block
static DIRECTIVE_START: LazyLock<Regex> = LazyLock::new(|| { fn is_rst_directive_start(line: &str) -> bool {
Regex::new( let trimmed = line.trim_start();
r"(?m)^\s*\.\. \s*(?i:code-block|sourcecode)::\s*(?i:python|py|python3|py3)$",
) // Must start with ".. "
.unwrap() let Some(rest) = trimmed.strip_prefix(".. ") else {
}); return false;
if !DIRECTIVE_START.is_match(original.line) { };
let rest = rest.trim_start();
// Match "code-block" or "sourcecode" (case-insensitive)
let Some(rest) = strip_prefix_ignore_ascii_case(rest, "code-block")
.or_else(|| strip_prefix_ignore_ascii_case(rest, "sourcecode"))
else {
return false;
};
// Must be followed by "::"
let Some(rest) = rest.strip_prefix("::") else {
return false;
};
let rest = rest.trim_start();
// Match Python language identifier (case-insensitive)
let Some(rest) = strip_python_lang_prefix(rest) else {
return false;
};
// Line must end immediately after the language identifier (no trailing whitespace)
rest.is_empty()
}
if !is_rst_directive_start(original.line) {
return None; return None;
} }
Some(CodeExampleRst { Some(CodeExampleRst {
@ -1318,50 +1341,13 @@ impl<'src> CodeExampleMarkdown<'src> {
/// ///
/// [fenced code block]: https://spec.commonmark.org/0.30/#fenced-code-blocks /// [fenced code block]: https://spec.commonmark.org/0.30/#fenced-code-blocks
fn new(original: InputDocstringLine<'src>) -> Option<CodeExampleMarkdown<'src>> { fn new(original: InputDocstringLine<'src>) -> Option<CodeExampleMarkdown<'src>> {
static FENCE_START: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(
r"(?xm)
^
(?:
# In the backtick case, info strings (following the fence)
# cannot contain backticks themselves, since it would
# introduce ambiguity with parsing inline code. In other
# words, if we didn't specifically exclude matching `
# in the info string for backtick fences, then we might
# erroneously consider something to be a code fence block
# that is actually inline code.
#
# NOTE: The `ticklang` and `tildlang` capture groups are
# currently unused, but there was some discussion about not
# assuming unlabeled blocks were Python. At the time of
# writing, we do assume unlabeled blocks are Python, but
# one could inspect the `ticklang` and `tildlang` capture
# groups to determine whether the block is labeled or not.
(?<ticks>```+)(?:\s*(?<ticklang>(?i:python|py|python3|py3))[^`]*)?
|
(?<tilds>~~~+)(?:\s*(?<tildlang>(?i:python|py|python3|py3))\p{any}*)?
)
$
",
)
.unwrap()
});
let (opening_fence_indent, rest) = indent_with_suffix(original.line); let (opening_fence_indent, rest) = indent_with_suffix(original.line);
// Quit quickly in the vast majority of cases. // Quit quickly in the vast majority of cases.
if !rest.starts_with("```") && !rest.starts_with("~~~") { if !rest.starts_with("```") && !rest.starts_with("~~~") {
return None; return None;
} }
let caps = FENCE_START.captures(rest)?; let (fence_kind, fence_len) = Self::parse_markdown_fence_start(rest)?;
let (fence_kind, fence_len) = if let Some(ticks) = caps.name("ticks") {
(MarkdownFenceKind::Backtick, ticks.as_str().chars().count())
} else {
let tildes = caps
.name("tilds")
.expect("no ticks means it must be tildes");
(MarkdownFenceKind::Tilde, tildes.as_str().chars().count())
};
Some(CodeExampleMarkdown { Some(CodeExampleMarkdown {
lines: vec![], lines: vec![],
opening_fence_indent: Indentation::from_str(opening_fence_indent), opening_fence_indent: Indentation::from_str(opening_fence_indent),
@ -1481,6 +1467,60 @@ impl<'src> CodeExampleMarkdown<'src> {
fn into_reset_action(self) -> CodeExampleAddAction<'src> { fn into_reset_action(self) -> CodeExampleAddAction<'src> {
CodeExampleAddAction::Reset { code: self.lines } CodeExampleAddAction::Reset { code: self.lines }
} }
/// Parses a Markdown fenced code block opening line.
///
/// Returns the fence type and length if the line is a valid Python code fence.
/// Returns `None` if not a valid Python code fence.
///
/// In the backtick case, info strings (following the fence) cannot contain
/// backticks themselves, since it would introduce ambiguity with parsing
/// inline code. In other words, if we didn't specifically exclude matching
/// backticks in the info string for backtick fences, then we might
/// erroneously consider something to be a code fence block that is actually
/// inline code.
fn parse_markdown_fence_start(line: &str) -> Option<(MarkdownFenceKind, usize)> {
// Check if it's backticks or tildes
let (fence_char, kind) = if line.starts_with('`') {
('`', MarkdownFenceKind::Backtick)
} else if line.starts_with('~') {
('~', MarkdownFenceKind::Tilde)
} else {
return None;
};
// Count consecutive fence characters (need at least 3)
let fence_len = line.bytes().take_while(|&b| b == fence_char as u8).count();
if fence_len < 3 {
return None;
}
// Get content after the fence
let rest = &line[fence_len..];
let info_string = rest.trim();
// For backtick fences, info string cannot contain backticks
if fence_char == '`' && info_string.contains('`') {
return None;
}
// Empty info string is treated as Python (matches original implementation)
if info_string.is_empty() {
return Some((kind, fence_len));
}
// Check if it starts with a Python language identifier using state machine.
// NOTE: This is stricter than the original regex which matched any info string
// starting with py/python (e.g., "python-repl" would have matched). We now require
// an exact language identifier followed by whitespace or end of string, which is
// more conservative and avoids matching non-Python formats like "pycon" or "python-repl".
if strip_python_lang_prefix(info_string).is_some() {
return Some((kind, fence_len));
}
None
}
} }
/// The kind of fence used in a Markdown code block. /// The kind of fence used in a Markdown code block.
@ -1897,9 +1937,100 @@ fn is_rst_option(line: &str) -> bool {
.any(|ch| ch == ':') .any(|ch| ch == ':')
} }
/// Case-insensitive ASCII prefix stripping.
///
/// If `s` starts with `prefix` (case-insensitive), returns the remainder of `s`
/// after the prefix. Otherwise, returns `None`.
fn strip_prefix_ignore_ascii_case<'a>(s: &'a str, prefix: &str) -> Option<&'a str> {
let prefix_len = prefix.len();
// Use byte slicing to avoid panic on non-ASCII strings.
// This is safe because `prefix` is always ASCII in our usage.
if s.as_bytes()
.get(..prefix_len)?
.eq_ignore_ascii_case(prefix.as_bytes())
{
// SAFETY: prefix_len is guaranteed to be on a valid UTF-8 boundary
// because we only call this function with ASCII prefixes.
Some(&s[prefix_len..])
} else {
None
}
}
/// Matches a Python language identifier using a state machine.
///
/// Matches "py", "py3", "python", or "python3" (case-insensitive) and returns
/// the remainder of the string after the match. This is more efficient than
/// multiple `strip_prefix_ignore_ascii_case` calls as it traverses the input
/// only once.
///
/// State machine structure:
/// ```text
/// Start -> 'p' -> 'y' -> (accept "py")
/// -> '3' -> (accept "py3")
/// -> 't' -> 'h' -> 'o' -> 'n' -> (accept "python")
/// -> '3' -> (accept "python3")
/// ```
fn strip_python_lang_prefix(s: &str) -> Option<&str> {
let bytes = s.as_bytes();
// State 0-1: expect "py"
if !bytes.get(..2)?.eq_ignore_ascii_case(b"py") {
return None;
}
// SAFETY for all `s.get(n..)` calls below:
// We only slice after verifying that the preceding bytes are ASCII characters.
// Since ASCII characters are single-byte in UTF-8, slicing at these indices
// is guaranteed to be on valid UTF-8 boundaries.
// State 2: "py" matched - check what's next
match bytes.get(2).map(u8::to_ascii_lowercase) {
// "py" followed by end or whitespace -> accept "py"
None => return s.get(2..),
Some(b) if b.is_ascii_whitespace() => return s.get(2..),
// "py3" -> accept "py3"
Some(b'3') => {
return match bytes.get(3) {
None => s.get(3..),
Some(b) if b.is_ascii_whitespace() => s.get(3..),
Some(_) => None,
};
}
// Continue to "python" - check "thon" suffix
Some(b't') => {
if !bytes.get(3..6)?.eq_ignore_ascii_case(b"hon") {
return None;
}
}
// Invalid
Some(_) => return None,
}
// State 6: "python" matched - check what's next
match bytes.get(6).map(u8::to_ascii_lowercase) {
// "python" followed by end or whitespace -> accept "python"
None => s.get(6..),
Some(b) if b.is_ascii_whitespace() => s.get(6..),
// "python3" -> accept "python3"
Some(b'3') => match bytes.get(7) {
None => s.get(7..),
Some(b) if b.is_ascii_whitespace() => s.get(7..),
Some(_) => None,
},
// Invalid (e.g., "pythonx")
Some(_) => None,
}
}
#[cfg(test)] #[cfg(test)]
mod tests { mod tests {
use crate::string::docstring::Indentation; use crate::string::docstring::{Indentation, strip_python_lang_prefix};
#[test] #[test]
fn indentation_like_black() { fn indentation_like_black() {
@ -1908,4 +2039,43 @@ mod tests {
assert_eq!(Indentation::from_str("\t\t\t").columns(), 24); assert_eq!(Indentation::from_str("\t\t\t").columns(), 24);
assert_eq!(Indentation::from_str(" ").columns(), 4); assert_eq!(Indentation::from_str(" ").columns(), 4);
} }
#[test]
fn python_lang_state_machine() {
// Valid matches - exact
assert_eq!(strip_python_lang_prefix("py"), Some(""));
assert_eq!(strip_python_lang_prefix("py3"), Some(""));
assert_eq!(strip_python_lang_prefix("python"), Some(""));
assert_eq!(strip_python_lang_prefix("python3"), Some(""));
// Valid matches - case insensitive
assert_eq!(strip_python_lang_prefix("PY"), Some(""));
assert_eq!(strip_python_lang_prefix("Py3"), Some(""));
assert_eq!(strip_python_lang_prefix("Python"), Some(""));
assert_eq!(strip_python_lang_prefix("PYTHON3"), Some(""));
assert_eq!(strip_python_lang_prefix("PyThOn"), Some(""));
// Valid matches - with trailing whitespace
assert_eq!(strip_python_lang_prefix("py "), Some(" "));
assert_eq!(strip_python_lang_prefix("python\t"), Some("\t"));
assert_eq!(strip_python_lang_prefix("python3 extra"), Some(" extra"));
// Invalid - prefix only
assert_eq!(strip_python_lang_prefix("p"), None);
assert_eq!(strip_python_lang_prefix("pyt"), None);
assert_eq!(strip_python_lang_prefix("pyth"), None);
assert_eq!(strip_python_lang_prefix("pytho"), None); // # spellchecker:disable-line
// Invalid - no word boundary
assert_eq!(strip_python_lang_prefix("pyx"), None);
assert_eq!(strip_python_lang_prefix("py33"), None);
assert_eq!(strip_python_lang_prefix("pythonx"), None);
assert_eq!(strip_python_lang_prefix("python33"), None);
assert_eq!(strip_python_lang_prefix("python3x"), None);
// Invalid - completely different
assert_eq!(strip_python_lang_prefix("rust"), None);
assert_eq!(strip_python_lang_prefix(""), None);
assert_eq!(strip_python_lang_prefix("javascript"), None);
}
} }