From 155be8837379dd0f690eacede2227d016a622742 Mon Sep 17 00:00:00 2001 From: Micha Reiser Date: Mon, 21 Oct 2024 21:49:20 +0200 Subject: [PATCH] Speedup mdtest parser (#13835) --- Cargo.lock | 1 + crates/red_knot_test/Cargo.toml | 1 + crates/red_knot_test/src/parser.rs | 180 ++++++++++++++++++++++++----- 3 files changed, 150 insertions(+), 32 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 09b59584da..c2c82e0c42 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -2156,6 +2156,7 @@ version = "0.0.0" dependencies = [ "anyhow", "colored", + "memchr", "red_knot_python_semantic", "red_knot_vendored", "regex", diff --git a/crates/red_knot_test/Cargo.toml b/crates/red_knot_test/Cargo.toml index 2de1e9a662..68c7f13e96 100644 --- a/crates/red_knot_test/Cargo.toml +++ b/crates/red_knot_test/Cargo.toml @@ -21,6 +21,7 @@ ruff_text_size = { workspace = true } anyhow = { workspace = true } colored = { workspace = true } +memchr = { workspace = true } regex = { workspace = true } rustc-hash = { workspace = true } salsa = { workspace = true } diff --git a/crates/red_knot_test/src/parser.rs b/crates/red_knot_test/src/parser.rs index 9c5f3a1955..fa9e39432c 100644 --- a/crates/red_knot_test/src/parser.rs +++ b/crates/red_knot_test/src/parser.rs @@ -1,4 +1,5 @@ -use regex::{Captures, Regex}; +use memchr::memchr2; +use regex::{Captures, Match, Regex}; use ruff_index::{newtype_index, IndexVec}; use rustc_hash::{FxHashMap, FxHashSet}; use std::sync::LazyLock; @@ -133,15 +134,15 @@ pub(crate) struct EmbeddedFile<'s> { pub(crate) code: &'s str, } -/// Matches an arbitrary amount of whitespace (including newlines), followed by a sequence of `#` -/// characters, followed by a title heading, followed by a newline. +/// Matches a sequence of `#` characters, followed by a title heading, followed by a newline. 
static HEADER_RE: LazyLock<Regex> = - LazyLock::new(|| Regex::new(r"^(\s*\n)*(?<level>#+)\s+(?<title>.+)\s*\n").unwrap()); + LazyLock::new(|| Regex::new(r"^(?<level>#+)\s+(?<title>.+)\s*\n").unwrap()); /// Matches a code block fenced by triple backticks, possibly with language and `key=val` /// configuration items following the opening backticks (in the "tag string" of the code block). static CODE_RE: LazyLock<Regex> = LazyLock::new(|| { - Regex::new(r"^```(?<lang>\w+)(?<config>( +\S+)*)\s*\n(?<code>(.|\n)*?)\n?```\s*\n").unwrap() + Regex::new(r"^```(?<lang>(?-u:\w)+)?(?<config>(?: +\S+)*)\s*\n(?<code>(?:.|\n)*?)\n?```\s*\n") + .unwrap() }); #[derive(Debug)] @@ -226,33 +227,61 @@ impl<'s> Parser<'s> { } fn parse_impl(&mut self) -> anyhow::Result<()> { - while !self.unparsed.is_empty() { - if let Some(captures) = self.scan(&HEADER_RE) { - self.parse_header(&captures)?; - } else if let Some(captures) = self.scan(&CODE_RE) { - self.parse_code_block(&captures)?; - } else { - // ignore other Markdown syntax (paragraphs, etc) used as comments in the test - if let Some(next_newline) = self.unparsed.find('\n') { - (_, self.unparsed) = self.unparsed.split_at(next_newline + 1); - } else { - break; + while let Some(position) = memchr2(b'`', b'#', self.unparsed.as_bytes()) { + let (before, after) = self.unparsed.split_at(position); + self.unparsed = after; + + // code blocks and headers must start on a new line. 
+ if before.is_empty() || before.ends_with('\n') { + let c = after.as_bytes()[0] as char; + + match c { + '#' => { + if let Some(find) = HEADER_RE.find(self.unparsed) { + self.parse_header(find.as_str())?; + self.unparsed = &self.unparsed[find.end()..]; + continue; + } + } + '`' => { + if let Some(captures) = CODE_RE.captures(self.unparsed) { + self.parse_code_block(&captures)?; + self.unparsed = &self.unparsed[captures.get(0).unwrap().end()..]; + continue; + } + } + _ => unreachable!(), } } + + // Skip to the end of the line + if let Some(position) = memchr::memchr(b'\n', self.unparsed.as_bytes()) { + self.unparsed = &self.unparsed[position + 1..]; + } else { + break; + } } Ok(()) } - fn parse_header(&mut self, captures: &Captures<'s>) -> anyhow::Result<()> { - let header_level = captures["level"].len(); + fn parse_header(&mut self, header: &'s str) -> anyhow::Result<()> { + let mut trimmed = header.trim(); + + let mut header_level = 0usize; + while let Some(rest) = trimmed.strip_prefix('#') { + header_level += 1; + trimmed = rest; + } + + let title = trimmed.trim_start(); + self.pop_sections_to_level(header_level); let parent = self.stack.parent(); let section = Section { - // HEADER_RE can't match without a match for group 'title'. - title: captures.name("title").unwrap().into(), + title, level: header_level.try_into()?, parent_id: Some(parent), }; @@ -300,8 +329,12 @@ impl<'s> Parser<'s> { self.files.push(EmbeddedFile { path, section: parent, + lang: captures + .name("lang") + .as_ref() + .map(Match::as_str) + .unwrap_or_default(), // CODE_RE can't match without matches for 'lang' and 'code'. - lang: captures.name("lang").unwrap().into(), code: captures.name("code").unwrap().into(), }); @@ -335,17 +368,6 @@ impl<'s> Parser<'s> { self.current_section_files = None; } } - - /// Get capture groups and advance cursor past match if unparsed text matches `pattern`. 
- fn scan(&mut self, pattern: &Regex) -> Option<Captures<'s>> { - if let Some(captures) = pattern.captures(self.unparsed) { - let (_, unparsed) = self.unparsed.split_at(captures.get(0).unwrap().end()); - self.unparsed = unparsed; - Some(captures) - } else { - None - } - } } #[cfg(test)] @@ -428,6 +450,57 @@ mod tests { assert_eq!(file.code, "y = 2"); } + #[test] + fn multiple_file_tests() { + let source = dedent( + " + # One + + ```py path=main.py + from foo import y + ``` + + ```py path=foo.py + y = 2 + ``` + + # Two + + ```py + y = 2 + ``` + ", + ); + let mf = super::parse("file.md", &source).unwrap(); + + let [test1, test2] = &mf.tests().collect::<Vec<_>>()[..] else { + panic!("expected two tests"); + }; + + assert_eq!(test1.name(), "file.md - One"); + assert_eq!(test2.name(), "file.md - Two"); + + let [main, foo] = test1.files().collect::<Vec<_>>()[..] else { + panic!("expected two files"); + }; + + assert_eq!(main.path, "main.py"); + assert_eq!(main.lang, "py"); + assert_eq!(main.code, "from foo import y"); + + assert_eq!(foo.path, "foo.py"); + assert_eq!(foo.lang, "py"); + assert_eq!(foo.code, "y = 2"); + + let [file] = test2.files().collect::<Vec<_>>()[..] else { + panic!("expected one file"); + }; + + assert_eq!(file.path, "test.py"); + assert_eq!(file.lang, "py"); + assert_eq!(file.code, "y = 2"); + } + #[test] fn custom_file_path() { let source = dedent( @@ -473,6 +546,49 @@ mod tests { assert_eq!(file.code, "x = 1\ny = 2"); } + #[test] + fn empty_file() { + let source = dedent( + " + ```py + ``` + ", + ); + + let mf = super::parse("file.md", &source).unwrap(); + + let [test] = &mf.tests().collect::<Vec<_>>()[..] else { + panic!("expected one test"); + }; + let [file] = test.files().collect::<Vec<_>>()[..] 
else { + panic!("expected one file"); + }; + + assert_eq!(file.code, ""); + } + + #[test] + fn no_lang() { + let source = dedent( + " + ``` + x = 10 + ``` + ", + ); + + let mf = super::parse("file.md", &source).unwrap(); + + let [test] = &mf.tests().collect::<Vec<_>>()[..] else { + panic!("expected one test"); + }; + let [file] = test.files().collect::<Vec<_>>()[..] else { + panic!("expected one file"); + }; + + assert_eq!(file.code, "x = 10"); + } + #[test] fn no_header_inside_test() { let source = dedent(