Speed up mdtest parser (#13835)

This commit is contained in:
Micha Reiser 2024-10-21 21:49:20 +02:00 committed by GitHub
parent a77512df68
commit 155be88373
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
3 changed files with 150 additions and 32 deletions

1
Cargo.lock generated
View File

@ -2156,6 +2156,7 @@ version = "0.0.0"
dependencies = [ dependencies = [
"anyhow", "anyhow",
"colored", "colored",
"memchr",
"red_knot_python_semantic", "red_knot_python_semantic",
"red_knot_vendored", "red_knot_vendored",
"regex", "regex",

View File

@ -21,6 +21,7 @@ ruff_text_size = { workspace = true }
anyhow = { workspace = true } anyhow = { workspace = true }
colored = { workspace = true } colored = { workspace = true }
memchr = { workspace = true }
regex = { workspace = true } regex = { workspace = true }
rustc-hash = { workspace = true } rustc-hash = { workspace = true }
salsa = { workspace = true } salsa = { workspace = true }

View File

@ -1,4 +1,5 @@
use regex::{Captures, Regex}; use memchr::memchr2;
use regex::{Captures, Match, Regex};
use ruff_index::{newtype_index, IndexVec}; use ruff_index::{newtype_index, IndexVec};
use rustc_hash::{FxHashMap, FxHashSet}; use rustc_hash::{FxHashMap, FxHashSet};
use std::sync::LazyLock; use std::sync::LazyLock;
@ -133,15 +134,15 @@ pub(crate) struct EmbeddedFile<'s> {
pub(crate) code: &'s str, pub(crate) code: &'s str,
} }
/// Matches an arbitrary amount of whitespace (including newlines), followed by a sequence of `#` /// Matches a sequence of `#` characters, followed by a title heading, followed by a newline.
/// characters, followed by a title heading, followed by a newline.
static HEADER_RE: LazyLock<Regex> = static HEADER_RE: LazyLock<Regex> =
LazyLock::new(|| Regex::new(r"^(\s*\n)*(?<level>#+)\s+(?<title>.+)\s*\n").unwrap()); LazyLock::new(|| Regex::new(r"^(?<level>#+)\s+(?<title>.+)\s*\n").unwrap());
/// Matches a code block fenced by triple backticks, possibly with language and `key=val` /// Matches a code block fenced by triple backticks, possibly with language and `key=val`
/// configuration items following the opening backticks (in the "tag string" of the code block). /// configuration items following the opening backticks (in the "tag string" of the code block).
static CODE_RE: LazyLock<Regex> = LazyLock::new(|| { static CODE_RE: LazyLock<Regex> = LazyLock::new(|| {
Regex::new(r"^```(?<lang>\w+)(?<config>( +\S+)*)\s*\n(?<code>(.|\n)*?)\n?```\s*\n").unwrap() Regex::new(r"^```(?<lang>(?-u:\w)+)?(?<config>(?: +\S+)*)\s*\n(?<code>(?:.|\n)*?)\n?```\s*\n")
.unwrap()
}); });
#[derive(Debug)] #[derive(Debug)]
@ -226,33 +227,61 @@ impl<'s> Parser<'s> {
} }
fn parse_impl(&mut self) -> anyhow::Result<()> { fn parse_impl(&mut self) -> anyhow::Result<()> {
while !self.unparsed.is_empty() { while let Some(position) = memchr2(b'`', b'#', self.unparsed.as_bytes()) {
if let Some(captures) = self.scan(&HEADER_RE) { let (before, after) = self.unparsed.split_at(position);
self.parse_header(&captures)?; self.unparsed = after;
} else if let Some(captures) = self.scan(&CODE_RE) {
self.parse_code_block(&captures)?; // code blocks and headers must start on a new line.
} else { if before.is_empty() || before.ends_with('\n') {
// ignore other Markdown syntax (paragraphs, etc) used as comments in the test let c = after.as_bytes()[0] as char;
if let Some(next_newline) = self.unparsed.find('\n') {
(_, self.unparsed) = self.unparsed.split_at(next_newline + 1); match c {
} else { '#' => {
break; if let Some(find) = HEADER_RE.find(self.unparsed) {
self.parse_header(find.as_str())?;
self.unparsed = &self.unparsed[find.end()..];
continue;
}
}
'`' => {
if let Some(captures) = CODE_RE.captures(self.unparsed) {
self.parse_code_block(&captures)?;
self.unparsed = &self.unparsed[captures.get(0).unwrap().end()..];
continue;
}
}
_ => unreachable!(),
} }
} }
// Skip to the end of the line
if let Some(position) = memchr::memchr(b'\n', self.unparsed.as_bytes()) {
self.unparsed = &self.unparsed[position + 1..];
} else {
break;
}
} }
Ok(()) Ok(())
} }
fn parse_header(&mut self, captures: &Captures<'s>) -> anyhow::Result<()> { fn parse_header(&mut self, header: &'s str) -> anyhow::Result<()> {
let header_level = captures["level"].len(); let mut trimmed = header.trim();
let mut header_level = 0usize;
while let Some(rest) = trimmed.strip_prefix('#') {
header_level += 1;
trimmed = rest;
}
let title = trimmed.trim_start();
self.pop_sections_to_level(header_level); self.pop_sections_to_level(header_level);
let parent = self.stack.parent(); let parent = self.stack.parent();
let section = Section { let section = Section {
// HEADER_RE can't match without a match for group 'title'. title,
title: captures.name("title").unwrap().into(),
level: header_level.try_into()?, level: header_level.try_into()?,
parent_id: Some(parent), parent_id: Some(parent),
}; };
@ -300,8 +329,12 @@ impl<'s> Parser<'s> {
self.files.push(EmbeddedFile { self.files.push(EmbeddedFile {
path, path,
section: parent, section: parent,
lang: captures
.name("lang")
.as_ref()
.map(Match::as_str)
.unwrap_or_default(),
// CODE_RE can't match without a match for 'code'; 'lang' is optional and defaults to an empty string.
lang: captures.name("lang").unwrap().into(),
code: captures.name("code").unwrap().into(), code: captures.name("code").unwrap().into(),
}); });
@ -335,17 +368,6 @@ impl<'s> Parser<'s> {
self.current_section_files = None; self.current_section_files = None;
} }
} }
/// Get capture groups and advance cursor past match if unparsed text matches `pattern`.
fn scan(&mut self, pattern: &Regex) -> Option<Captures<'s>> {
if let Some(captures) = pattern.captures(self.unparsed) {
let (_, unparsed) = self.unparsed.split_at(captures.get(0).unwrap().end());
self.unparsed = unparsed;
Some(captures)
} else {
None
}
}
} }
#[cfg(test)] #[cfg(test)]
@ -428,6 +450,57 @@ mod tests {
assert_eq!(file.code, "y = 2"); assert_eq!(file.code, "y = 2");
} }
// Checks that one Markdown source can hold several tests, and that a single test can hold
// several embedded files: `# One` has two code blocks with explicit `path=` values, while
// `# Two` has a single code block with no `path=`.
#[test]
fn multiple_file_tests() {
let source = dedent(
"
# One
```py path=main.py
from foo import y
```
```py path=foo.py
y = 2
```
# Two
```py
y = 2
```
",
);
let mf = super::parse("file.md", &source).unwrap();
// Each header is expected to yield its own test, named "<file name> - <header title>".
let [test1, test2] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected two tests");
};
assert_eq!(test1.name(), "file.md - One");
assert_eq!(test2.name(), "file.md - Two");
// Both code blocks under `# One` are expected in the first test, in source order.
let [main, foo] = test1.files().collect::<Vec<_>>()[..] else {
panic!("expected two files");
};
assert_eq!(main.path, "main.py");
assert_eq!(main.lang, "py");
assert_eq!(main.code, "from foo import y");
assert_eq!(foo.path, "foo.py");
assert_eq!(foo.lang, "py");
assert_eq!(foo.code, "y = 2");
// The code block under `# Two` has no `path=`; the expected path for it is "test.py".
let [file] = test2.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(file.path, "test.py");
assert_eq!(file.lang, "py");
assert_eq!(file.code, "y = 2");
}
#[test] #[test]
fn custom_file_path() { fn custom_file_path() {
let source = dedent( let source = dedent(
@ -473,6 +546,49 @@ mod tests {
assert_eq!(file.code, "x = 1\ny = 2"); assert_eq!(file.code, "x = 1\ny = 2");
} }
// A fenced `py` block with nothing between the opening and closing fence is expected to
// produce one test containing one embedded file whose `code` is the empty string.
#[test]
fn empty_file() {
let source = dedent(
"
```py
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
// The empty block must still be recorded as a file, with empty contents.
assert_eq!(file.code, "");
}
// A fence with no language tag after the opening backticks is expected to still be parsed
// as an embedded file. Only `code` is asserted here; the resulting `lang` is not checked.
#[test]
fn no_lang() {
let source = dedent(
"
```
x = 10
```
",
);
let mf = super::parse("file.md", &source).unwrap();
let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
panic!("expected one test");
};
let [file] = test.files().collect::<Vec<_>>()[..] else {
panic!("expected one file");
};
assert_eq!(file.code, "x = 10");
}
#[test] #[test]
fn no_header_inside_test() { fn no_header_inside_test() {
let source = dedent( let source = dedent(