Speedup mdtest parser (#13835)

2024-10-21 21:49:20 +02:00 · 2024-10-21 21:49:20 +02:00 · 155be88373
parent a77512df68
commit 155be88373
3 changed files with 150 additions and 32 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@ -2156,6 +2156,7 @@ version = "0.0.0"
 dependencies = [
 "anyhow",
 "colored",
 "memchr",
 "red_knot_python_semantic",
 "red_knot_vendored",
 "regex",
--- a/crates/red_knot_test/Cargo.toml
+++ b/crates/red_knot_test/Cargo.toml
@ -21,6 +21,7 @@ ruff_text_size = { workspace = true }
 anyhow = { workspace = true }
 colored = { workspace = true }
 memchr = { workspace = true }
 regex = { workspace = true }
 rustc-hash = { workspace = true }
 salsa = { workspace = true }
--- a/crates/red_knot_test/src/parser.rs
+++ b/crates/red_knot_test/src/parser.rs
@ -1,4 +1,5 @@
-use regex::{Captures, Regex};
+use memchr::memchr2;
 use regex::{Captures, Match, Regex};
 use ruff_index::{newtype_index, IndexVec};
 use rustc_hash::{FxHashMap, FxHashSet};
 use std::sync::LazyLock;
@ -133,15 +134,15 @@ pub(crate) struct EmbeddedFile<'s> {
    pub(crate) code: &'s str,
 }
-/// Matches an arbitrary amount of whitespace (including newlines), followed by a sequence of `#`
+/// Matches a sequence of `#` characters, followed by a title heading, followed by a newline.
 /// characters, followed by a title heading, followed by a newline.
 static HEADER_RE: LazyLock<Regex> =
-    LazyLock::new(|| Regex::new(r"^(\s*\n)*(?<level>#+)\s+(?<title>.+)\s*\n").unwrap());
+    LazyLock::new(|| Regex::new(r"^(?<level>#+)\s+(?<title>.+)\s*\n").unwrap());
 /// Matches a code block fenced by triple backticks, possibly with language and `key=val`
 /// configuration items following the opening backticks (in the "tag string" of the code block).
 static CODE_RE: LazyLock<Regex> = LazyLock::new(|| {
-    Regex::new(r"^```(?<lang>\w+)(?<config>( +\S+)*)\s*\n(?<code>(.|\n)*?)\n?```\s*\n").unwrap()
+    Regex::new(r"^```(?<lang>(?-u:\w)+)?(?<config>(?: +\S+)*)\s*\n(?<code>(?:.|\n)*?)\n?```\s*\n")
        .unwrap()
 });
 #[derive(Debug)]
@ -226,33 +227,61 @@ impl<'s> Parser<'s> {
    }
    fn parse_impl(&mut self) -> anyhow::Result<()> {
-        while !self.unparsed.is_empty() {
+        while let Some(position) = memchr2(b'`', b'#', self.unparsed.as_bytes()) {
-            if let Some(captures) = self.scan(&HEADER_RE) {
+            let (before, after) = self.unparsed.split_at(position);
-                self.parse_header(&captures)?;
+            self.unparsed = after;
-            } else if let Some(captures) = self.scan(&CODE_RE) {
+
-                self.parse_code_block(&captures)?;
+            // code blocks and headers must start on a new line.
-            } else {
+            if before.is_empty() || before.ends_with('\n') {
-                // ignore other Markdown syntax (paragraphs, etc) used as comments in the test
+                let c = after.as_bytes()[0] as char;
-                if let Some(next_newline) = self.unparsed.find('\n') {
+
-                    (_, self.unparsed) = self.unparsed.split_at(next_newline + 1);
+                match c {
-                } else {
+                    '#' => {
-                    break;
+                        if let Some(find) = HEADER_RE.find(self.unparsed) {
                            self.parse_header(find.as_str())?;
                            self.unparsed = &self.unparsed[find.end()..];
                            continue;
                        }
                    }
                    '`' => {
                        if let Some(captures) = CODE_RE.captures(self.unparsed) {
                            self.parse_code_block(&captures)?;
                            self.unparsed = &self.unparsed[captures.get(0).unwrap().end()..];
                            continue;
                        }
                    }
                    _ => unreachable!(),
                }
            }
            // Skip to the end of the line
            if let Some(position) = memchr::memchr(b'\n', self.unparsed.as_bytes()) {
                self.unparsed = &self.unparsed[position + 1..];
            } else {
                break;
            }
        }
        Ok(())
    }
-    fn parse_header(&mut self, captures: &Captures<'s>) -> anyhow::Result<()> {
+    fn parse_header(&mut self, header: &'s str) -> anyhow::Result<()> {
-        let header_level = captures["level"].len();
+        let mut trimmed = header.trim();
        let mut header_level = 0usize;
        while let Some(rest) = trimmed.strip_prefix('#') {
            header_level += 1;
            trimmed = rest;
        }
        let title = trimmed.trim_start();
        self.pop_sections_to_level(header_level);
        let parent = self.stack.parent();
        let section = Section {
-            // HEADER_RE can't match without a match for group 'title'.
+            title,
            title: captures.name("title").unwrap().into(),
            level: header_level.try_into()?,
            parent_id: Some(parent),
        };
@ -300,8 +329,12 @@ impl<'s> Parser<'s> {
        self.files.push(EmbeddedFile {
            path,
            section: parent,
            lang: captures
                .name("lang")
                .as_ref()
                .map(Match::as_str)
                .unwrap_or_default(),
            // CODE_RE can't match without matches for 'lang' and 'code'.
            lang: captures.name("lang").unwrap().into(),
            code: captures.name("code").unwrap().into(),
        });
@ -335,17 +368,6 @@ impl<'s> Parser<'s> {
            self.current_section_files = None;
        }
    }
    /// Get capture groups and advance cursor past match if unparsed text matches `pattern`.
    fn scan(&mut self, pattern: &Regex) -> Option<Captures<'s>> {
        if let Some(captures) = pattern.captures(self.unparsed) {
            let (_, unparsed) = self.unparsed.split_at(captures.get(0).unwrap().end());
            self.unparsed = unparsed;
            Some(captures)
        } else {
            None
        }
    }
 }
 #[cfg(test)]
@ -428,6 +450,57 @@ mod tests {
        assert_eq!(file.code, "y = 2");
    }
    #[test]
    fn multiple_file_tests() {
        let source = dedent(
            "
            # One
            ```py path=main.py
            from foo import y
            ```
            ```py path=foo.py
            y = 2
            ```
            # Two
            ```py
            y = 2
            ```
            ",
        );
        let mf = super::parse("file.md", &source).unwrap();
        let [test1, test2] = &mf.tests().collect::<Vec<_>>()[..] else {
            panic!("expected two tests");
        };
        assert_eq!(test1.name(), "file.md - One");
        assert_eq!(test2.name(), "file.md - Two");
        let [main, foo] = test1.files().collect::<Vec<_>>()[..] else {
            panic!("expected two files");
        };
        assert_eq!(main.path, "main.py");
        assert_eq!(main.lang, "py");
        assert_eq!(main.code, "from foo import y");
        assert_eq!(foo.path, "foo.py");
        assert_eq!(foo.lang, "py");
        assert_eq!(foo.code, "y = 2");
        let [file] = test2.files().collect::<Vec<_>>()[..] else {
            panic!("expected one file");
        };
        assert_eq!(file.path, "test.py");
        assert_eq!(file.lang, "py");
        assert_eq!(file.code, "y = 2");
    }
    #[test]
    fn custom_file_path() {
        let source = dedent(
@ -473,6 +546,49 @@ mod tests {
        assert_eq!(file.code, "x = 1\ny = 2");
    }
    #[test]
    fn empty_file() {
        let source = dedent(
            "
            ```py
            ```
            ",
        );
        let mf = super::parse("file.md", &source).unwrap();
        let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
            panic!("expected one test");
        };
        let [file] = test.files().collect::<Vec<_>>()[..] else {
            panic!("expected one file");
        };
        assert_eq!(file.code, "");
    }
    #[test]
    fn no_lang() {
        let source = dedent(
            "
            ```
            x = 10
            ```
            ",
        );
        let mf = super::parse("file.md", &source).unwrap();
        let [test] = &mf.tests().collect::<Vec<_>>()[..] else {
            panic!("expected one test");
        };
        let [file] = test.files().collect::<Vec<_>>()[..] else {
            panic!("expected one file");
        };
        assert_eq!(file.code, "x = 10");
    }
    #[test]
    fn no_header_inside_test() {
        let source = dedent(