[red-knot] Add fuzzer to catch panics for invalid syntax (#14678)

## Summary

This PR adds a fuzzer harness for red knot that runs the type checker on
source code that contains invalid syntax.

Additionally, this PR also updates the `init-fuzzer.sh` script to
increase the corpus size to:
* Include various crates that includes Python source code
* Use the 3.13 CPython source code

And, remove any non-Python files from the final corpus so that when the
fuzzer tries to minify the corpus, it doesn't produce files that only
contains documentation content as that's just noise.

## Test Plan

Run `./fuzz/init-fuzzer.sh`, say no to the large dataset.
Run the fuzzer with `cargo +night fuzz run red_knot_check_invalid_syntax
-- -timeout=5`
This commit is contained in:
Dhruv Manilawala 2024-12-04 14:36:58 +05:30 committed by GitHub
parent 575deb5d4d
commit 1685d95ed2
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
6 changed files with 195 additions and 10 deletions

View File

@ -32,6 +32,8 @@ jobs:
# Flag that is raised when any code is changed
# This is superset of the linter and formatter
code: ${{ steps.changed.outputs.code_any_changed }}
# Flag that is raised when any code that affects the fuzzer is changed
fuzz: ${{ steps.changed.outputs.fuzz_any_changed }}
steps:
- uses: actions/checkout@v4
with:
@ -79,6 +81,11 @@ jobs:
- python/**
- .github/workflows/ci.yaml
fuzz:
- fuzz/Cargo.toml
- fuzz/Cargo.lock
- fuzz/fuzz_targets/**
code:
- "**/*"
- "!**/*.md"
@ -288,7 +295,7 @@ jobs:
name: "cargo fuzz build"
runs-on: ubuntu-latest
needs: determine_changes
if: ${{ github.ref == 'refs/heads/main' }}
if: ${{ github.ref == 'refs/heads/main' || needs.determine_changes.outputs.fuzz == 'true' }}
timeout-minutes: 10
steps:
- uses: actions/checkout@v4

View File

@ -17,6 +17,9 @@ libfuzzer = ["libfuzzer-sys/link_libfuzzer"]
cargo-fuzz = true
[dependencies]
red_knot_python_semantic = { path = "../crates/red_knot_python_semantic" }
red_knot_vendored = { path = "../crates/red_knot_vendored" }
ruff_db = { path = "../crates/ruff_db" }
ruff_linter = { path = "../crates/ruff_linter" }
ruff_python_ast = { path = "../crates/ruff_python_ast" }
ruff_python_codegen = { path = "../crates/ruff_python_codegen" }
@ -26,12 +29,18 @@ ruff_python_formatter = { path = "../crates/ruff_python_formatter"}
ruff_text_size = { path = "../crates/ruff_text_size" }
libfuzzer-sys = { git = "https://github.com/rust-fuzz/libfuzzer", default-features = false }
salsa = { git = "https://github.com/salsa-rs/salsa.git", rev = "254c749b02cde2fd29852a7463a33e800b771758" }
similar = { version = "2.5.0" }
tracing = { version = "0.1.40" }
# Prevent this from interfering with workspaces
[workspace]
members = ["."]
[[bin]]
name = "red_knot_check_invalid_syntax"
path = "fuzz_targets/red_knot_check_invalid_syntax.rs"
[[bin]]
name = "ruff_parse_simple"
path = "fuzz_targets/ruff_parse_simple.rs"

View File

@ -74,6 +74,15 @@ Each fuzzer harness in [`fuzz_targets`](fuzz_targets) targets a different aspect
them in different ways. While there is implementation-specific documentation in the source code
itself, each harness is briefly described below.
### `red_knot_check_invalid_syntax`
This fuzz harness checks that the type checker (Red Knot) does not panic when checking a source
file with invalid syntax. This rejects any corpus entries that is already valid Python code.
Currently, this is limited to syntax errors that's produced by Ruff's Python parser which means
that it does not cover all possible syntax errors (<https://github.com/astral-sh/ruff/issues/11934>).
A possible workaround for now would be to bypass the parser and run the type checker on all inputs
regardless of syntax errors.
### `ruff_parse_simple`
This fuzz harness does not perform any "smart" testing of Ruff; it merely checks that the parsing

View File

@ -0,0 +1 @@
ruff_fix_validity

View File

@ -0,0 +1,143 @@
//! Fuzzer harness that runs the type checker to catch for panics for source code containing
//! syntax errors.
#![no_main]
use std::sync::{Mutex, OnceLock};
use libfuzzer_sys::{fuzz_target, Corpus};
use red_knot_python_semantic::types::check_types;
use red_knot_python_semantic::{
Db as SemanticDb, Program, ProgramSettings, PythonVersion, SearchPathSettings,
};
use ruff_db::files::{system_path_to_file, File, Files};
use ruff_db::system::{DbWithTestSystem, System, SystemPathBuf, TestSystem};
use ruff_db::vendored::VendoredFileSystem;
use ruff_db::{Db as SourceDb, Upcast};
use ruff_python_parser::{parse_unchecked, Mode};
/// Database that can be used for testing.
///
/// Uses an in memory filesystem and it stubs out the vendored files by default.
#[salsa::db]
struct TestDb {
storage: salsa::Storage<Self>,
files: Files,
system: TestSystem,
vendored: VendoredFileSystem,
events: std::sync::Arc<std::sync::Mutex<Vec<salsa::Event>>>,
}
impl TestDb {
fn new() -> Self {
Self {
storage: salsa::Storage::default(),
system: TestSystem::default(),
vendored: red_knot_vendored::file_system().clone(),
events: std::sync::Arc::default(),
files: Files::default(),
}
}
}
#[salsa::db]
impl SourceDb for TestDb {
fn vendored(&self) -> &VendoredFileSystem {
&self.vendored
}
fn system(&self) -> &dyn System {
&self.system
}
fn files(&self) -> &Files {
&self.files
}
}
impl DbWithTestSystem for TestDb {
fn test_system(&self) -> &TestSystem {
&self.system
}
fn test_system_mut(&mut self) -> &mut TestSystem {
&mut self.system
}
}
impl Upcast<dyn SourceDb> for TestDb {
fn upcast(&self) -> &(dyn SourceDb + 'static) {
self
}
fn upcast_mut(&mut self) -> &mut (dyn SourceDb + 'static) {
self
}
}
#[salsa::db]
impl SemanticDb for TestDb {
fn is_file_open(&self, file: File) -> bool {
!file.path(self).is_vendored_path()
}
}
#[salsa::db]
impl salsa::Database for TestDb {
fn salsa_event(&self, event: &dyn Fn() -> salsa::Event) {
let event = event();
tracing::trace!("event: {:?}", event);
let mut events = self.events.lock().unwrap();
events.push(event);
}
}
fn setup_db() -> TestDb {
let db = TestDb::new();
let src_root = SystemPathBuf::from("/src");
db.memory_file_system()
.create_directory_all(&src_root)
.unwrap();
Program::from_settings(
&db,
&ProgramSettings {
target_version: PythonVersion::default(),
search_paths: SearchPathSettings::new(src_root),
},
)
.expect("Valid search path settings");
db
}
static TEST_DB: OnceLock<Mutex<TestDb>> = OnceLock::new();
fn do_fuzz(case: &[u8]) -> Corpus {
let Ok(code) = std::str::from_utf8(case) else {
return Corpus::Reject;
};
let parsed = parse_unchecked(code, Mode::Module);
if parsed.is_valid() {
return Corpus::Reject;
}
let mut db = TEST_DB
.get_or_init(|| Mutex::new(setup_db()))
.lock()
.unwrap();
for path in &["/src/a.py", "/src/a.pyi"] {
db.write_file(path, code).unwrap();
let file = system_path_to_file(&*db, path).unwrap();
check_types(&*db, file);
db.memory_file_system().remove_file(path).unwrap();
file.sync(&mut *db);
}
Corpus::Keep
}
fuzz_target!(|case: &[u8]| -> Corpus { do_fuzz(case) });

View File

@ -11,16 +11,32 @@ fi
if [ ! -d corpus/ruff_fix_validity ]; then
mkdir -p corpus/ruff_fix_validity
read -p "Would you like to build a corpus from a python source code dataset? (this will take a long time!) [Y/n] " -n 1 -r
echo
cd corpus/ruff_fix_validity
if [[ $REPLY =~ ^[Yy]$ ]]; then
curl -L 'https://zenodo.org/record/3628784/files/python-corpus.tar.gz?download=1' | tar xz
(
cd corpus/ruff_fix_validity
read -p "Would you like to build a corpus from a python source code dataset? (this will take a long time!) [Y/n] " -n 1 -r
echo
if [[ $REPLY =~ ^[Yy]$ ]]; then
curl -L 'https://zenodo.org/record/3628784/files/python-corpus.tar.gz?download=1' | tar xz
fi
# Build a smaller corpus in addition to the (optional) larger corpus
curl -L 'https://github.com/python/cpython/archive/refs/tags/v3.13.0.tar.gz' | tar xz
cp -r "../../../crates/red_knot_workspace/resources/test/corpus" "red_knot_workspace"
cp -r "../../../crates/ruff_linter/resources/test/fixtures" "ruff_linter"
cp -r "../../../crates/ruff_python_formatter/resources/test/fixtures" "ruff_python_formatter"
cp -r "../../../crates/ruff_python_parser/resources" "ruff_python_parser"
# Delete all non-Python files
find . -type f -not -name "*.py" -delete
)
if [[ "$OSTYPE" == "darwin"* ]]; then
cargo +nightly fuzz cmin ruff_fix_validity -- -timeout=5
else
cargo fuzz cmin -s none ruff_fix_validity -- -timeout=5
fi
curl -L 'https://github.com/python/cpython/archive/refs/tags/v3.12.0b2.tar.gz' | tar xz
cp -r "../../../crates/ruff_linter/resources/test" .
cd -
cargo fuzz cmin -s none ruff_fix_validity -- -timeout=5
fi
echo "Done! You are ready to fuzz."