Index source code upfront to power (row, column) lookups (#1990)

## Summary

The problem: given a (row, column) position (e.g., for a token in the AST), we need to map it to a precise byte index in the source code. A while ago, we moved to `ropey` for this, since it was faster in practice (mostly, I think, because it's able to defer indexing). However, past some threshold of accesses, it becomes faster to index the string in advance, as we're doing here.
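
In essence, the strategy is as follows (a simplified sketch of the ASCII path; the real `Locator` below also handles UTF-8 and end-of-file locations, and `line_starts`/`to_byte_offset` are illustrative names, not the actual API):

```rust
/// Record the byte offset at which each line starts, in one upfront pass.
fn line_starts(contents: &str) -> Vec<usize> {
    let mut starts = vec![0];
    for (i, byte) in contents.bytes().enumerate() {
        if byte == b'\n' {
            starts.push(i + 1);
        }
    }
    starts
}

/// Map a one-indexed row and zero-indexed column to a byte offset in O(1).
fn to_byte_offset(starts: &[usize], row: usize, column: usize) -> usize {
    starts[row - 1] + column
}
```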

## Benchmark

It looks like this is ~3.6% slower for the default rule set, but ~9.3% faster for `--select ALL`.

**I suspect there's a strategy that would be strictly faster in both cases**, based on deferring even more of the computation (right now, we compute these offsets lazily, but for the entire file at once, even if we only need a slice near the top), or on caching the `ropey` lookups in some way.
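
As a sketch of the deferral idea (hypothetical — this `LazyLineIndex` is not part of this PR): extend the index only as far as the highest row requested so far, so a file that's only queried near the top never pays for a full scan.

```rust
/// Hypothetical incremental index: scans only as far as the highest row
/// requested so far, rather than indexing the entire file upfront.
struct LazyLineIndex<'a> {
    contents: &'a str,
    starts: Vec<usize>, // byte offset of each line start indexed so far
    scanned: usize,     // byte offset up to which `contents` has been scanned
}

impl<'a> LazyLineIndex<'a> {
    fn new(contents: &'a str) -> Self {
        Self { contents, starts: vec![0], scanned: 0 }
    }

    /// Byte offset of the start of the one-indexed `row`, extending the
    /// index on demand. Assumes `row` exists in `contents`.
    fn line_start(&mut self, row: usize) -> usize {
        while self.starts.len() < row {
            let rest = &self.contents.as_bytes()[self.scanned..];
            match rest.iter().position(|&b| b == b'\n') {
                Some(i) => {
                    self.scanned += i + 1;
                    self.starts.push(self.scanned);
                }
                None => break, // no further newlines; `row` must be valid
            }
        }
        self.starts[row - 1]
    }
}
```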

Before:

![main](https://user-images.githubusercontent.com/1309177/213883581-8f73c61d-2979-4171-88a6-a88d7ff07e40.png)

After:

![48 all](https://user-images.githubusercontent.com/1309177/213883586-3e049680-9ef9-49e2-8f04-fd6ff402eba7.png)

## Alternatives

I tried tweaking the `Vec::with_capacity` hints, and even pre-counting the number of lines with a quick scan via `Vec::with_capacity(str_indices::lines_crlf::count_breaks(contents))`, but that turned out to be slower.
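
For reference, that rejected variant looked roughly like this (a reconstructed sketch; the extra counting pass evidently cost more than it saved in reallocations):

```rust
use str_indices::lines_crlf;

/// Pre-size the line index by counting breaks first: one extra pass over the
/// source in exchange for a single, exact allocation. Measured slower overall.
fn index_ascii_presized(contents: &str) -> Vec<usize> {
    let mut index = Vec::with_capacity(lines_crlf::count_breaks(contents) + 1);
    index.push(0);
    for (i, byte) in contents.bytes().enumerate() {
        if byte == b'\n' {
            index.push(i + 1);
        }
    }
    index
}
```
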
Charlie Marsh 2023-01-21 17:56:11 -05:00, committed by GitHub
parent 08fc9b8095
commit 4dcf284a04
33 changed files with 306 additions and 148 deletions

View File

@ -1 +0,0 @@
from long_module_name import member_one, member_two, member_three, member_four, member_five

View File

@ -366,7 +366,7 @@ pub fn collect_arg_names<'a>(arguments: &'a Arguments) -> FxHashSet<&'a str> {
/// Returns `true` if a statement or expression includes at least one comment.
pub fn has_comments_in(range: Range, locator: &Locator) -> bool {
lexer::make_tokenizer(&locator.slice_source_code_range(&range))
lexer::make_tokenizer(locator.slice_source_code_range(&range))
.any(|result| result.map_or(false, |(_, tok, _)| matches!(tok, Tok::Comment(..))))
}
@ -486,7 +486,7 @@ pub fn identifier_range(stmt: &Stmt, locator: &Locator) -> Range {
| StmtKind::AsyncFunctionDef { .. }
) {
let contents = locator.slice_source_code_range(&Range::from_located(stmt));
for (start, tok, end) in lexer::make_tokenizer_located(&contents, stmt.location).flatten() {
for (start, tok, end) in lexer::make_tokenizer_located(contents, stmt.location).flatten() {
if matches!(tok, Tok::Name { .. }) {
return Range::new(start, end);
}
@ -515,7 +515,7 @@ pub fn binding_range(binding: &Binding, locator: &Locator) -> Range {
// Return the ranges of `Name` tokens within a specified node.
pub fn find_names<T>(located: &Located<T>, locator: &Locator) -> Vec<Range> {
let contents = locator.slice_source_code_range(&Range::from_located(located));
lexer::make_tokenizer_located(&contents, located.location)
lexer::make_tokenizer_located(contents, located.location)
.flatten()
.filter(|(_, tok, _)| matches!(tok, Tok::Name { .. }))
.map(|(start, _, end)| Range {
@ -535,7 +535,7 @@ pub fn excepthandler_name_range(handler: &Excepthandler, locator: &Locator) -> O
let type_end_location = type_.end_location.unwrap();
let contents =
locator.slice_source_code_range(&Range::new(type_end_location, body[0].location));
let range = lexer::make_tokenizer_located(&contents, type_end_location)
let range = lexer::make_tokenizer_located(contents, type_end_location)
.flatten()
.tuple_windows()
.find(|(tok, next_tok)| {
@ -562,7 +562,7 @@ pub fn except_range(handler: &Excepthandler, locator: &Locator) -> Range {
location: handler.location,
end_location: end,
});
let range = lexer::make_tokenizer_located(&contents, handler.location)
let range = lexer::make_tokenizer_located(contents, handler.location)
.flatten()
.find(|(_, kind, _)| matches!(kind, Tok::Except { .. }))
.map(|(location, _, end_location)| Range {
@ -576,7 +576,7 @@ pub fn except_range(handler: &Excepthandler, locator: &Locator) -> Range {
/// Find f-strings that don't contain any formatted values in a `JoinedStr`.
pub fn find_useless_f_strings(expr: &Expr, locator: &Locator) -> Vec<(Range, Range)> {
let contents = locator.slice_source_code_range(&Range::from_located(expr));
lexer::make_tokenizer_located(&contents, expr.location)
lexer::make_tokenizer_located(contents, expr.location)
.flatten()
.filter_map(|(location, tok, end_location)| match tok {
Tok::String {
@ -630,7 +630,7 @@ pub fn else_range(stmt: &Stmt, locator: &Locator) -> Option<Range> {
.expect("Expected orelse to be non-empty")
.location,
});
let range = lexer::make_tokenizer_located(&contents, body_end)
let range = lexer::make_tokenizer_located(contents, body_end)
.flatten()
.find(|(_, kind, _)| matches!(kind, Tok::Else))
.map(|(location, _, end_location)| Range {
@ -646,7 +646,7 @@ pub fn else_range(stmt: &Stmt, locator: &Locator) -> Option<Range> {
/// Return the `Range` of the first `Tok::Colon` token in a `Range`.
pub fn first_colon_range(range: Range, locator: &Locator) -> Option<Range> {
let contents = locator.slice_source_code_range(&range);
let range = lexer::make_tokenizer_located(&contents, range.location)
let range = lexer::make_tokenizer_located(contents, range.location)
.flatten()
.find(|(_, kind, _)| matches!(kind, Tok::Colon))
.map(|(location, _, end_location)| Range {
@ -676,7 +676,7 @@ pub fn elif_else_range(stmt: &Stmt, locator: &Locator) -> Option<Range> {
_ => return None,
};
let contents = locator.slice_source_code_range(&Range::new(start, end));
let range = lexer::make_tokenizer_located(&contents, start)
let range = lexer::make_tokenizer_located(contents, start)
.flatten()
.find(|(_, kind, _)| matches!(kind, Tok::Elif | Tok::Else))
.map(|(location, _, end_location)| Range {

View File

@ -1,4 +1,3 @@
use std::borrow::Cow;
use std::str::Lines;
use rustpython_ast::{Located, Location};
@ -7,7 +6,7 @@ use crate::ast::types::Range;
use crate::source_code::Locator;
/// Extract the leading indentation from a line.
pub fn indentation<'a, T>(locator: &'a Locator, located: &'a Located<T>) -> Option<Cow<'a, str>> {
pub fn indentation<'a, T>(locator: &'a Locator, located: &'a Located<T>) -> Option<&'a str> {
let range = Range::from_located(located);
let indentation = locator.slice_source_code_range(&Range::new(
Location::new(range.location.row(), 0),

View File

@ -80,7 +80,7 @@ fn is_lone_child(child: &Stmt, parent: &Stmt, deleted: &[&Stmt]) -> Result<bool>
/// of a multi-statement line.
fn trailing_semicolon(stmt: &Stmt, locator: &Locator) -> Option<Location> {
let contents = locator.slice_source_code_at(stmt.end_location.unwrap());
for (row, line) in LinesWithTrailingNewline::from(&contents).enumerate() {
for (row, line) in LinesWithTrailingNewline::from(contents).enumerate() {
let trimmed = line.trim();
if trimmed.starts_with(';') {
let column = line
@ -103,7 +103,7 @@ fn trailing_semicolon(stmt: &Stmt, locator: &Locator) -> Option<Location> {
fn next_stmt_break(semicolon: Location, locator: &Locator) -> Location {
let start_location = Location::new(semicolon.row(), semicolon.column() + 1);
let contents = locator.slice_source_code_at(start_location);
for (row, line) in LinesWithTrailingNewline::from(&contents).enumerate() {
for (row, line) in LinesWithTrailingNewline::from(contents).enumerate() {
let trimmed = line.trim();
// Skip past any continuations.
if trimmed.starts_with('\\') {
@ -202,7 +202,7 @@ pub fn remove_unused_imports<'a>(
indexer: &Indexer,
) -> Result<Fix> {
let module_text = locator.slice_source_code_range(&Range::from_located(stmt));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let Some(Statement::Simple(body)) = tree.body.first_mut() else {
bail!("Expected Statement::Simple");

View File

@ -54,7 +54,7 @@ fn apply_fixes<'a>(
// Add all contents from `last_pos` to `fix.location`.
let slice = locator.slice_source_code_range(&Range::new(last_pos, fix.location));
output.append(&slice);
output.append(slice);
// Add the patch itself.
output.append(&fix.content);
@ -67,7 +67,7 @@ fn apply_fixes<'a>(
// Add the remaining content.
let slice = locator.slice_source_code_at(last_pos);
output.append(&slice);
output.append(slice);
(Cow::from(output.finish()), num_fixed)
}

View File

@ -273,7 +273,7 @@ impl<'a> Checker<'a> {
Location::new(*noqa_lineno, 0),
Location::new(noqa_lineno + 1, 0),
));
match noqa::extract_noqa_directive(&line) {
match noqa::extract_noqa_directive(line) {
Directive::None => false,
Directive::All(..) => true,
Directive::Codes(.., codes) => noqa::includes(code, &codes),
@ -4610,12 +4610,13 @@ impl<'a> Checker<'a> {
Location::new(expr.location.row(), 0),
Location::new(expr.location.row(), expr.location.column()),
));
let body = pydocstyle::helpers::raw_contents(&contents);
let body = pydocstyle::helpers::raw_contents(contents);
let docstring = Docstring {
kind: definition.kind,
expr,
contents: &contents,
indentation: &indentation,
contents,
indentation,
body,
};

View File

@ -1,5 +1,3 @@
use std::borrow::Cow;
use rustpython_ast::{Expr, Stmt};
#[derive(Debug, Clone)]
@ -23,9 +21,9 @@ pub struct Definition<'a> {
pub struct Docstring<'a> {
pub kind: DefinitionKind<'a>,
pub expr: &'a Expr,
pub contents: &'a Cow<'a, str>,
pub contents: &'a str,
pub body: &'a str,
pub indentation: &'a Cow<'a, str>,
pub indentation: &'a str,
}
pub enum Documentable {

View File

@ -32,7 +32,7 @@ pub fn commented_out_code(
let line = locator.slice_source_code_range(&Range::new(location, end_location));
// Verify that the comment is on its own line, and that it contains code.
if is_standalone_comment(&line) && comment_contains_code(&line, &settings.task_tags[..]) {
if is_standalone_comment(line) && comment_contains_code(line, &settings.task_tags[..]) {
let mut diagnostic = Diagnostic::new(violations::CommentedOutCode, Range::new(start, end));
if matches!(autofix, flags::Autofix::Enabled)
&& settings.rules.should_fix(&Rule::CommentedOutCode)

View File

@ -16,7 +16,7 @@ pub fn add_return_none_annotation(locator: &Locator, stmt: &Stmt) -> Result<Fix>
let mut seen_lpar = false;
let mut seen_rpar = false;
let mut count: usize = 0;
for (start, tok, ..) in lexer::make_tokenizer_located(&contents, range.location).flatten() {
for (start, tok, ..) in lexer::make_tokenizer_located(contents, range.location).flatten() {
if seen_lpar && seen_rpar {
if matches!(tok, Tok::Colon) {
return Ok(Fix::insertion(" -> None".to_string(), start));

View File

@ -34,7 +34,7 @@ pub fn fix_unnecessary_generator_list(
) -> Result<Fix> {
// Expr(Call(GeneratorExp)))) -> Expr(ListComp)))
let module_text = locator.slice_source_code_range(&Range::from_located(expr));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let call = match_call(body)?;
let arg = match_arg(call)?;
@ -75,7 +75,7 @@ pub fn fix_unnecessary_generator_set(
) -> Result<Fix> {
// Expr(Call(GeneratorExp)))) -> Expr(SetComp)))
let module_text = locator.slice_source_code_range(&Range::from_located(expr));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let call = match_call(body)?;
let arg = match_arg(call)?;
@ -116,7 +116,7 @@ pub fn fix_unnecessary_generator_dict(
expr: &rustpython_ast::Expr,
) -> Result<Fix> {
let module_text = locator.slice_source_code_range(&Range::from_located(expr));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let call = match_call(body)?;
let arg = match_arg(call)?;
@ -175,7 +175,7 @@ pub fn fix_unnecessary_list_comprehension_set(
// Expr(Call(ListComp)))) ->
// Expr(SetComp)))
let module_text = locator.slice_source_code_range(&Range::from_located(expr));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let call = match_call(body)?;
let arg = match_arg(call)?;
@ -214,7 +214,7 @@ pub fn fix_unnecessary_list_comprehension_dict(
expr: &rustpython_ast::Expr,
) -> Result<Fix> {
let module_text = locator.slice_source_code_range(&Range::from_located(expr));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let call = match_call(body)?;
let arg = match_arg(call)?;
@ -305,7 +305,7 @@ fn drop_trailing_comma<'a>(
pub fn fix_unnecessary_literal_set(locator: &Locator, expr: &rustpython_ast::Expr) -> Result<Fix> {
// Expr(Call(List|Tuple)))) -> Expr(Set)))
let module_text = locator.slice_source_code_range(&Range::from_located(expr));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let mut call = match_call(body)?;
let arg = match_arg(call)?;
@ -348,7 +348,7 @@ pub fn fix_unnecessary_literal_set(locator: &Locator, expr: &rustpython_ast::Exp
pub fn fix_unnecessary_literal_dict(locator: &Locator, expr: &rustpython_ast::Expr) -> Result<Fix> {
// Expr(Call(List|Tuple)))) -> Expr(Dict)))
let module_text = locator.slice_source_code_range(&Range::from_located(expr));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let call = match_call(body)?;
let arg = match_arg(call)?;
@ -416,7 +416,7 @@ pub fn fix_unnecessary_collection_call(
) -> Result<Fix> {
// Expr(Call("list" | "tuple" | "dict")))) -> Expr(List|Tuple|Dict)
let module_text = locator.slice_source_code_range(&Range::from_located(expr));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let call = match_call(body)?;
let Expression::Name(name) = &call.func.as_ref() else {
@ -524,7 +524,7 @@ pub fn fix_unnecessary_literal_within_tuple_call(
expr: &rustpython_ast::Expr,
) -> Result<Fix> {
let module_text = locator.slice_source_code_range(&Range::from_located(expr));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let call = match_call(body)?;
let arg = match_arg(call)?;
@ -578,7 +578,7 @@ pub fn fix_unnecessary_literal_within_list_call(
expr: &rustpython_ast::Expr,
) -> Result<Fix> {
let module_text = locator.slice_source_code_range(&Range::from_located(expr));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let call = match_call(body)?;
let arg = match_arg(call)?;
@ -632,7 +632,7 @@ pub fn fix_unnecessary_literal_within_list_call(
pub fn fix_unnecessary_list_call(locator: &Locator, expr: &rustpython_ast::Expr) -> Result<Fix> {
// Expr(Call(List|Tuple)))) -> Expr(List|Tuple)))
let module_text = locator.slice_source_code_range(&Range::from_located(expr));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let call = match_call(body)?;
let arg = match_arg(call)?;
@ -657,7 +657,7 @@ pub fn fix_unnecessary_call_around_sorted(
expr: &rustpython_ast::Expr,
) -> Result<Fix> {
let module_text = locator.slice_source_code_range(&Range::from_located(expr));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let outer_call = match_call(body)?;
let inner_call = match &outer_call.args[..] {
@ -739,7 +739,7 @@ pub fn fix_unnecessary_comprehension(
expr: &rustpython_ast::Expr,
) -> Result<Fix> {
let module_text = locator.slice_source_code_range(&Range::from_located(expr));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
match &body.value {

View File

@ -109,7 +109,7 @@ fn implicit_return(checker: &mut Checker, last_stmt: &Stmt) {
if checker.patch(&Rule::ImplicitReturn) {
if let Some(indent) = indentation(checker.locator, last_stmt) {
let mut content = String::new();
content.push_str(&indent);
content.push_str(indent);
content.push_str("return None");
content.push('\n');
diagnostic.amend(Fix::insertion(

View File

@ -1,5 +1,3 @@
use std::borrow::Cow;
use anyhow::{bail, Result};
use libcst_native::{
BooleanOp, BooleanOperation, Codegen, CodegenState, CompoundStatement, Expression, If,
@ -52,9 +50,9 @@ pub(crate) fn fix_nested_if_statements(
// If this is an `elif`, we have to remove the `elif` keyword for now. (We'll
// restore the `el` later on.)
let module_text = if is_elif {
Cow::Owned(contents.replacen("elif", "if", 1))
contents.replacen("elif", "if", 1)
} else {
contents
contents.to_string()
};
// If the block is indented, "embed" it in a function definition, to preserve
@ -63,10 +61,7 @@ pub(crate) fn fix_nested_if_statements(
let module_text = if outer_indent.is_empty() {
module_text
} else {
Cow::Owned(format!(
"def f():{}{module_text}",
stylist.line_ending().as_str()
))
format!("def f():{}{module_text}", stylist.line_ending().as_str())
};
// Parse the CST.
@ -82,7 +77,7 @@ pub(crate) fn fix_nested_if_statements(
let Suite::IndentedBlock(indented_block) = &mut embedding.body else {
bail!("Expected indented block")
};
indented_block.indent = Some(&outer_indent);
indented_block.indent = Some(outer_indent);
&mut *indented_block.body
};

View File

@ -1,5 +1,3 @@
use std::borrow::Cow;
use anyhow::{bail, Result};
use libcst_native::{Codegen, CodegenState, CompoundStatement, Statement, Suite, With};
use rustpython_ast::Location;
@ -30,9 +28,9 @@ pub(crate) fn fix_multiple_with_statements(
// indentation while retaining valid source code. (We'll strip the prefix later
// on.)
let module_text = if outer_indent.is_empty() {
contents
contents.to_string()
} else {
Cow::Owned(format!("def f():\n{contents}"))
format!("def f():\n{contents}")
};
// Parse the CST.
@ -48,7 +46,7 @@ pub(crate) fn fix_multiple_with_statements(
let Suite::IndentedBlock(indented_block) = &mut embedding.body else {
bail!("Expected indented block")
};
indented_block.indent = Some(&outer_indent);
indented_block.indent = Some(outer_indent);
&mut *indented_block.body
};

View File

@ -17,7 +17,7 @@ pub struct Comment<'a> {
/// Collect all comments in an import block.
pub fn collect_comments<'a>(range: &Range, locator: &'a Locator) -> Vec<Comment<'a>> {
let contents = locator.slice_source_code_range(range);
lexer::make_tokenizer_located(&contents, range.location)
lexer::make_tokenizer_located(contents, range.location)
.flatten()
.filter_map(|(start, tok, end)| {
if let Tok::Comment(value) = tok {

View File

@ -13,7 +13,7 @@ pub fn trailing_comma(stmt: &Stmt, locator: &Locator) -> TrailingComma {
let contents = locator.slice_source_code_range(&Range::from_located(stmt));
let mut count: usize = 0;
let mut trailing_comma = TrailingComma::Absent;
for (_, tok, _) in lexer::make_tokenizer(&contents).flatten() {
for (_, tok, _) in lexer::make_tokenizer(contents).flatten() {
if matches!(tok, Tok::Lpar) {
count += 1;
}
@ -110,7 +110,7 @@ pub fn find_splice_location(body: &[Stmt], locator: &Locator) -> Location {
// Find the first token that isn't a comment or whitespace.
let contents = locator.slice_source_code_at(splice);
for (.., tok, end) in lexer::make_tokenizer(&contents).flatten() {
for (.., tok, end) in lexer::make_tokenizer(contents).flatten() {
if matches!(tok, Tok::Comment(..) | Tok::Newline) {
splice = end;
} else {

View File

@ -701,7 +701,6 @@ mod tests {
#[test_case(Path::new("insert_empty_lines.py"))]
#[test_case(Path::new("insert_empty_lines.pyi"))]
#[test_case(Path::new("leading_prefix.py"))]
#[test_case(Path::new("line_ending_cr.py"))]
#[test_case(Path::new("line_ending_crlf.py"))]
#[test_case(Path::new("line_ending_lf.py"))]
#[test_case(Path::new("magic_trailing_comma.py"))]

View File

@ -38,7 +38,7 @@ pub fn organize_imports(
package: Option<&Path>,
) -> Option<Diagnostic> {
let indentation = locator.slice_source_code_range(&extract_indentation_range(&block.imports));
let indentation = leading_space(&indentation);
let indentation = leading_space(indentation);
let range = extract_range(&block.imports);
@ -96,7 +96,7 @@ pub fn organize_imports(
Location::new(range.location.row(), 0),
Location::new(range.end_location.row() + 1 + num_trailing_lines, 0),
);
let actual = dedent(&locator.slice_source_code_range(&range));
let actual = dedent(locator.slice_source_code_range(&range));
if actual == dedent(&expected) {
None
} else {

View File

@ -1,22 +0,0 @@
---
source: src/rules/isort/mod.rs
expression: diagnostics
---
- kind:
UnsortedImports: ~
location:
row: 1
column: 0
end_location:
row: 2
column: 0
fix:
content: "from long_module_name import (\r member_five,\r member_four,\r member_one,\r member_three,\r member_two,\r)\r"
location:
row: 1
column: 0
end_location:
row: 2
column: 0
parent: ~

View File

@ -480,7 +480,7 @@ pub fn do_not_assign_lambda(checker: &mut Checker, target: &Expr, value: &Expr,
Location::new(stmt.location.row(), 0),
Location::new(stmt.location.row() + 1, 0),
));
let indentation = &leading_space(&first_line);
let indentation = &leading_space(first_line);
let mut indented = String::new();
for (idx, line) in function(id, args, body, checker.stylist)
.lines()
@ -603,7 +603,7 @@ pub fn invalid_escape_sequence(
let text = locator.slice_source_code_range(&Range::new(start, end));
// Determine whether the string is single- or triple-quoted.
let quote = extract_quote(&text);
let quote = extract_quote(text);
let quote_pos = text.find(quote).unwrap();
let prefix = text[..quote_pos].to_lowercase();
let body = &text[(quote_pos + quote.len())..(text.len() - quote.len())];

View File

@ -74,7 +74,7 @@ pub fn blank_before_after_function(checker: &mut Checker, docstring: &Docstring)
.count();
// Avoid D202 violations for blank lines followed by inner functions or classes.
if blank_lines_after == 1 && INNER_FUNCTION_OR_CLASS_REGEX.is_match(&after) {
if blank_lines_after == 1 && INNER_FUNCTION_OR_CLASS_REGEX.is_match(after) {
return;
}

View File

@ -17,7 +17,7 @@ pub fn remove_unused_format_arguments_from_dict(
locator: &Locator,
) -> Result<Fix> {
let module_text = locator.slice_source_code_range(&Range::from_located(stmt));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let new_dict = {
@ -63,7 +63,7 @@ pub fn remove_unused_keyword_arguments_from_format_call(
locator: &Locator,
) -> Result<Fix> {
let module_text = locator.slice_source_code_range(&location);
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut body = match_expr(&mut tree)?;
let new_call = {
@ -112,7 +112,7 @@ pub fn remove_exception_handler_assignment(
// End of the token just before the `as` to the semicolon.
let mut prev = None;
for (start, tok, end) in
lexer::make_tokenizer_located(&contents, excepthandler.location).flatten()
lexer::make_tokenizer_located(contents, excepthandler.location).flatten()
{
if matches!(tok, Tok::As) {
fix_start = prev;

View File

@ -18,7 +18,7 @@ pub fn invalid_literal_comparison(
comparators: &[Expr],
location: Range,
) {
let located = Lazy::new(|| locate_cmpops(&checker.locator.slice_source_code_range(&location)));
let located = Lazy::new(|| locate_cmpops(checker.locator.slice_source_code_range(&location)));
let mut left = left;
for (index, (op, right)) in izip!(ops, comparators).enumerate() {
if matches!(op, Cmpop::Is | Cmpop::IsNot)

View File

@ -18,7 +18,7 @@ where
F: Fn(Tok) -> bool,
{
let contents = locator.slice_source_code_range(&Range::from_located(stmt));
for ((_, tok, _), (start, ..)) in lexer::make_tokenizer_located(&contents, stmt.location)
for ((_, tok, _), (start, ..)) in lexer::make_tokenizer_located(contents, stmt.location)
.flatten()
.tuple_windows()
{

View File

@ -24,7 +24,7 @@ pub fn remove_class_def_base(
let mut fix_start = None;
let mut fix_end = None;
let mut count: usize = 0;
for (start, tok, end) in lexer::make_tokenizer_located(&contents, stmt_at).flatten() {
for (start, tok, end) in lexer::make_tokenizer_located(contents, stmt_at).flatten() {
if matches!(tok, Tok::Lpar) {
if count == 0 {
fix_start = Some(start);
@ -57,7 +57,7 @@ pub fn remove_class_def_base(
let mut fix_start: Option<Location> = None;
let mut fix_end: Option<Location> = None;
let mut seen_comma = false;
for (start, tok, end) in lexer::make_tokenizer_located(&contents, stmt_at).flatten() {
for (start, tok, end) in lexer::make_tokenizer_located(contents, stmt_at).flatten() {
if seen_comma {
if matches!(tok, Tok::NonLogicalNewline) {
// Also delete any non-logical newlines after the comma.
@ -87,7 +87,7 @@ pub fn remove_class_def_base(
// isn't a comma.
let mut fix_start: Option<Location> = None;
let mut fix_end: Option<Location> = None;
for (start, tok, end) in lexer::make_tokenizer_located(&contents, stmt_at).flatten() {
for (start, tok, end) in lexer::make_tokenizer_located(contents, stmt_at).flatten() {
if start == expr_at {
fix_end = Some(end);
break;
@ -109,7 +109,7 @@ pub fn remove_super_arguments(locator: &Locator, expr: &Expr) -> Option<Fix> {
let range = Range::from_located(expr);
let contents = locator.slice_source_code_range(&range);
let mut tree = libcst_native::parse_module(&contents, None).ok()?;
let mut tree = libcst_native::parse_module(contents, None).ok()?;
let Statement::Simple(body) = tree.body.first_mut()? else {
return None;

View File

@ -33,7 +33,7 @@ impl<'a> FormatSummaryValues<'a> {
let arg = checker
.locator
.slice_source_code_range(&Range::from_located(arg));
if contains_invalids(&arg) {
if contains_invalids(arg) {
return None;
}
extracted_args.push(arg.to_string());
@ -44,7 +44,7 @@ impl<'a> FormatSummaryValues<'a> {
let kwarg = checker
.locator
.slice_source_code_range(&Range::from_located(value));
if contains_invalids(&kwarg) {
if contains_invalids(kwarg) {
return None;
}
extracted_kwargs.insert(key, kwarg.to_string());
@ -119,7 +119,7 @@ fn try_convert_to_f_string(checker: &Checker, expr: &Expr) -> Option<String> {
.slice_source_code_range(&Range::from_located(value));
// Tokenize: we need to avoid trying to fix implicit string concatenations.
if lexer::make_tokenizer(&contents)
if lexer::make_tokenizer(contents)
.flatten()
.filter(|(_, tok, _)| matches!(tok, Tok::String { .. }))
.count()
@ -133,7 +133,7 @@ fn try_convert_to_f_string(checker: &Checker, expr: &Expr) -> Option<String> {
let contents = if contents.starts_with('U') || contents.starts_with('u') {
&contents[1..]
} else {
&contents
contents
};
if contents.is_empty() {
return None;

View File

@ -102,7 +102,7 @@ pub(crate) fn format_literals(checker: &mut Checker, summary: &FormatSummary, ex
// Currently, the only issue we know of is in LibCST:
// https://github.com/Instagram/LibCST/issues/846
if let Ok(contents) = generate_call(
&checker
checker
.locator
.slice_source_code_range(&Range::from_located(expr)),
&summary.indexes,

View File

@ -84,7 +84,7 @@ pub fn native_literals(
let arg_code = checker
.locator
.slice_source_code_range(&Range::from_located(arg));
if lexer::make_tokenizer(&arg_code)
if lexer::make_tokenizer(arg_code)
.flatten()
.filter(|(_, tok, _)| matches!(tok, Tok::String { .. }))
.count()

View File

@ -186,7 +186,7 @@ fn clean_params_dictionary(checker: &mut Checker, right: &Expr) -> Option<String
.slice_source_code_range(&Range::from_located(value));
contents.push_str(key_string);
contents.push('=');
contents.push_str(&value_string);
contents.push_str(value_string);
arguments.push(contents);
} else {
// If there are any non-string keys, abort.
@ -205,7 +205,7 @@ fn clean_params_dictionary(checker: &mut Checker, right: &Expr) -> Option<String
for item in &arguments {
contents.push('\n');
contents.push_str(&indent);
contents.push_str(indent);
contents.push_str(item);
contents.push(',');
}
@ -217,7 +217,7 @@ fn clean_params_dictionary(checker: &mut Checker, right: &Expr) -> Option<String
if let Some(ident) = indent.strip_prefix(default_indent) {
contents.push_str(ident);
} else {
contents.push_str(&indent);
contents.push_str(indent);
}
} else {
contents.push_str(&arguments.join(", "));
@ -304,7 +304,7 @@ pub(crate) fn printf_string_formatting(
let mut strings: Vec<(Location, Location)> = vec![];
let mut extension = None;
for (start, tok, end) in lexer::make_tokenizer_located(
&checker
checker
.locator
.slice_source_code_range(&Range::from_located(expr)),
expr.location,
@ -333,7 +333,7 @@ pub(crate) fn printf_string_formatting(
let string = checker
.locator
.slice_source_code_range(&Range::new(*start, *end));
let (Some(leader), Some(trailer)) = (leading_quote(&string), trailing_quote(&string)) else {
let (Some(leader), Some(trailer)) = (leading_quote(string), trailing_quote(string)) else {
return;
};
let string = &string[leader.len()..string.len() - trailer.len()];
@ -371,14 +371,14 @@ pub(crate) fn printf_string_formatting(
match prev {
None => {
contents.push_str(
&checker
checker
.locator
.slice_source_code_range(&Range::new(expr.location, *start)),
);
}
Some(prev) => {
contents.push_str(
&checker
checker
.locator
.slice_source_code_range(&Range::new(prev, *start)),
);
@ -391,7 +391,7 @@ pub(crate) fn printf_string_formatting(
if let Some((.., end)) = extension {
contents.push_str(
&checker
checker
.locator
.slice_source_code_range(&Range::new(prev.unwrap(), end)),
);

View File

@ -112,7 +112,7 @@ fn create_remove_param_fix(locator: &Locator, expr: &Expr, mode_param: &Expr) ->
let mut fix_end: Option<Location> = None;
let mut is_first_arg: bool = false;
let mut delete_first_arg: bool = false;
for (start, tok, end) in lexer::make_tokenizer_located(&content, expr.location).flatten() {
for (start, tok, end) in lexer::make_tokenizer_located(content, expr.location).flatten() {
if start == mode_param.location {
if is_first_arg {
delete_first_arg = true;

View File

@ -56,7 +56,7 @@ fn generate_fix(locator: &Locator, stdout: &Keyword, stderr: &Keyword) -> Option
};
let mut contents = String::from("capture_output=True");
if let Some(middle) = extract_middle(
&locator.slice_source_code_range(&Range::new(first.end_location.unwrap(), last.location)),
locator.slice_source_code_range(&Range::new(first.end_location.unwrap(), last.location)),
) {
if middle.multi_line {
let Some(indent) = indentation(locator, first) else {
@ -64,7 +64,7 @@ fn generate_fix(locator: &Locator, stdout: &Keyword, stderr: &Keyword) -> Option
};
contents.push(',');
contents.push('\n');
contents.push_str(&indent);
contents.push_str(indent);
} else {
contents.push(',');
contents.push(' ');

View File

@ -117,7 +117,7 @@ fn format_import(
stylist: &Stylist,
) -> Result<String> {
let module_text = locator.slice_source_code_range(&Range::from_located(stmt));
let mut tree = match_module(&module_text)?;
let mut tree = match_module(module_text)?;
let mut import = match_import(&mut tree)?;
let Import { names, .. } = import.clone();
@ -147,7 +147,7 @@ fn format_import_from(
stylist: &Stylist,
) -> Result<String> {
let module_text = locator.slice_source_code_range(&Range::from_located(stmt));
let mut tree = match_module(&module_text).unwrap();
let mut tree = match_module(module_text).unwrap();
let mut import = match_import_from(&mut tree)?;
let ImportFrom {
@ -228,7 +228,7 @@ pub fn rewrite_mock_import(checker: &mut Checker, stmt: &Stmt) {
// Generate the fix, if needed, which is shared between all `mock` imports.
let content = if checker.patch(&Rule::RewriteMockImport) {
if let Some(indent) = indentation(checker.locator, stmt) {
match format_import(stmt, &indent, checker.locator, checker.stylist) {
match format_import(stmt, indent, checker.locator, checker.stylist) {
Ok(content) => Some(content),
Err(e) => {
error!("Failed to rewrite `mock` import: {e}");
@ -277,7 +277,7 @@ pub fn rewrite_mock_import(checker: &mut Checker, stmt: &Stmt) {
);
if checker.patch(&Rule::RewriteMockImport) {
if let Some(indent) = indentation(checker.locator, stmt) {
match format_import_from(stmt, &indent, checker.locator, checker.stylist) {
match format_import_from(stmt, indent, checker.locator, checker.stylist) {
Ok(content) => {
diagnostic.amend(Fix::replacement(
content,

View File

@ -1,65 +1,256 @@
//! Struct used to efficiently slice source code at (row, column) Locations.
use std::borrow::Cow;
use once_cell::unsync::OnceCell;
use ropey::Rope;
use rustpython_ast::Location;
use crate::ast::types::Range;
pub struct Locator<'a> {
contents: &'a str,
rope: OnceCell<Rope>,
index: OnceCell<Index>,
}
pub enum Index {
Ascii(Vec<usize>),
Utf8(Vec<Vec<usize>>),
}
/// Compute the starting byte index of each line in ASCII source code.
fn index_ascii(contents: &str) -> Vec<usize> {
let mut index = Vec::with_capacity(48);
index.push(0);
let bytes = contents.as_bytes();
for (i, byte) in bytes.iter().enumerate() {
if *byte == b'\n' {
index.push(i + 1);
}
}
index
}
/// Compute the starting byte index of each character in UTF-8 source code.
fn index_utf8(contents: &str) -> Vec<Vec<usize>> {
let mut index = Vec::with_capacity(48);
let mut current_row = Vec::with_capacity(48);
let mut current_byte_offset = 0;
let mut previous_char = '\0';
for char in contents.chars() {
current_row.push(current_byte_offset);
if char == '\n' {
if previous_char == '\r' {
current_row.pop();
}
index.push(current_row);
current_row = Vec::with_capacity(48);
}
current_byte_offset += char.len_utf8();
previous_char = char;
}
index.push(current_row);
index
}
/// Compute the starting byte index of each line in source code.
pub fn index(contents: &str) -> Index {
if contents.is_ascii() {
Index::Ascii(index_ascii(contents))
} else {
Index::Utf8(index_utf8(contents))
}
}
/// Truncate a [`Location`] to a byte offset in ASCII source code.
fn truncate_ascii(location: Location, index: &[usize], contents: &str) -> usize {
if location.row() - 1 == index.len() && location.column() == 0
|| (!index.is_empty()
&& location.row() - 1 == index.len() - 1
&& index[location.row() - 1] + location.column() >= contents.len())
{
contents.len()
} else {
index[location.row() - 1] + location.column()
}
}
/// Truncate a [`Location`] to a byte offset in UTF-8 source code.
fn truncate_utf8(location: Location, index: &[Vec<usize>], contents: &str) -> usize {
if (location.row() - 1 == index.len() && location.column() == 0)
|| (location.row() - 1 == index.len() - 1
&& location.column() == index[location.row() - 1].len())
{
contents.len()
} else {
index[location.row() - 1][location.column()]
}
}
/// Truncate a [`Location`] to a byte offset in source code.
fn truncate(location: Location, index: &Index, contents: &str) -> usize {
match index {
Index::Ascii(index) => truncate_ascii(location, index, contents),
Index::Utf8(index) => truncate_utf8(location, index, contents),
}
}
impl<'a> Locator<'a> {
pub fn new(contents: &'a str) -> Self {
Locator {
contents,
rope: OnceCell::default(),
index: OnceCell::new(),
}
}
fn get_or_init_rope(&self) -> &Rope {
self.rope.get_or_init(|| Rope::from_str(self.contents))
}
fn get_or_init_index(&self) -> &Index {
self.index.get_or_init(|| index(self.contents))
}
pub fn slice_source_code_at(&self, location: Location) -> Cow<'_, str> {
let rope = self.get_or_init_rope();
let offset = rope.line_to_char(location.row() - 1) + location.column();
Cow::from(rope.slice(offset..))
}
pub fn slice_source_code_until(&self, location: Location) -> &'a str {
let index = self.get_or_init_index();
let offset = truncate(location, index, self.contents);
&self.contents[..offset]
}
pub fn slice_source_code_until(&self, location: Location) -> Cow<'_, str> {
let rope = self.get_or_init_rope();
let offset = rope.line_to_char(location.row() - 1) + location.column();
Cow::from(rope.slice(..offset))
}
pub fn slice_source_code_at(&self, location: Location) -> &'a str {
let index = self.get_or_init_index();
let offset = truncate(location, index, self.contents);
&self.contents[offset..]
}
pub fn slice_source_code_range(&self, range: &Range) -> Cow<'_, str> {
let rope = self.get_or_init_rope();
let start = rope.line_to_char(range.location.row() - 1) + range.location.column();
let end = rope.line_to_char(range.end_location.row() - 1) + range.end_location.column();
Cow::from(rope.slice(start..end))
}
pub fn slice_source_code_range(&self, range: &Range) -> &'a str {
let index = self.get_or_init_index();
let start = truncate(range.location, index, self.contents);
let end = truncate(range.end_location, index, self.contents);
&self.contents[start..end]
}
pub fn partition_source_code_at(
&self,
outer: &Range,
inner: &Range,
) -> (Cow<'_, str>, Cow<'_, str>, Cow<'_, str>) {
let rope = self.get_or_init_rope();
let outer_start = rope.line_to_char(outer.location.row() - 1) + outer.location.column();
let outer_end =
rope.line_to_char(outer.end_location.row() - 1) + outer.end_location.column();
let inner_start = rope.line_to_char(inner.location.row() - 1) + inner.location.column();
let inner_end =
rope.line_to_char(inner.end_location.row() - 1) + inner.end_location.column();
(
Cow::from(rope.slice(outer_start..inner_start)),
Cow::from(rope.slice(inner_start..inner_end)),
Cow::from(rope.slice(inner_end..outer_end)),
)
}
pub fn partition_source_code_at(
&self,
outer: &Range,
inner: &Range,
) -> (&'a str, &'a str, &'a str) {
let index = self.get_or_init_index();
let outer_start = truncate(outer.location, index, self.contents);
let outer_end = truncate(outer.end_location, index, self.contents);
let inner_start = truncate(inner.location, index, self.contents);
let inner_end = truncate(inner.end_location, index, self.contents);
(
&self.contents[outer_start..inner_start],
&self.contents[inner_start..inner_end],
&self.contents[inner_end..outer_end],
)
}
}
#[cfg(test)]
mod tests {
use rustpython_ast::Location;
use crate::source_code::locator::{index_ascii, index_utf8, truncate_ascii, truncate_utf8};
#[test]
fn ascii_index() {
let contents = "";
let index = index_ascii(contents);
assert_eq!(index, [0]);
let contents = "x = 1";
let index = index_ascii(contents);
assert_eq!(index, [0]);
let contents = "x = 1\n";
let index = index_ascii(contents);
assert_eq!(index, [0, 6]);
let contents = "x = 1\r\n";
let index = index_ascii(contents);
assert_eq!(index, [0, 7]);
let contents = "x = 1\ny = 2\nz = x + y\n";
let index = index_ascii(contents);
assert_eq!(index, [0, 6, 12, 22]);
}
#[test]
fn ascii_truncate() {
let contents = "x = 1\ny = 2";
let index = index_ascii(contents);
// First row.
let loc = truncate_ascii(Location::new(1, 0), &index, contents);
assert_eq!(loc, 0);
// Second row.
let loc = truncate_ascii(Location::new(2, 0), &index, contents);
assert_eq!(loc, 6);
// One-past-the-end.
let loc = truncate_ascii(Location::new(3, 0), &index, contents);
assert_eq!(loc, 11);
}
#[test]
fn utf8_index() {
let contents = "";
let index = index_utf8(contents);
assert_eq!(index.len(), 1);
assert_eq!(index[0], Vec::<usize>::new());
let contents = "x = 1";
let index = index_utf8(contents);
assert_eq!(index.len(), 1);
assert_eq!(index[0], [0, 1, 2, 3, 4]);
let contents = "x = 1\n";
let index = index_utf8(contents);
assert_eq!(index.len(), 2);
assert_eq!(index[0], [0, 1, 2, 3, 4, 5]);
assert_eq!(index[1], Vec::<usize>::new());
let contents = "x = 1\r\n";
let index = index_utf8(contents);
assert_eq!(index.len(), 2);
assert_eq!(index[0], [0, 1, 2, 3, 4, 5]);
assert_eq!(index[1], Vec::<usize>::new());
let contents = "x = 1\ny = 2\nz = x + y\n";
let index = index_utf8(contents);
assert_eq!(index.len(), 4);
assert_eq!(index[0], [0, 1, 2, 3, 4, 5]);
assert_eq!(index[1], [6, 7, 8, 9, 10, 11]);
assert_eq!(index[2], [12, 13, 14, 15, 16, 17, 18, 19, 20, 21]);
assert_eq!(index[3], Vec::<usize>::new());
let contents = "# \u{4e9c}\nclass Foo:\n \"\"\".\"\"\"";
let index = index_utf8(contents);
assert_eq!(index.len(), 3);
assert_eq!(index[0], [0, 1, 2, 5]);
assert_eq!(index[1], [6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]);
assert_eq!(index[2], [17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27]);
}
#[test]
fn utf8_truncate() {
let contents = "x = '☃'\ny = 2";
let index = index_utf8(contents);
// First row.
let loc = truncate_utf8(Location::new(1, 0), &index, contents);
assert_eq!(loc, 0);
let loc = truncate_utf8(Location::new(1, 5), &index, contents);
assert_eq!(loc, 5);
assert_eq!(&contents[loc..], "☃'\ny = 2");
let loc = truncate_utf8(Location::new(1, 6), &index, contents);
assert_eq!(loc, 8);
assert_eq!(&contents[loc..], "'\ny = 2");
// Second row.
let loc = truncate_utf8(Location::new(2, 0), &index, contents);
assert_eq!(loc, 10);
// One-past-the-end.
let loc = truncate_utf8(Location::new(3, 0), &index, contents);
assert_eq!(loc, 15);
}
}

View File

@ -170,7 +170,7 @@ fn detect_quote(contents: &str, locator: &Locator) -> Option<Quote> {
for (start, tok, end) in lexer::make_tokenizer(contents).flatten() {
if let Tok::String { .. } = tok {
let content = locator.slice_source_code_range(&Range::new(start, end));
if let Some(pattern) = leading_quote(&content) {
if let Some(pattern) = leading_quote(content) {
if pattern.contains('\'') {
return Some(Quote::Single);
} else if pattern.contains('"') {