Box other strings

2024-02-08 16:25:48 -05:00 · 2024-02-08 16:25:48 -05:00 · 56b148bb43
parent 0a5a4f6d92
commit 56b148bb43
10 changed files with 30 additions and 377 deletions
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -217,12 +217,12 @@ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"

 [[package]]
 name = "bstr"
-version = "1.6.2"
+version = "1.9.0"
 source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a"
+checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc"
 dependencies = [
 "memchr",
- "regex-automata 0.3.9",
+ "regex-automata 0.4.3",
 "serde",
 ]

@@ -1921,12 +1921,6 @@ dependencies = [
 "regex-syntax 0.6.29",
 ]

-[[package]]
-name = "regex-automata"
-version = "0.3.9"
-source = "registry+https://github.com/rust-lang/crates.io-index"
-checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9"
-
 [[package]]
 name = "regex-automata"
 version = "0.4.3"
@@ -2342,6 +2336,7 @@ version = "0.0.0"
 dependencies = [
 "anyhow",
 "bitflags 2.4.1",
+ "bstr",
 "insta",
 "is-macro",
 "itertools 0.12.1",
--- a/Cargo.toml
+++ b/Cargo.toml
@@ -19,6 +19,7 @@ argfile = { version = "0.1.6" }
 assert_cmd = { version = "2.0.13" }
 bincode = { version = "1.3.3" }
 bitflags = { version = "2.4.1" }
+bstr = { version = "1.9.0" }
 cachedir = { version = "0.3.1" }
 chrono = { version = "0.4.33", default-features = false, features = ["clock"] }
 clap = { version = "4.4.18", features = ["derive"] }
--- a/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs
+++ b/crates/ruff_linter/src/rules/flake8_bandit/rules/hardcoded_bind_all_interfaces.rs
@@ -40,7 +40,9 @@ impl Violation for HardcodedBindAllInterfaces {
 pub(crate) fn hardcoded_bind_all_interfaces(checker: &mut Checker, string: StringLike) {
    let is_bind_all_interface = match string {
        StringLike::StringLiteral(ast::ExprStringLiteral { value, .. }) => value == "0.0.0.0",
-        StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => value == "0.0.0.0",
+        StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => {
+            &**value == "0.0.0.0"
+        }
        StringLike::BytesLiteral(_) => return,
    };

--- a/crates/ruff_linter/src/rules/flynt/helpers.rs
+++ b/crates/ruff_linter/src/rules/flynt/helpers.rs
@@ -15,7 +15,7 @@ fn to_f_string_expression_element(inner: &Expr) -> ast::FStringElement {
 /// Convert a string to a [`ast::FStringElement::Literal`].
 pub(super) fn to_f_string_literal_element(s: &str) -> ast::FStringElement {
    ast::FStringElement::Literal(ast::FStringLiteralElement {
-        value: s.to_owned(),
+        value: s.to_string().into_boxed_str(),
        range: TextRange::default(),
    })
 }
@@ -53,7 +53,7 @@ pub(super) fn to_f_string_element(expr: &Expr) -> Option<ast::FStringElement> {
    match expr {
        Expr::StringLiteral(ast::ExprStringLiteral { value, range }) => {
            Some(ast::FStringElement::Literal(ast::FStringLiteralElement {
-                value: value.to_string(),
+                value: value.to_string().into_boxed_str(),
                range: *range,
            }))
        }
--- a/crates/ruff_python_ast/src/comparable.rs
+++ b/crates/ruff_python_ast/src/comparable.rs
@@ -644,7 +644,7 @@ pub struct ComparableBytesLiteral<'a> {
 impl<'a> From<&'a ast::BytesLiteral> for ComparableBytesLiteral<'a> {
    fn from(bytes_literal: &'a ast::BytesLiteral) -> Self {
        Self {
-            value: bytes_literal.value.as_slice(),
+            value: &bytes_literal.value,
        }
    }
 }
--- a/crates/ruff_python_ast/src/nodes.rs
+++ b/crates/ruff_python_ast/src/nodes.rs
@@ -949,7 +949,7 @@ impl Ranged for FStringExpressionElement {
 #[derive(Clone, Debug, PartialEq)]
 pub struct FStringLiteralElement {
    pub range: TextRange,
-    pub value: String,
+    pub value: Box<str>,
 }

 impl Ranged for FStringLiteralElement {
@@ -962,7 +962,7 @@ impl Deref for FStringLiteralElement {
    type Target = str;

    fn deref(&self) -> &Self::Target {
-        self.value.as_str()
+        &self.value
    }
 }

@@ -1607,7 +1607,7 @@ impl Default for BytesLiteralValueInner {
 #[derive(Clone, Debug, Default, PartialEq)]
 pub struct BytesLiteral {
    pub range: TextRange,
-    pub value: Vec<u8>,
+    pub value: Box<[u8]>,
 }

 impl Ranged for BytesLiteral {
@@ -1620,7 +1620,7 @@ impl Deref for BytesLiteral {
    type Target = [u8];

    fn deref(&self) -> &Self::Target {
-        self.value.as_slice()
+        &self.value
    }
 }

--- a/crates/ruff_python_parser/Cargo.toml
+++ b/crates/ruff_python_parser/Cargo.toml
@@ -19,14 +19,15 @@ ruff_text_size = { path = "../ruff_text_size" }

 anyhow = { workspace = true }
 bitflags = { workspace = true }
+bstr = { workspace = true }
 is-macro = { workspace = true }
 itertools = { workspace = true }
 lalrpop-util = { workspace = true, default-features = false }
 memchr = { workspace = true }
-unicode-ident = { workspace = true }
-unicode_names2 = { workspace = true }
 rustc-hash = { workspace = true }
 static_assertions = { workspace = true }
+unicode-ident = { workspace = true }
+unicode_names2 = { workspace = true }

 [dev-dependencies]
 insta = { workspace = true }
--- a/crates/ruff_python_parser/src/ascii.rs
+++ b/crates/ruff_python_parser/src/ascii.rs
@@ -1,345 +0,0 @@
-#![allow(
-    clippy::cast_possible_truncation,
-    clippy::cast_possible_wrap,
-    clippy::cast_ptr_alignment,
-    clippy::inline_always,
-    clippy::ptr_as_ptr,
-    unsafe_code
-)]
-
-//! Source: <https://github.com/BurntSushi/bstr/blob/d4aeee2eac5d5ef6ec4d2206f6ebffe7b3dd3e1f/src/ascii.rs>
-
-// The following ~400 lines of code exists for exactly one purpose, which is
-// to optimize this code:
-//
-//     byte_slice.iter().position(|&b| b > 0x7F).unwrap_or(byte_slice.len())
-//
-// Yes... Overengineered is a word that comes to mind, but this is effectively
-// a very similar problem to memchr, and virtually nobody has been able to
-// resist optimizing the crap out of that (except for perhaps the BSD and MUSL
-// folks). In particular, this routine makes a very common case (ASCII) very
-// fast, which seems worth it. We do stop short of adding AVX variants of the
-// code below in order to retain our sanity and also to avoid needing to deal
-// with runtime target feature detection. RESIST!
-//
-// In order to understand the SIMD version below, it would be good to read this
-// comment describing how my memchr routine works:
-// https://github.com/BurntSushi/rust-memchr/blob/b0a29f267f4a7fad8ffcc8fe8377a06498202883/src/x86/sse2.rs#L19-L106
-//
-// The primary difference with memchr is that for ASCII, we can do a bit less
-// work. In particular, we don't need to detect the presence of a specific
-// byte, but rather, whether any byte has its most significant bit set. That
-// means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to
-// _mm_movemask_epi8.
-
-#[cfg(any(test, miri, not(target_arch = "x86_64")))]
-const USIZE_BYTES: usize = core::mem::size_of::<usize>();
-#[cfg(any(test, miri, not(target_arch = "x86_64")))]
-const FALLBACK_LOOP_SIZE: usize = 2 * USIZE_BYTES;
-
-// This is a mask where the most significant bit of each byte in the usize
-// is set. We test this bit to determine whether a character is ASCII or not.
-// Namely, a single byte is regarded as an ASCII codepoint if and only if it's
-// most significant bit is not set.
-#[cfg(any(test, miri, not(target_arch = "x86_64")))]
-const ASCII_MASK_U64: u64 = 0x8080_8080_8080_8080;
-#[cfg(any(test, miri, not(target_arch = "x86_64")))]
-const ASCII_MASK: usize = ASCII_MASK_U64 as usize;
-
-/// Returns the index of the first non ASCII byte in the given slice.
-///
-/// If slice only contains ASCII bytes, then the length of the slice is
-/// returned.
-pub(crate) fn first_non_ascii_byte(slice: &[u8]) -> usize {
-    #[cfg(any(miri, not(target_arch = "x86_64")))]
-    {
-        first_non_ascii_byte_fallback(slice)
-    }
-
-    #[cfg(all(not(miri), target_arch = "x86_64"))]
-    {
-        first_non_ascii_byte_sse2(slice)
-    }
-}
-
-#[cfg(any(test, miri, not(target_arch = "x86_64")))]
-fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize {
-    let align = USIZE_BYTES - 1;
-    let start_ptr = slice.as_ptr();
-    let end_ptr = slice[slice.len()..].as_ptr();
-    let mut ptr = start_ptr;
-
-    unsafe {
-        if slice.len() < USIZE_BYTES {
-            return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr);
-        }
-
-        let chunk = read_unaligned_usize(ptr);
-        let mask = chunk & ASCII_MASK;
-        if mask != 0 {
-            return first_non_ascii_byte_mask(mask);
-        }
-
-        ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align));
-        debug_assert!(ptr > start_ptr);
-        debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr);
-        if slice.len() >= FALLBACK_LOOP_SIZE {
-            while ptr <= ptr_sub(end_ptr, FALLBACK_LOOP_SIZE) {
-                debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
-
-                let a = *ptr.cast::<usize>();
-                let b = *ptr_add(ptr, USIZE_BYTES).cast::<usize>();
-                if (a | b) & ASCII_MASK != 0 {
-                    // What a kludge. We wrap the position finding code into
-                    // a non-inlineable function, which makes the codegen in
-                    // the tight loop above a bit better by avoiding a
-                    // couple extra movs. We pay for it by two additional
-                    // stores, but only in the case of finding a non-ASCII
-                    // byte.
-                    #[inline(never)]
-                    unsafe fn findpos(start_ptr: *const u8, ptr: *const u8) -> usize {
-                        let a = *ptr.cast::<usize>();
-                        let b = *ptr_add(ptr, USIZE_BYTES).cast::<usize>();
-
-                        let mut at = sub(ptr, start_ptr);
-                        let maska = a & ASCII_MASK;
-                        if maska != 0 {
-                            return at + first_non_ascii_byte_mask(maska);
-                        }
-
-                        at += USIZE_BYTES;
-                        let maskb = b & ASCII_MASK;
-                        debug_assert!(maskb != 0);
-                        at + first_non_ascii_byte_mask(maskb)
-                    }
-                    return findpos(start_ptr, ptr);
-                }
-                ptr = ptr_add(ptr, FALLBACK_LOOP_SIZE);
-            }
-        }
-        first_non_ascii_byte_slow(start_ptr, end_ptr, ptr)
-    }
-}
-
-#[cfg(all(not(miri), target_arch = "x86_64"))]
-fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize {
-    use core::arch::x86_64::{
-        __m128i, _mm_load_si128, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
-    };
-
-    const VECTOR_SIZE: usize = core::mem::size_of::<__m128i>();
-    const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;
-    const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE;
-
-    let start_ptr = slice.as_ptr();
-    let end_ptr = slice[slice.len()..].as_ptr();
-    let mut ptr = start_ptr;
-
-    unsafe {
-        if slice.len() < VECTOR_SIZE {
-            return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr);
-        }
-
-        let chunk = _mm_loadu_si128(ptr as *const __m128i);
-        let mask = _mm_movemask_epi8(chunk);
-        if mask != 0 {
-            return mask.trailing_zeros() as usize;
-        }
-
-        ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
-        debug_assert!(ptr > start_ptr);
-        debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr);
-        if slice.len() >= VECTOR_LOOP_SIZE {
-            while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) {
-                debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
-
-                let a = _mm_load_si128(ptr as *const __m128i);
-                let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
-                let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
-                let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
-
-                let or1 = _mm_or_si128(a, b);
-                let or2 = _mm_or_si128(c, d);
-                let or3 = _mm_or_si128(or1, or2);
-                if _mm_movemask_epi8(or3) != 0 {
-                    let mut at = sub(ptr, start_ptr);
-                    let mask = _mm_movemask_epi8(a);
-                    if mask != 0 {
-                        return at + mask.trailing_zeros() as usize;
-                    }
-
-                    at += VECTOR_SIZE;
-                    let mask = _mm_movemask_epi8(b);
-                    if mask != 0 {
-                        return at + mask.trailing_zeros() as usize;
-                    }
-
-                    at += VECTOR_SIZE;
-                    let mask = _mm_movemask_epi8(c);
-                    if mask != 0 {
-                        return at + mask.trailing_zeros() as usize;
-                    }
-
-                    at += VECTOR_SIZE;
-                    let mask = _mm_movemask_epi8(d);
-                    debug_assert!(mask != 0);
-                    return at + mask.trailing_zeros() as usize;
-                }
-                ptr = ptr_add(ptr, VECTOR_LOOP_SIZE);
-            }
-        }
-        while ptr <= end_ptr.sub(VECTOR_SIZE) {
-            debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);
-
-            let chunk = _mm_loadu_si128(ptr as *const __m128i);
-            let mask = _mm_movemask_epi8(chunk);
-            if mask != 0 {
-                return sub(ptr, start_ptr) + mask.trailing_zeros() as usize;
-            }
-            ptr = ptr.add(VECTOR_SIZE);
-        }
-        first_non_ascii_byte_slow(start_ptr, end_ptr, ptr)
-    }
-}
-
-#[inline(always)]
-unsafe fn first_non_ascii_byte_slow(
-    start_ptr: *const u8,
-    end_ptr: *const u8,
-    mut ptr: *const u8,
-) -> usize {
-    debug_assert!(start_ptr <= ptr);
-    debug_assert!(ptr <= end_ptr);
-
-    while ptr < end_ptr {
-        if *ptr > 0x7F {
-            return sub(ptr, start_ptr);
-        }
-        ptr = ptr.offset(1);
-    }
-    sub(end_ptr, start_ptr)
-}
-
-/// Compute the position of the first ASCII byte in the given mask.
-///
-/// The mask should be computed by `chunk & ASCII_MASK`, where `chunk` is
-/// 8 contiguous bytes of the slice being checked where *at least* one of those
-/// bytes is not an ASCII byte.
-///
-/// The position returned is always in the inclusive range [0, 7].
-#[cfg(any(test, miri, not(target_arch = "x86_64")))]
-fn first_non_ascii_byte_mask(mask: usize) -> usize {
-    #[cfg(target_endian = "little")]
-    {
-        mask.trailing_zeros() as usize / 8
-    }
-    #[cfg(target_endian = "big")]
-    {
-        mask.leading_zeros() as usize / 8
-    }
-}
-
-/// Increment the given pointer by the given amount.
-unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 {
-    debug_assert!(amt < ::core::isize::MAX as usize);
-    ptr.add(amt)
-}
-
-/// Decrement the given pointer by the given amount.
-unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 {
-    debug_assert!(amt < ::core::isize::MAX as usize);
-    ptr.offset((amt as isize).wrapping_neg())
-}
-
-#[cfg(any(test, miri, not(target_arch = "x86_64")))]
-unsafe fn read_unaligned_usize(ptr: *const u8) -> usize {
-    use core::ptr;
-
-    let mut n: usize = 0;
-    ptr::copy_nonoverlapping(ptr, std::ptr::addr_of_mut!(n) as *mut u8, USIZE_BYTES);
-    n
-}
-
-/// Subtract `b` from `a` and return the difference. `a` should be greater than
-/// or equal to `b`.
-fn sub(a: *const u8, b: *const u8) -> usize {
-    debug_assert!(a >= b);
-    (a as usize) - (b as usize)
-}
-
-#[cfg(test)]
-mod tests {
-    use super::*;
-
-    // Our testing approach here is to try and exhaustively test every case.
-    // This includes the position at which a non-ASCII byte occurs in addition
-    // to the alignment of the slice that we're searching.
-
-    #[test]
-    fn positive_fallback_forward() {
-        for i in 0..517 {
-            let s = "a".repeat(i);
-            assert_eq!(
-                i,
-                first_non_ascii_byte_fallback(s.as_bytes()),
-                "i: {:?}, len: {:?}, s: {:?}",
-                i,
-                s.len(),
-                s
-            );
-        }
-    }
-
-    #[test]
-    #[cfg(target_arch = "x86_64")]
-    #[cfg(not(miri))]
-    fn positive_sse2_forward() {
-        for i in 0..517 {
-            let b = "a".repeat(i).into_bytes();
-            assert_eq!(b.len(), first_non_ascii_byte_sse2(&b));
-        }
-    }
-
-    #[test]
-    #[cfg(not(miri))]
-    fn negative_fallback_forward() {
-        for i in 0..517 {
-            for align in 0..65 {
-                let mut s = "a".repeat(i);
-                s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃");
-                let s = s.get(align..).unwrap_or("");
-                assert_eq!(
-                    i.saturating_sub(align),
-                    first_non_ascii_byte_fallback(s.as_bytes()),
-                    "i: {:?}, align: {:?}, len: {:?}, s: {:?}",
-                    i,
-                    align,
-                    s.len(),
-                    s
-                );
-            }
-        }
-    }
-
-    #[test]
-    #[cfg(target_arch = "x86_64")]
-    #[cfg(not(miri))]
-    fn negative_sse2_forward() {
-        for i in 0..517 {
-            for align in 0..65 {
-                let mut s = "a".repeat(i);
-                s.push_str("☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃");
-                let s = s.get(align..).unwrap_or("");
-                assert_eq!(
-                    i.saturating_sub(align),
-                    first_non_ascii_byte_sse2(s.as_bytes()),
-                    "i: {:?}, align: {:?}, len: {:?}, s: {:?}",
-                    i,
-                    align,
-                    s.len(),
-                    s
-                );
-            }
-        }
-    }
-}
--- a/crates/ruff_python_parser/src/lib.rs
+++ b/crates/ruff_python_parser/src/lib.rs
@@ -119,11 +119,10 @@ pub use token::{StringKind, Tok, TokenKind};

 use crate::lexer::LexResult;

-mod function;
-// Skip flattening lexer to distinguish from full ruff_python_parser
-mod ascii;
 mod context;
+mod function;
 mod invalid;
+// Skip flattening lexer to distinguish from full ruff_python_parser
 pub mod lexer;
 mod parser;
 mod soft_keywords;
--- a/crates/ruff_python_parser/src/string.rs
+++ b/crates/ruff_python_parser/src/string.rs
@@ -1,9 +1,10 @@
 //! Parsing of string literals, bytes literals, and implicit string concatenation.

+use bstr::ByteSlice;
+
 use ruff_python_ast::{self as ast, Expr};
 use ruff_text_size::{Ranged, TextRange, TextSize};

-use crate::ascii::first_non_ascii_byte;
 use crate::lexer::{LexicalError, LexicalErrorType};
 use crate::token::{StringKind, Tok};

@@ -218,9 +219,9 @@ impl StringParser {

        let mut value = String::with_capacity(self.source.len());
        loop {
-            // Add the characters before the escape sequence to the string.
-            let before_with_slash = self.skip_bytes(index + 1);
-            let before = &before_with_slash[..before_with_slash.len() - 1];
+            // Add the characters before the escape sequence (or curly brace) to the string.
+            let before_with_slash_or_brace = self.skip_bytes(index + 1);
+            let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1];
            value.push_str(before);

            // Add the escaped character to the string.
@@ -284,14 +285,13 @@ impl StringParser {
        }

        Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
-            value,
+            value: value.into_boxed_str(),
            range: self.range,
        }))
    }

    fn parse_bytes(mut self) -> Result<StringType, LexicalError> {
-        let index = first_non_ascii_byte(self.source.as_bytes());
-        if index < self.source.len() {
+        if let Some(index) = self.source.as_bytes().find_non_ascii_byte() {
            return Err(LexicalError::new(
                LexicalErrorType::OtherError(
                    "bytes can only contain ASCII literal characters"
@@ -305,7 +305,7 @@ impl StringParser {
        if self.kind.is_raw() {
            // For raw strings, no escaping is necessary.
            return Ok(StringType::Bytes(ast::BytesLiteral {
-                value: self.source.into_bytes(),
+                value: self.source.into_boxed_bytes(),
                range: self.range,
            }));
        }
@@ -313,7 +313,7 @@ impl StringParser {
        let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
            // If the string doesn't contain any escape sequences, return the owned string.
            return Ok(StringType::Bytes(ast::BytesLiteral {
-                value: self.source.into_bytes(),
+                value: self.source.into_boxed_bytes(),
                range: self.range,
            }));
        };
@@ -349,7 +349,7 @@ impl StringParser {
        }

        Ok(StringType::Bytes(ast::BytesLiteral {
-            value,
+            value: value.into_boxed_slice(),
            range: self.range,
        }))
    }