Box other strings

This commit is contained in:
Charlie Marsh 2024-02-08 16:25:48 -05:00
parent 0a5a4f6d92
commit 56b148bb43
10 changed files with 30 additions and 377 deletions

13
Cargo.lock generated
View File

@ -217,12 +217,12 @@ checksum = "327762f6e5a765692301e5bb513e0d9fef63be86bbc14528052b1cd3e6f03e07"
[[package]]
name = "bstr"
version = "1.6.2"
version = "1.9.0"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "4c2f7349907b712260e64b0afe2f84692af14a454be26187d9df565c7f69266a"
checksum = "c48f0051a4b4c5e0b6d365cd04af53aeaa209e3cc15ec2cdb69e73cc87fbd0dc"
dependencies = [
"memchr",
"regex-automata 0.3.9",
"regex-automata 0.4.3",
"serde",
]
@ -1921,12 +1921,6 @@ dependencies = [
"regex-syntax 0.6.29",
]
[[package]]
name = "regex-automata"
version = "0.3.9"
source = "registry+https://github.com/rust-lang/crates.io-index"
checksum = "59b23e92ee4318893fa3fe3e6fb365258efbfe6ac6ab30f090cdcbb7aa37efa9"
[[package]]
name = "regex-automata"
version = "0.4.3"
@ -2342,6 +2336,7 @@ version = "0.0.0"
dependencies = [
"anyhow",
"bitflags 2.4.1",
"bstr",
"insta",
"is-macro",
"itertools 0.12.1",

View File

@ -19,6 +19,7 @@ argfile = { version = "0.1.6" }
assert_cmd = { version = "2.0.13" }
bincode = { version = "1.3.3" }
bitflags = { version = "2.4.1" }
bstr = { version = "1.9.0" }
cachedir = { version = "0.3.1" }
chrono = { version = "0.4.33", default-features = false, features = ["clock"] }
clap = { version = "4.4.18", features = ["derive"] }

View File

@ -40,7 +40,9 @@ impl Violation for HardcodedBindAllInterfaces {
pub(crate) fn hardcoded_bind_all_interfaces(checker: &mut Checker, string: StringLike) {
let is_bind_all_interface = match string {
StringLike::StringLiteral(ast::ExprStringLiteral { value, .. }) => value == "0.0.0.0",
StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => value == "0.0.0.0",
StringLike::FStringLiteral(ast::FStringLiteralElement { value, .. }) => {
&**value == "0.0.0.0"
}
StringLike::BytesLiteral(_) => return,
};

View File

@ -15,7 +15,7 @@ fn to_f_string_expression_element(inner: &Expr) -> ast::FStringElement {
/// Convert a string to a [`ast::FStringElement::Literal`].
pub(super) fn to_f_string_literal_element(s: &str) -> ast::FStringElement {
ast::FStringElement::Literal(ast::FStringLiteralElement {
value: s.to_owned(),
value: s.to_string().into_boxed_str(),
range: TextRange::default(),
})
}
@ -53,7 +53,7 @@ pub(super) fn to_f_string_element(expr: &Expr) -> Option<ast::FStringElement> {
match expr {
Expr::StringLiteral(ast::ExprStringLiteral { value, range }) => {
Some(ast::FStringElement::Literal(ast::FStringLiteralElement {
value: value.to_string(),
value: value.to_string().into_boxed_str(),
range: *range,
}))
}

View File

@ -644,7 +644,7 @@ pub struct ComparableBytesLiteral<'a> {
impl<'a> From<&'a ast::BytesLiteral> for ComparableBytesLiteral<'a> {
fn from(bytes_literal: &'a ast::BytesLiteral) -> Self {
Self {
value: bytes_literal.value.as_slice(),
value: &bytes_literal.value,
}
}
}

View File

@ -949,7 +949,7 @@ impl Ranged for FStringExpressionElement {
#[derive(Clone, Debug, PartialEq)]
pub struct FStringLiteralElement {
pub range: TextRange,
pub value: String,
pub value: Box<str>,
}
impl Ranged for FStringLiteralElement {
@ -962,7 +962,7 @@ impl Deref for FStringLiteralElement {
type Target = str;
fn deref(&self) -> &Self::Target {
self.value.as_str()
&self.value
}
}
@ -1607,7 +1607,7 @@ impl Default for BytesLiteralValueInner {
#[derive(Clone, Debug, Default, PartialEq)]
pub struct BytesLiteral {
pub range: TextRange,
pub value: Vec<u8>,
pub value: Box<[u8]>,
}
impl Ranged for BytesLiteral {
@ -1620,7 +1620,7 @@ impl Deref for BytesLiteral {
type Target = [u8];
fn deref(&self) -> &Self::Target {
self.value.as_slice()
&self.value
}
}

View File

@ -19,14 +19,15 @@ ruff_text_size = { path = "../ruff_text_size" }
anyhow = { workspace = true }
bitflags = { workspace = true }
bstr = { workspace = true }
is-macro = { workspace = true }
itertools = { workspace = true }
lalrpop-util = { workspace = true, default-features = false }
memchr = { workspace = true }
unicode-ident = { workspace = true }
unicode_names2 = { workspace = true }
rustc-hash = { workspace = true }
static_assertions = { workspace = true }
unicode-ident = { workspace = true }
unicode_names2 = { workspace = true }
[dev-dependencies]
insta = { workspace = true }

View File

@ -1,345 +0,0 @@
#![allow(
clippy::cast_possible_truncation,
clippy::cast_possible_wrap,
clippy::cast_ptr_alignment,
clippy::inline_always,
clippy::ptr_as_ptr,
unsafe_code
)]
//! Source: <https://github.com/BurntSushi/bstr/blob/d4aeee2eac5d5ef6ec4d2206f6ebffe7b3dd3e1f/src/ascii.rs>
// The following ~400 lines of code exist for exactly one purpose, which is
// to optimize this code:
//
// byte_slice.iter().position(|&b| b > 0x7F).unwrap_or(byte_slice.len())
//
// Yes... Overengineered is a word that comes to mind, but this is effectively
// a very similar problem to memchr, and virtually nobody has been able to
// resist optimizing the crap out of that (except for perhaps the BSD and MUSL
// folks). In particular, this routine makes a very common case (ASCII) very
// fast, which seems worth it. We do stop short of adding AVX variants of the
// code below in order to retain our sanity and also to avoid needing to deal
// with runtime target feature detection. RESIST!
//
// In order to understand the SIMD version below, it would be good to read this
// comment describing how my memchr routine works:
// https://github.com/BurntSushi/rust-memchr/blob/b0a29f267f4a7fad8ffcc8fe8377a06498202883/src/x86/sse2.rs#L19-L106
//
// The primary difference with memchr is that for ASCII, we can do a bit less
// work. In particular, we don't need to detect the presence of a specific
// byte, but rather, whether any byte has its most significant bit set. That
// means we can effectively skip the _mm_cmpeq_epi8 step and jump straight to
// _mm_movemask_epi8.
/// Width of a machine word (`usize`) in bytes.
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const USIZE_BYTES: usize = std::mem::size_of::<usize>();

/// Number of bytes the word-at-a-time fallback loop consumes per iteration
/// (two words).
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const FALLBACK_LOOP_SIZE: usize = USIZE_BYTES * 2;

/// A 64-bit pattern with the most significant bit of every byte set.
///
/// A byte is an ASCII codepoint if and only if its most significant bit is
/// clear, so AND-ing a chunk with this pattern exposes the non-ASCII bytes.
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const ASCII_MASK_U64: u64 = u64::from_ne_bytes([0x80; 8]);

/// [`ASCII_MASK_U64`] narrowed to the native word width.
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
const ASCII_MASK: usize = ASCII_MASK_U64 as usize;
/// Finds the position of the first byte in `slice` that is not ASCII.
///
/// When every byte is ASCII, `slice.len()` is returned instead.
///
/// The SSE2 implementation is selected on `x86_64` (outside of Miri); every
/// other configuration uses the portable word-at-a-time fallback.
pub(crate) fn first_non_ascii_byte(slice: &[u8]) -> usize {
    // Exactly one of the two bindings below survives `cfg` evaluation.
    #[cfg(any(miri, not(target_arch = "x86_64")))]
    let index = first_non_ascii_byte_fallback(slice);
    #[cfg(all(not(miri), target_arch = "x86_64"))]
    let index = first_non_ascii_byte_sse2(slice);

    index
}
/// Portable (non-SIMD) scan for the first non-ASCII byte, working one
/// `usize` word at a time.
///
/// Returns the index of the first byte whose high bit is set, or
/// `slice.len()` when there is no such byte.
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
fn first_non_ascii_byte_fallback(slice: &[u8]) -> usize {
// Low-bit mask used to extract a pointer's offset within a word.
let align = USIZE_BYTES - 1;
let start_ptr = slice.as_ptr();
// One-past-the-end pointer, taken from the empty tail slice.
let end_ptr = slice[slice.len()..].as_ptr();
let mut ptr = start_ptr;
// NOTE(review): soundness relies on the length checks and loop bounds below
// keeping every whole-word read inside `[start_ptr, end_ptr)`.
unsafe {
// Too short for even one whole-word read: check byte by byte.
if slice.len() < USIZE_BYTES {
return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr);
}
// Check the first word with an unaligned read, since `start_ptr` is not
// necessarily word-aligned.
let chunk = read_unaligned_usize(ptr);
let mask = chunk & ASCII_MASK;
if mask != 0 {
return first_non_ascii_byte_mask(mask);
}
// Step forward to the next word-aligned address. The step is in
// `1..=USIZE_BYTES`, so every skipped byte was covered by the read above.
ptr = ptr_add(ptr, USIZE_BYTES - (start_ptr as usize & align));
debug_assert!(ptr > start_ptr);
debug_assert!(ptr_sub(end_ptr, USIZE_BYTES) >= start_ptr);
if slice.len() >= FALLBACK_LOOP_SIZE {
// Main loop: two aligned words per iteration, for as long as two full
// words remain before `end_ptr`.
while ptr <= ptr_sub(end_ptr, FALLBACK_LOOP_SIZE) {
debug_assert_eq!(0, (ptr as usize) % USIZE_BYTES);
let a = *ptr.cast::<usize>();
let b = *ptr_add(ptr, USIZE_BYTES).cast::<usize>();
// OR the two words together so a single test covers both.
if (a | b) & ASCII_MASK != 0 {
// What a kludge. We wrap the position finding code into
// a non-inlineable function, which makes the codegen in
// the tight loop above a bit better by avoiding a
// couple extra movs. We pay for it by two additional
// stores, but only in the case of finding a non-ASCII
// byte.
#[inline(never)]
unsafe fn findpos(start_ptr: *const u8, ptr: *const u8) -> usize {
// Re-read both words and report the offset of the first one
// that contains a non-ASCII byte.
let a = *ptr.cast::<usize>();
let b = *ptr_add(ptr, USIZE_BYTES).cast::<usize>();
let mut at = sub(ptr, start_ptr);
let maska = a & ASCII_MASK;
if maska != 0 {
return at + first_non_ascii_byte_mask(maska);
}
at += USIZE_BYTES;
let maskb = b & ASCII_MASK;
debug_assert!(maskb != 0);
at + first_non_ascii_byte_mask(maskb)
}
return findpos(start_ptr, ptr);
}
ptr = ptr_add(ptr, FALLBACK_LOOP_SIZE);
}
}
// Fewer than two words remain past `ptr`: finish byte by byte.
first_non_ascii_byte_slow(start_ptr, end_ptr, ptr)
}
}
/// SSE2 scan for the first non-ASCII byte, working one 128-bit vector (or
/// four of them) at a time.
///
/// Returns the index of the first byte whose high bit is set, or
/// `slice.len()` when there is no such byte.
#[cfg(all(not(miri), target_arch = "x86_64"))]
fn first_non_ascii_byte_sse2(slice: &[u8]) -> usize {
use core::arch::x86_64::{
__m128i, _mm_load_si128, _mm_loadu_si128, _mm_movemask_epi8, _mm_or_si128,
};
// Size of one vector in bytes, the low-bit mask giving a pointer's offset
// within a vector, and the number of bytes the unrolled loop consumes.
const VECTOR_SIZE: usize = core::mem::size_of::<__m128i>();
const VECTOR_ALIGN: usize = VECTOR_SIZE - 1;
const VECTOR_LOOP_SIZE: usize = 4 * VECTOR_SIZE;
let start_ptr = slice.as_ptr();
// One-past-the-end pointer, taken from the empty tail slice.
let end_ptr = slice[slice.len()..].as_ptr();
let mut ptr = start_ptr;
// NOTE(review): soundness relies on the length checks and loop bounds below
// keeping every whole-vector load inside `[start_ptr, end_ptr)`.
unsafe {
// Too short for even one whole-vector load: check byte by byte.
if slice.len() < VECTOR_SIZE {
return first_non_ascii_byte_slow(start_ptr, end_ptr, ptr);
}
// Check the first vector with an unaligned load. The movemask gathers the
// most significant bit of every byte, so a non-zero mask means a non-ASCII
// byte is present and its lowest set bit gives that byte's index.
let chunk = _mm_loadu_si128(ptr as *const __m128i);
let mask = _mm_movemask_epi8(chunk);
if mask != 0 {
return mask.trailing_zeros() as usize;
}
// Step forward to the next vector-aligned address. The step is in
// `1..=VECTOR_SIZE`, so every skipped byte was covered by the load above.
ptr = ptr.add(VECTOR_SIZE - (start_ptr as usize & VECTOR_ALIGN));
debug_assert!(ptr > start_ptr);
debug_assert!(end_ptr.sub(VECTOR_SIZE) >= start_ptr);
if slice.len() >= VECTOR_LOOP_SIZE {
// Unrolled loop: four aligned vectors per iteration, for as long as four
// full vectors remain before `end_ptr`.
while ptr <= ptr_sub(end_ptr, VECTOR_LOOP_SIZE) {
debug_assert_eq!(0, (ptr as usize) % VECTOR_SIZE);
let a = _mm_load_si128(ptr as *const __m128i);
let b = _mm_load_si128(ptr.add(VECTOR_SIZE) as *const __m128i);
let c = _mm_load_si128(ptr.add(2 * VECTOR_SIZE) as *const __m128i);
let d = _mm_load_si128(ptr.add(3 * VECTOR_SIZE) as *const __m128i);
// OR all four vectors together so one movemask tests the whole group.
let or1 = _mm_or_si128(a, b);
let or2 = _mm_or_si128(c, d);
let or3 = _mm_or_si128(or1, or2);
if _mm_movemask_epi8(or3) != 0 {
// A hit somewhere in the group: test the vectors one at a time, in
// order, to locate the first offending byte.
let mut at = sub(ptr, start_ptr);
let mask = _mm_movemask_epi8(a);
if mask != 0 {
return at + mask.trailing_zeros() as usize;
}
at += VECTOR_SIZE;
let mask = _mm_movemask_epi8(b);
if mask != 0 {
return at + mask.trailing_zeros() as usize;
}
at += VECTOR_SIZE;
let mask = _mm_movemask_epi8(c);
if mask != 0 {
return at + mask.trailing_zeros() as usize;
}
at += VECTOR_SIZE;
let mask = _mm_movemask_epi8(d);
debug_assert!(mask != 0);
return at + mask.trailing_zeros() as usize;
}
ptr = ptr_add(ptr, VECTOR_LOOP_SIZE);
}
}
// Consume the remaining whole vectors one at a time with unaligned loads.
while ptr <= end_ptr.sub(VECTOR_SIZE) {
debug_assert!(sub(end_ptr, ptr) >= VECTOR_SIZE);
let chunk = _mm_loadu_si128(ptr as *const __m128i);
let mask = _mm_movemask_epi8(chunk);
if mask != 0 {
return sub(ptr, start_ptr) + mask.trailing_zeros() as usize;
}
ptr = ptr.add(VECTOR_SIZE);
}
// Less than one vector remains past `ptr`: finish byte by byte.
first_non_ascii_byte_slow(start_ptr, end_ptr, ptr)
}
}
/// Byte-at-a-time scan of `[ptr, end_ptr)` for the first non-ASCII byte.
///
/// The returned index is relative to `start_ptr`. If no byte in the range is
/// non-ASCII, the distance from `start_ptr` to `end_ptr` is returned.
///
/// # Safety
///
/// Every address in `[ptr, end_ptr)` is dereferenced, so that range must be
/// readable, and `start_ptr <= ptr <= end_ptr` must hold.
#[inline(always)]
unsafe fn first_non_ascii_byte_slow(
    start_ptr: *const u8,
    end_ptr: *const u8,
    mut ptr: *const u8,
) -> usize {
    debug_assert!(start_ptr <= ptr);
    debug_assert!(ptr <= end_ptr);
    loop {
        // Ran off the end without a hit: report the full length.
        if ptr >= end_ptr {
            return sub(end_ptr, start_ptr);
        }
        if !(*ptr).is_ascii() {
            return sub(ptr, start_ptr);
        }
        ptr = ptr.add(1);
    }
}
/// Compute the position of the first non-ASCII byte described by `mask`.
///
/// The mask should be computed by `chunk & ASCII_MASK`, where `chunk` is one
/// word of contiguous bytes from the slice being checked, loaded in native
/// byte order, in which *at least* one byte is not ASCII.
///
/// For such a mask the result is a byte offset within the word, i.e. in the
/// inclusive range [0, 7] for a 64-bit word. A zero mask yields the word
/// width in bytes.
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
fn first_non_ascii_byte_mask(mask: usize) -> usize {
    // The first byte in memory is the least significant one on little-endian
    // targets and the most significant one on big-endian targets.
    let bit_index = if cfg!(target_endian = "little") {
        mask.trailing_zeros()
    } else {
        mask.leading_zeros()
    };
    (bit_index / 8) as usize
}
/// Advance `ptr` by `amt` bytes.
///
/// # Safety
///
/// Forwards to the raw-pointer `add` method, so the caller must uphold that
/// method's requirements for `ptr` and `amt`.
unsafe fn ptr_add(ptr: *const u8, amt: usize) -> *const u8 {
    let limit = isize::MAX as usize;
    debug_assert!(amt < limit);
    ptr.add(amt)
}
/// Move `ptr` back by `amt` bytes.
///
/// # Safety
///
/// Forwards to the raw-pointer `sub` method, so the caller must uphold that
/// method's requirements for `ptr` and `amt`.
unsafe fn ptr_sub(ptr: *const u8, amt: usize) -> *const u8 {
    let limit = isize::MAX as usize;
    debug_assert!(amt < limit);
    ptr.sub(amt)
}
/// Read one `usize` from `ptr` in native byte order, without requiring `ptr`
/// to be aligned for `usize`.
///
/// # Safety
///
/// `ptr` must be valid for reads of `size_of::<usize>()` bytes.
#[cfg(any(test, miri, not(target_arch = "x86_64")))]
unsafe fn read_unaligned_usize(ptr: *const u8) -> usize {
    // `read_unaligned` is the standard-library form of copying the bytes into
    // a zeroed local by hand.
    ptr.cast::<usize>().read_unaligned()
}
/// Distance in bytes from `b` up to `a`, computed on the raw addresses.
///
/// `a` should be greater than or equal to `b`; this is checked in debug
/// builds only.
fn sub(a: *const u8, b: *const u8) -> usize {
    let (hi, lo) = (a as usize, b as usize);
    debug_assert!(hi >= lo);
    hi - lo
}
#[cfg(test)]
mod tests {
    use super::*;

    // The strategy is brute force: cover every position at which the first
    // non-ASCII byte can occur, combined with every alignment of the slice
    // being searched.

    /// Non-ASCII tail appended after the ASCII prefix in the negative tests.
    const SNOWMEN: &str = "☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃☃";

    /// Builds `ascii_len` copies of `a` followed by [`SNOWMEN`].
    fn ascii_then_snowmen(ascii_len: usize) -> String {
        let mut haystack = "a".repeat(ascii_len);
        haystack.push_str(SNOWMEN);
        haystack
    }

    #[test]
    fn positive_fallback_forward() {
        for i in 0..517 {
            let s = "a".repeat(i);
            let found = first_non_ascii_byte_fallback(s.as_bytes());
            assert_eq!(i, found, "i: {:?}, len: {:?}, s: {:?}", i, s.len(), s);
        }
    }

    #[test]
    #[cfg(all(target_arch = "x86_64", not(miri)))]
    fn positive_sse2_forward() {
        for i in 0..517 {
            let b = vec![b'a'; i];
            assert_eq!(b.len(), first_non_ascii_byte_sse2(&b));
        }
    }

    #[test]
    #[cfg(not(miri))]
    fn negative_fallback_forward() {
        for i in 0..517 {
            let full = ascii_then_snowmen(i);
            for align in 0..65 {
                // An offset that is out of range or splits a snowman yields
                // the empty string, for which the expected index is 0.
                let s = full.get(align..).unwrap_or("");
                let found = first_non_ascii_byte_fallback(s.as_bytes());
                assert_eq!(
                    i.saturating_sub(align),
                    found,
                    "i: {:?}, align: {:?}, len: {:?}, s: {:?}",
                    i,
                    align,
                    s.len(),
                    s
                );
            }
        }
    }

    #[test]
    #[cfg(all(target_arch = "x86_64", not(miri)))]
    fn negative_sse2_forward() {
        for i in 0..517 {
            let full = ascii_then_snowmen(i);
            for align in 0..65 {
                // An offset that is out of range or splits a snowman yields
                // the empty string, for which the expected index is 0.
                let s = full.get(align..).unwrap_or("");
                let found = first_non_ascii_byte_sse2(s.as_bytes());
                assert_eq!(
                    i.saturating_sub(align),
                    found,
                    "i: {:?}, align: {:?}, len: {:?}, s: {:?}",
                    i,
                    align,
                    s.len(),
                    s
                );
            }
        }
    }
}

View File

@ -119,11 +119,10 @@ pub use token::{StringKind, Tok, TokenKind};
use crate::lexer::LexResult;
mod function;
// Skip flattening lexer to distinguish from full ruff_python_parser
mod ascii;
mod context;
mod function;
mod invalid;
// Skip flattening lexer to distinguish from full ruff_python_parser
pub mod lexer;
mod parser;
mod soft_keywords;

View File

@ -1,9 +1,10 @@
//! Parsing of string literals, bytes literals, and implicit string concatenation.
use bstr::ByteSlice;
use ruff_python_ast::{self as ast, Expr};
use ruff_text_size::{Ranged, TextRange, TextSize};
use crate::ascii::first_non_ascii_byte;
use crate::lexer::{LexicalError, LexicalErrorType};
use crate::token::{StringKind, Tok};
@ -218,9 +219,9 @@ impl StringParser {
let mut value = String::with_capacity(self.source.len());
loop {
// Add the characters before the escape sequence to the string.
let before_with_slash = self.skip_bytes(index + 1);
let before = &before_with_slash[..before_with_slash.len() - 1];
// Add the characters before the escape sequence (or curly brace) to the string.
let before_with_slash_or_brace = self.skip_bytes(index + 1);
let before = &before_with_slash_or_brace[..before_with_slash_or_brace.len() - 1];
value.push_str(before);
// Add the escaped character to the string.
@ -284,14 +285,13 @@ impl StringParser {
}
Ok(ast::FStringElement::Literal(ast::FStringLiteralElement {
value,
value: value.into_boxed_str(),
range: self.range,
}))
}
fn parse_bytes(mut self) -> Result<StringType, LexicalError> {
let index = first_non_ascii_byte(self.source.as_bytes());
if index < self.source.len() {
if let Some(index) = self.source.as_bytes().find_non_ascii_byte() {
return Err(LexicalError::new(
LexicalErrorType::OtherError(
"bytes can only contain ASCII literal characters"
@ -305,7 +305,7 @@ impl StringParser {
if self.kind.is_raw() {
// For raw strings, no escaping is necessary.
return Ok(StringType::Bytes(ast::BytesLiteral {
value: self.source.into_bytes(),
value: self.source.into_boxed_bytes(),
range: self.range,
}));
}
@ -313,7 +313,7 @@ impl StringParser {
let Some(mut escape) = memchr::memchr(b'\\', self.source.as_bytes()) else {
// If the string doesn't contain any escape sequences, return the owned string.
return Ok(StringType::Bytes(ast::BytesLiteral {
value: self.source.into_bytes(),
value: self.source.into_boxed_bytes(),
range: self.range,
}));
};
@ -349,7 +349,7 @@ impl StringParser {
}
Ok(StringType::Bytes(ast::BytesLiteral {
value,
value: value.into_boxed_slice(),
range: self.range,
}))
}