This commit is contained in:
Leonard Hecker 2025-07-28 02:00:48 +02:00
parent 86b976e811
commit 1ed23edae4
7 changed files with 688 additions and 501 deletions

View File

@ -1,3 +0,0 @@
[unstable]
build-std = ["std"]
build-std-features = ["default", "core/debug_refcell"]

File diff suppressed because it is too large Load Diff

View File

@ -142,101 +142,68 @@ impl<'doc> Highlighter<'doc> {
}
let line_buf = unicode::strip_newline(&line_buf);
let mut off = 0;
if self.kind != HighlightKind::Other {
res.push(Higlight { start: 0, kind: self.kind });
}
let mut off = 0usize;
let mut start = 0usize;
let mut state = self.state;
let mut kind = self.kind;
loop {
let starter = &self.starter[self.state];
while off < line_buf.len() && starter[line_buf[off] as usize] == 0 {
off += 1;
}
let mut end = off;
let start = off;
let mut hit = None;
for t in self.language.states[self.state] {
for t in self.language.states[state] {
match t.0 {
Consume::Chars(n) => {
off = off.saturating_add(n);
hit = Some(t);
break;
end = end.saturating_add(n);
}
Consume::Prefix(str) => {
if line_buf[off..].starts_with(str.as_bytes()) {
off += str.len();
hit = Some(t);
break;
if !line_buf[end..].starts_with(str.as_bytes()) {
continue;
}
end += str.len();
}
Consume::PrefixInsensitive(str) => {
if line_buf[off..].starts_with_ignore_ascii_case(str) {
off += str.len();
hit = Some(t);
break;
if !line_buf[end..].starts_with_ignore_ascii_case(str) {
continue;
}
end += str.len();
}
Consume::Charset(cs) => {
if off < line_buf.len() && cs[line_buf[off] as usize] != 0 {
while {
off += 1;
off < line_buf.len() && cs[line_buf[off] as usize] != 0
} {}
hit = Some(t);
break;
// TODO: http://0x80.pl/notesen/2018-10-18-simd-byte-lookup.html#alternative-implementation
if end >= line_buf.len() || cs[line_buf[end] as usize] == 0 {
continue;
}
}
};
}
if let Some(t) = hit {
// If this transition changes the HighlightKind,
// we need to split the current run and add a new one.
if self.kind != t.1 {
if let Some(last) = res.last_mut()
&& last.start == start
{
last.kind = t.1;
} else {
res.push(Higlight { start, kind: t.1 });
while {
end += 1;
end < line_buf.len() && cs[line_buf[end] as usize] != 0
} {}
}
}
match t.2 {
Action::Change(to) => {
if let Some(last) = res.last_mut() {
last.kind = t.1;
}
self.state = to as usize;
self.kind = t.1;
state = to as usize;
kind = t.1.unwrap_or(kind);
}
Action::Push(to) => {
self.state_stack.push((self.state, self.kind));
self.state = to as usize;
self.kind = t.1;
res.push(Higlight { start, kind });
self.state_stack.push((state, kind));
start = off;
state = to as usize;
kind = t.1.unwrap_or(kind);
}
Action::Pop => {
self.state_stack.pop();
(self.state, self.kind) =
self.state_stack.last().copied().unwrap_or_default();
kind = t.1.unwrap_or(kind);
res.push(Higlight { start, kind });
// This may have changed the HighlightKind yet again.
if self.kind != t.1 {
if let Some(last) = res.last_mut()
&& last.start == off
{
last.kind = self.kind;
} else {
res.push(Higlight { start: off, kind: self.kind });
}
}
start = end;
(state, kind) = self.state_stack.last().copied().unwrap_or_default();
self.state_stack.pop();
}
}
} else {
// False starter hit.
off += 1;
off = end;
break;
}
if off >= line_buf.len() {
@ -244,15 +211,16 @@ impl<'doc> Highlighter<'doc> {
}
}
if res.last().is_some_and(|last| last.start != line_buf.len()) {
res.push(Higlight { start: line_buf.len(), kind: self.kind });
}
res.push(Higlight { start, kind });
res.push(Higlight { start: line_buf.len(), kind });
// Adjust the range to account for the line offset.
for h in &mut res {
h.start = line_beg + h.start.min(line_buf.len());
}
self.state = state;
self.kind = kind;
res
}
}

View File

@ -14,21 +14,6 @@ pub enum HighlightKind {
Method,
}
impl HighlightKind {
    /// Returns the variant's name as a static string.
    ///
    /// The returned text is exactly the identifier of the variant
    /// (`"Other"`, `"Comment"`, ...).
    pub fn as_str(self) -> &'static str {
        // The variants are spelled with `Self::` on purpose: a bare identifier
        // in pattern position only refers to a variant while a matching glob
        // import is in scope. Without one it turns into a catch-all binding.
        match self {
            Self::Other => "Other",
            Self::Comment => "Comment",
            Self::Number => "Number",
            Self::String => "String",
            Self::Variable => "Variable",
            Self::Operator => "Operator",
            Self::Keyword => "Keyword",
            Self::Method => "Method",
        }
    }
}
pub struct Language {
#[allow(dead_code)]
pub name: &'static str,
@ -60,7 +45,9 @@ pub const LANGUAGES: &[Language] = &[
(r#"//.*"#, Comment, Pop),
(r#"/\*"#, Comment, Push("comment")),
(r#"""#, String, Push("string")),
(r#"(?:-\d+|\d+)(?:\.\d+)?(?:[eE][+-]?\d+)?\w+"#, Other, Pop),
(r#"(?:-\d+|\d+)(?:\.\d+)?(?:[eE][+-]?\d+)?"#, Number, Pop),
(r#"(?:true|false|null)\w+"#, Other, Pop),
(r#"(?:true|false|null)"#, Keyword, Pop),
],
},
@ -158,12 +145,12 @@ pub const LANGUAGES: &[Language] = &[
(r#"-\w+"#, Operator, Pop),
(r#"[!*/%+<=>|]"#, Operator, Pop),
(
r#"(?i:break|catch|continue|do|else|finally|foreach|function|if|return|switch|throw|try|using|while)[\w-]+"#,
r#"(?i:break|catch|continue|do|elseif|else|finally|foreach|function|if|return|switch|throw|try|using|while)[\w-]+"#,
Method,
Pop,
),
(
r#"(?i:break|catch|continue|do|else|finally|foreach|function|if|return|switch|throw|try|using|while)"#,
r#"(?i:break|catch|continue|do|elseif|else|finally|foreach|function|if|return|switch|throw|try|using|while)"#,
Keyword,
Pop,
),
@ -202,11 +189,13 @@ pub const LANGUAGES: &[Language] = &[
State {
name: "ground",
rules: &[
(r#"rem\S+"#, Other, Pop),
(r#"rem.*"#, Comment, Pop),
(r#"(?i:rem)\S+"#, Other, Pop),
(r#"(?i:rem).*"#, Comment, Pop),
(r#"::.*"#, Comment, Pop),
(r#"""#, String, Push("string")),
(r#"[!*/%+<=>|]"#, Operator, Pop),
(r#"%%"#, Other, Pop),
(r#"%"#, Variable, Push("variable")),
(r#"[!*/+<=>|]"#, Operator, Pop),
(
r"(?i:break|call|cd|chdir|cls|copy|del|dir|echo|exit|for|goto|if|md|mkdir|move|pause|ren|set)\w+",
Other,
@ -225,6 +214,7 @@ pub const LANGUAGES: &[Language] = &[
rules: &[(r#"""#, String, Pop), (r#"\\"#, String, Push("string_escape"))],
},
State { name: "string_escape", rules: &[(r#"."#, String, Pop)] },
State { name: "variable", rules: &[(r#"%"#, Variable, Pop)] },
],
},
];

View File

@ -31,19 +31,23 @@ fn main() {
use std::ops::RangeInclusive;
use Action::*;
use Consume::*;
use HighlightKind::*;
pub struct Language {
pub name: &'static str,
pub extensions: &'static [&'static str],
pub states: &'static [&'static [Transition<'static>]],
}
pub type Transition<'a> = (Consume<'a>, HighlightKind, Action);
pub type Transition<'a> = (Consume<'a>, Option<HighlightKind>, Action);
pub enum Consume<'a> {
Chars(usize),
Charset(&'a [u8; 256]),
Prefix(&'a str),
PrefixInsensitive(&'a str),
Charset(&'a [u8; 256]),
}
#[derive(Debug, Default, Clone, Copy, PartialEq, Eq)]
@ -125,30 +129,26 @@ fn main() {
for t in &state.transitions {
let test = match &t.test {
GraphConsume::Chars(usize::MAX) => "Consume::Chars(usize::MAX)".to_string(),
GraphConsume::Chars(usize::MAX) => "Chars(usize::MAX)".to_string(),
GraphConsume::Chars(n) => {
format!("Consume::Chars({n})")
}
GraphConsume::Prefix(s) => {
format!("Consume::Prefix(r#\"{s}\"#)")
}
GraphConsume::PrefixInsensitive(s) => {
format!("Consume::PrefixInsensitive(r#\"{s}\"#)")
format!("Chars({n})")
}
GraphConsume::Charset(cs) => {
format!("Consume::Charset(LANG_{}_CHARSET_{})", name_uppercase, cs.id())
format!("Charset(LANG_{}_CHARSET_{})", name_uppercase, cs.id())
}
GraphConsume::Prefix(s) => {
format!("Prefix(r#\"{s}\"#)")
}
GraphConsume::PrefixInsensitive(s) => {
format!("PrefixInsensitive(r#\"{s}\"#)")
}
};
let action = match &t.action {
GraphAction::Change(next) => format!("Action::Change({})", next.borrow().id),
GraphAction::Push(next) => format!("Action::Push({})", next.borrow().id),
GraphAction::Pop => "Action::Pop".to_string(),
GraphAction::Change(next) => format!("Change({})", next.borrow().id),
GraphAction::Push(next) => format!("Push({})", next.borrow().id),
GraphAction::Pop => "Pop".to_string(),
};
_ = writeln!(
output,
r#" ({test}, HighlightKind::{kind}, {action}),"#,
kind = t.kind.as_str()
);
_ = writeln!(output, r#" ({test}, {kind:?}, {action}),"#, kind = t.kind);
}
_ = writeln!(output, r#" ],"#);

View File

@ -12,7 +12,7 @@ use crate::definitions::*;
pub struct GraphBuilder {
roots: Vec<Rc<WipStateCell>>,
states: Vec<Weak<WipStateCell>>,
charsets: Vec<Weak<Charset>>,
charsets: CharsetInterner,
origin: i32,
kind: HighlightKind,
@ -23,7 +23,7 @@ impl GraphBuilder {
GraphBuilder {
roots: Vec::with_capacity(16),
states: Vec::with_capacity(16),
charsets: Vec::with_capacity(16),
charsets: CharsetInterner { charsets: Vec::with_capacity(16) },
origin: -1,
kind: HighlightKind::Other,
@ -126,7 +126,7 @@ impl GraphBuilder {
class: &ClassBytes,
) -> GraphAction {
let c = self.class_to_charset(class);
let c = self.intern_charset(c);
let c = self.charsets.intern(c);
self.add_transition(src, dst, GraphConsume::Charset(c))
}
@ -295,19 +295,6 @@ impl GraphBuilder {
charset
}
fn intern_charset(&mut self, mut charset: Charset) -> Rc<Charset> {
if let Some(rc) = self.charsets.iter().filter_map(|w| w.upgrade()).find(|c| **c == charset)
{
return rc;
}
charset.id = self.charsets.len();
let rc = Rc::new(charset);
self.charsets.push(Rc::downgrade(&rc));
rc
}
fn add_state(&mut self, depth: usize) -> Rc<WipStateCell> {
let s = Rc::new(WipStateCell::new(GraphState {
id: 0,
@ -358,16 +345,16 @@ impl GraphBuilder {
if match (&t.test, &test) {
(Chars(_), _) => true,
(Charset(p), Charset(n)) => {
// If all the bits in `n` are also true in `p`
n.iter().zip(p.iter()).all(|(n, p)| !n || p)
}
(Prefix(p), Prefix(s)) => s.starts_with(p.as_str()),
(PrefixInsensitive(p), Prefix(s) | PrefixInsensitive(s)) => {
let s = s.as_bytes();
let p = p.as_bytes();
p.len() <= s.len() && s[..p.len()].eq_ignore_ascii_case(p)
}
(Charset(p), Charset(n)) => {
// If all the bits in `n` are also true in `p`
n.iter().zip(p.iter()).all(|(n, p)| !n || p)
}
_ => false,
} {
panic!(
@ -380,18 +367,19 @@ impl GraphBuilder {
s.transitions.push(GraphTransition {
origin: self.origin,
test,
kind: self.kind,
kind: Some(self.kind),
action: dst.clone(),
});
dst
}
pub fn finalize(&mut self) {
// Compute existing charset coverage.
// Technically we don't need to do that for the root states.
for s in &self.roots {
let mut s = s.borrow_mut();
s.coverage.fill(true);
}
for s in &self.states[self.roots.len()..] {
let Some(s) = s.upgrade() else {
continue;
@ -414,10 +402,12 @@ impl GraphBuilder {
}
}
// Add fallbacks from earlier regexes to later regexes that cover them.
for root in &self.roots {
Self::fallback_find(root.clone());
}
// Add always-matching fallbacks to all remaining ones.
for s in &self.states[self.roots.len()..] {
let Some(s) = s.upgrade() else {
continue;
@ -428,12 +418,63 @@ impl GraphBuilder {
s.transitions.push(GraphTransition {
origin: -1,
test: GraphConsume::Chars(0),
kind: HighlightKind::Other,
kind: Some(HighlightKind::Other),
action: GraphAction::Pop,
});
}
}
// Compute fast skips for root states & fallback to 1 byte skip.
// This is different from coverage computation above, because here
// we want to "skip" any character that can never be matched.
for root in &self.roots {
let mut s = root.borrow_mut();
let mut cs = Charset::no();
for t in &s.transitions {
match &t.test {
GraphConsume::Chars(_) => {
cs.fill(true);
break;
}
GraphConsume::Charset(c) => {
cs.merge(c);
}
GraphConsume::Prefix(s) => {
let ch = s.as_bytes()[0];
cs.set(ch, true);
}
GraphConsume::PrefixInsensitive(s) => {
let ch = s.as_bytes()[0];
cs.set(ch.to_ascii_uppercase(), true);
cs.set(ch.to_ascii_lowercase(), true);
}
}
}
if !cs.covers_all() {
cs.invert();
let cs = self.charsets.intern(cs);
s.transitions.insert(
0,
GraphTransition {
origin: -1,
test: GraphConsume::Charset(cs),
kind: None,
action: GraphAction::Change(root.clone()),
},
);
s.transitions.push(GraphTransition {
origin: -1,
test: GraphConsume::Chars(1),
kind: None,
action: GraphAction::Pop,
});
}
}
// Assign IDs to states.
let mut id = 0;
for s in &self.states {
let Some(s) = s.upgrade() else {
@ -538,6 +579,7 @@ impl GraphBuilder {
let label = match &t.test {
GraphConsume::Chars(usize::MAX) => "Chars(Line)".to_string(),
GraphConsume::Chars(n) => format!("Chars({n})"),
GraphConsume::Charset(c) => format!("Charset({c:?})"),
GraphConsume::Prefix(s) => {
let mut label = String::new();
_ = write!(label, "Prefix({s}");
@ -582,7 +624,6 @@ impl GraphBuilder {
label.push(')');
label
}
GraphConsume::Charset(c) => format!("Charset({c:?})"),
};
let dst = match &t.action {
@ -604,9 +645,9 @@ impl GraphBuilder {
let label = escape(&label);
_ = writeln!(
&mut visitor.output,
" {src} -->|\"{label}<br/>{kind}\"| {dst}",
" {src} -->|\"{label}<br/>{kind:?}\"| {dst}",
src = src.borrow().id,
kind = t.kind.as_str()
kind = t.kind,
);
if let GraphAction::Change(dst) = &t.action {
@ -634,7 +675,7 @@ impl GraphBuilder {
}
pub fn charsets(&self) -> Vec<Rc<Charset>> {
self.charsets.iter().filter_map(Weak::upgrade).collect()
self.charsets.extract()
}
pub fn states(&self) -> Vec<Rc<WipStateCell>> {
@ -642,6 +683,29 @@ impl GraphBuilder {
}
}
/// Deduplicating registry of charsets.
///
/// Entries are held as `Weak` references, so the registry by itself does not
/// keep a charset alive.
struct CharsetInterner {
    charsets: Vec<Weak<Charset>>,
}

impl CharsetInterner {
    /// Returns a shared handle for `charset`.
    ///
    /// If an equal charset is still alive, its existing handle is returned.
    /// Otherwise `charset` receives the current number of registry entries as
    /// its `id`, is wrapped in an `Rc`, and is recorded weakly.
    pub fn intern(&mut self, mut charset: Charset) -> Rc<Charset> {
        for entry in &self.charsets {
            if let Some(live) = entry.upgrade() {
                if *live == charset {
                    return live;
                }
            }
        }

        charset.id = self.charsets.len();
        let shared = Rc::new(charset);
        self.charsets.push(Rc::downgrade(&shared));
        shared
    }

    /// Collects strong handles to every registered charset that is still
    /// alive, in registration order.
    pub fn extract(&self) -> Vec<Rc<Charset>> {
        let mut alive = Vec::new();
        for entry in &self.charsets {
            if let Some(live) = entry.upgrade() {
                alive.push(live);
            }
        }
        alive
    }
}
#[derive(Debug, Default)]
pub struct GraphState {
pub id: usize,
@ -689,16 +753,16 @@ impl Eq for GraphAction {}
pub enum GraphConsume {
// Same as super::Consume
Chars(usize),
Charset(Rc<Charset>),
Prefix(String),
PrefixInsensitive(String),
Charset(Rc<Charset>),
}
#[derive(Debug, Clone)]
pub struct GraphTransition {
origin: i32,
pub test: GraphConsume,
pub kind: HighlightKind,
pub kind: Option<HighlightKind>,
pub action: GraphAction,
}
@ -729,6 +793,16 @@ impl Charset {
self.bits.fill(value);
}
/// Flips every entry of the set: members become non-members and vice versa.
pub fn invert(&mut self) {
    self.bits.iter_mut().for_each(|bit| *bit = !*bit);
}
/// Stores `value` as the membership flag of the byte `index`.
pub fn set(&mut self, index: u8, value: bool) {
    let slot = usize::from(index);
    self.bits[slot] = value;
}
pub fn merge(&mut self, other: &Charset) {
for (a, b) in self.bits.iter_mut().zip(other.bits.iter()) {
*a |= *b;

23
vectorized_match.rs Normal file
View File

@ -0,0 +1,23 @@
/// Classifies 16 bytes at once against a 256-entry byte set (x86-64, SSSE3).
///
/// For every input byte, with `lo` its low nibble and `hi` its high nibble:
///
/// * `row` is `bitmap_0_7[lo]` when `hi < 8`, otherwise `bitmap_8_15[lo]`;
/// * `mask` is `bitmask_lookup[hi]`;
/// * the corresponding output lane is `0xFF` when `row & mask == mask` and
///   `0x00` otherwise (so a zero `mask` always yields `0xFF`).
///
/// NOTE(review): the parameter names suggest the layout from the "SIMD byte
/// lookup" write-up, i.e. `bitmask_lookup[hi] == 1 << (hi & 7)` and bit
/// `hi & 7` of the selected row marking membership. Confirm against the code
/// that builds the tables.
///
/// # Safety
///
/// The CPU executing this function must support SSSE3.
#[inline]
#[target_feature(enable = "ssse3")]
unsafe fn vectorized_match(
    input: &[u8; 16],
    bitmap_0_7: &[u8; 16],
    bitmap_8_15: &[u8; 16],
    bitmask_lookup: &[u8; 16],
) -> std::arch::x86_64::__m128i {
    use std::arch::x86_64::*;

    // SAFETY: every reference points at exactly 16 readable bytes and
    // `_mm_loadu_si128` has no alignment requirement. SSSE3 availability is
    // the caller's obligation (see `# Safety`).
    unsafe {
        let input = _mm_loadu_si128(input.as_ptr().cast());
        let bitmap_0_7 = _mm_loadu_si128(bitmap_0_7.as_ptr().cast());
        let bitmap_8_15 = _mm_loadu_si128(bitmap_8_15.as_ptr().cast());
        let bitmask_lookup = _mm_loadu_si128(bitmask_lookup.as_ptr().cast());

        // `_mm_set1_epi8` takes an `i8`; 0x80 and 0x8f do not fit as plain
        // literals, so they are spelled as bit-pattern casts.
        let msb = _mm_set1_epi8(0x80_u8 as i8);
        let low_nibble_and_msb = _mm_set1_epi8(0x8f_u8 as i8);

        // There is no 8-bit shift, so shift 16-bit lanes and mask off the
        // bits that crossed over from the neighbouring byte.
        let higher_nibbles = _mm_and_si128(_mm_srli_epi16::<4>(input), _mm_set1_epi8(0x0f));

        // `pshufb` yields 0 for an index whose top bit is set. Keeping the
        // input's top bit therefore disables the 0..7 table for bytes >= 0x80.
        let indices_0_7 = _mm_and_si128(input, low_nibble_and_msb);
        // Flipping the top bit disables the 8..15 table for bytes < 0x80
        // instead. XOR-ing with the *input's* top bit would merely clear it
        // and consult this table for every byte.
        let indices_8_15 = _mm_xor_si128(indices_0_7, msb);

        let row_0_7 = _mm_shuffle_epi8(bitmap_0_7, indices_0_7);
        let row_8_15 = _mm_shuffle_epi8(bitmap_8_15, indices_8_15);
        let bitmask = _mm_shuffle_epi8(bitmask_lookup, higher_nibbles);

        // At most one of the two rows is non-zero per lane, so OR selects it.
        let bitsets = _mm_or_si128(row_0_7, row_8_15);
        let tmp = _mm_and_si128(bitsets, bitmask);
        _mm_cmpeq_epi8(tmp, bitmask)
    }
}