feat: implement basic syntax highlighting with tokenization

Ayush Muley 2025-11-25 06:56:33 +00:00
parent beb9ca69ec
commit 044e72a0f3
3 changed files with 251 additions and 2 deletions


@@ -33,6 +33,7 @@ use std::mem::{self, MaybeUninit};
use std::ops::Range;
use std::rc::Rc;
use std::str;
use std::collections::HashMap;
pub use gap_buffer::GapBuffer;
@@ -46,6 +47,7 @@ use crate::oklab::StraightRgba;
use crate::simd::memchr2;
use crate::unicode::{self, Cursor, MeasurementConfig, Utf8Chars};
use crate::{apperr, icu, simd};
use crate::syntax;
/// The margin template is used for line numbers.
/// The max. line number we should ever expect is probably 64-bit,
@@ -245,6 +247,9 @@ pub struct TextBuffer {
overtype: bool,
wants_cursor_visibility: bool,
// Cache of tokenization results keyed by the starting byte-offset of
// the displayed fragment.
token_cache: HashMap<usize, Vec<crate::syntax::Token>>,
}
impl TextBuffer {
@@ -293,6 +298,7 @@ impl TextBuffer {
overtype: false,
wants_cursor_visibility: false,
token_cache: HashMap::new(),
})
}
@@ -655,6 +661,8 @@ impl TextBuffer {
self.cursor = Default::default();
self.set_selection(None);
self.mark_as_clean();
// Clear token cache because the whole buffer changed.
self.token_cache.clear();
self.reflow();
}
@@ -1963,9 +1971,38 @@ impl TextBuffer {
visual_pos_x_max = visual_pos_x_max.max(cursor_end.visual_pos.x);
}
fb.replace_text(destination.top + y, destination.left, destination.right, &line);
// Basic generic syntax highlighting (display-line tokenizer).
// Use a per-fragment cache keyed by the starting byte offset of the
// displayed fragment (`cursor_beg.offset`). This avoids re-tokenizing
// unchanged fragments.
let start_offset = cursor_beg.offset;
let tokens = self
.token_cache
.entry(start_offset)
.or_insert_with(|| crate::syntax::tokenize_display_line(&line))
.clone();
for tok in tokens.iter() {
if matches!(tok.kind, crate::syntax::TokenKind::Whitespace) {
continue;
}
let left = destination.left + self.margin_width + tok.start as CoordType;
let right = left + (tok.end.saturating_sub(tok.start)) as CoordType;
if left >= destination.right || right <= destination.left {
continue;
}
let rect = Rect {
left: left.max(destination.left),
top: destination.top + y,
right: right.min(destination.right),
bottom: destination.top + y + 1,
};
let color = crate::syntax::token_kind_color(tok.kind);
fb.blend_fg(rect, fb.indexed(color));
}
cursor = cursor_end;
}
// Colorize the margin that we wrote above.
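Note: the lookup-or-tokenize step above is plain memoization over a HashMap. A self-contained sketch of the same pattern, with stand-in types (the names here are illustrative, not the editor's actual API):

use std::collections::HashMap;

// Stand-ins for the crate's syntax::TokenKind / syntax::Token.
#[derive(Clone, Copy, PartialEq, Eq)]
enum TokenKind { Word, Other }

#[derive(Clone)]
struct Token { kind: TokenKind, start: usize, end: usize }

// Trivial per-character tokenizer standing in for tokenize_display_line().
fn tokenize(line: &str) -> Vec<Token> {
    line.char_indices()
        .map(|(i, c)| Token {
            kind: if c.is_alphanumeric() { TokenKind::Word } else { TokenKind::Other },
            start: i,
            end: i + c.len_utf8(),
        })
        .collect()
}

struct Cache {
    // Keyed by the byte offset at which the displayed fragment starts,
    // mirroring the `token_cache` field in the diff above.
    tokens: HashMap<usize, Vec<Token>>,
}

impl Cache {
    fn tokens_for(&mut self, start_offset: usize, line: &str) -> &[Token] {
        // Tokenize only on a cache miss; repainting an unchanged
        // fragment reuses the stored tokens.
        self.tokens.entry(start_offset).or_insert_with(|| tokenize(line))
    }
}

fn main() {
    let mut cache = Cache { tokens: HashMap::new() };
    let line = "hello world";
    let n = cache.tokens_for(0, line).len(); // miss: tokenizes
    let words = cache
        .tokens_for(0, line) // hit: reuses the cached vector
        .iter()
        .filter(|t| t.kind == TokenKind::Word)
        .count();
    println!("{n} tokens, {words} word characters");
}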
@@ -2611,6 +2648,15 @@ impl TextBuffer {
fn edit_write(&mut self, text: &[u8]) {
let logical_y_before = self.cursor.logical_pos.y;
// Invalidate token cache entries for fragments that start at or after
// the line containing the active edit offset, so a small edit only
// drops the affected tail of the cache instead of clearing all of it.
let off = self.active_edit_off;
let cursor_at_off = self.cursor_move_to_offset_internal(self.cursor, off);
let start_cursor = self.goto_line_start(cursor_at_off, cursor_at_off.logical_pos.y);
let start_off = start_cursor.offset;
self.token_cache.retain(|&k, _| k < start_off);
// Copy the written portion into the undo entry.
{
let mut undo = self.undo_stack.back_mut().unwrap().borrow_mut();
@@ -2636,6 +2682,14 @@
let off = self.active_edit_off;
let mut out_off = usize::MAX;
// Invalidate token cache entries starting at/after the line that contains
// the deletion start offset (`off`). This prevents stale tokens from
// being reused after deletion.
let cursor_at_off = self.cursor_move_to_offset_internal(self.cursor, off);
let start_cursor = self.goto_line_start(cursor_at_off, cursor_at_off.logical_pos.y);
let start_off = start_cursor.offset;
self.token_cache.retain(|&k, _| k < start_off);
let mut undo = self.undo_stack.back_mut().unwrap().borrow_mut();
// If this is a continued backspace operation,
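Note: both edit_write and edit_delete above apply the same invalidation rule: drop every cache entry whose key is at or past the start of the line containing the edit. A minimal sketch of that rule in isolation (the offsets are made up):

use std::collections::HashMap;

// Drop cached token runs for every fragment that starts at or after the
// beginning of the edited line; entries before it remain valid.
fn invalidate_from(cache: &mut HashMap<usize, Vec<u32>>, edited_line_start: usize) {
    cache.retain(|&start_offset, _| start_offset < edited_line_start);
}

fn main() {
    let mut cache = HashMap::new();
    cache.insert(0, vec![1]);   // line starting at byte 0
    cache.insert(120, vec![2]); // line starting at byte 120
    cache.insert(480, vec![3]); // line starting at byte 480
    invalidate_from(&mut cache, 120); // an edit lands on the line at byte 120
    assert!(cache.contains_key(&0));
    assert!(!cache.contains_key(&120));
    assert!(!cache.contains_key(&480));
}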


@@ -39,3 +39,4 @@ pub mod sys;
pub mod tui;
pub mod unicode;
pub mod vt;
pub mod syntax;

src/syntax.rs (new file, 194 lines)

@@ -0,0 +1,194 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
use crate::framebuffer::IndexedColor;
/// A token kind for the display-level generic tokenizer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
Comment,
String,
Number,
Identifier,
Punctuation,
Whitespace,
Other,
}
/// A token within a display line, with bounds counted in characters;
/// for ASCII text these coincide with byte offsets.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Token {
pub kind: TokenKind,
/// Start column (inclusive) within the display line.
pub start: usize,
/// End column (exclusive) within the display line.
pub end: usize,
}
/// Simple, fast, single-pass tokenizer that operates on the already-processed
/// display line (tabs expanded, control glyphs replaced). It intentionally
/// keeps things minimal and avoids allocations where possible.
pub fn tokenize_display_line(line: &str) -> Vec<Token> {
let mut out = Vec::new();
let mut chars = line.chars().peekable();
let mut col = 0usize;
while let Some(&ch) = chars.peek() {
// Determine token start at current column.
let start = col;
if ch.is_whitespace() {
// Whitespace run
let mut len = 0usize;
while let Some(&c) = chars.peek() {
if !c.is_whitespace() { break; }
chars.next();
len += 1;
}
out.push(Token { kind: TokenKind::Whitespace, start, end: start + len });
col += len;
continue;
}
// Line comment: starts with '#' or a '//' sequence.
if ch == '#' {
// Consume the rest of the line as the comment token.
let len = chars.by_ref().count();
out.push(Token { kind: TokenKind::Comment, start, end: start + len });
break;
}
if ch == '/' {
chars.next();
if let Some(&'/') = chars.peek() {
// A '//' comment: the two slashes plus the rest of the line.
chars.next();
let len = 2 + chars.by_ref().count();
out.push(Token { kind: TokenKind::Comment, start, end: start + len });
break;
}
// A lone '/' is plain punctuation.
out.push(Token { kind: TokenKind::Punctuation, start, end: start + 1 });
col += 1;
continue;
}
// Strings: "..." or '...'
if ch == '"' || ch == '\'' {
let quote = ch;
chars.next();
let mut len = 1usize;
let mut escaped = false;
while let Some(c) = chars.next() {
len += 1;
if escaped {
escaped = false;
continue;
}
if c == '\\' {
escaped = true;
continue;
}
if c == quote {
break;
}
}
out.push(Token { kind: TokenKind::String, start, end: start + len });
col += len;
continue;
}
// Numbers: start with a digit; '.' and '_' are consumed as part of the run.
if ch.is_ascii_digit() {
let mut len = 0usize;
while let Some(&c) = chars.peek() {
if !(c.is_ascii_digit() || c == '.' || c == '_') {
break;
}
chars.next();
len += 1;
}
out.push(Token { kind: TokenKind::Number, start, end: start + len });
col += len;
continue;
}
// Identifiers: start with a letter or underscore.
if ch.is_alphabetic() || ch == '_' {
let mut len = 0usize;
while let Some(&c) = chars.peek() {
if !(c.is_alphanumeric() || c == '_') {
break;
}
chars.next();
len += 1;
}
out.push(Token { kind: TokenKind::Identifier, start, end: start + len });
col += len;
continue;
}
// Fallback: any other single character is treated as punctuation.
chars.next();
out.push(Token { kind: TokenKind::Punctuation, start, end: start + 1 });
col += 1;
}
out
}
/// Maps token kinds to an `IndexedColor` from the basic 8-color palette.
pub fn token_kind_color(kind: TokenKind) -> IndexedColor {
match kind {
TokenKind::Comment => IndexedColor::Green,
TokenKind::String => IndexedColor::Red,
TokenKind::Number => IndexedColor::Magenta,
TokenKind::Identifier => IndexedColor::Cyan,
TokenKind::Punctuation => IndexedColor::Yellow,
TokenKind::Whitespace => IndexedColor::White,
TokenKind::Other => IndexedColor::White,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn tokenize_basic_line() {
let s = "let x = 42; // comment";
let toks = tokenize_display_line(s);
let kinds: Vec<TokenKind> = toks.iter().map(|t| t.kind).collect();
assert_eq!(kinds[0], TokenKind::Identifier); // "let"
assert_eq!(kinds[kinds.len() - 1], TokenKind::Comment);
// Verify the spans of a couple of tokens.
assert_eq!(toks[0].start, 0);
assert_eq!(toks[0].end, 3); // "let"
// number token should cover "42"
let num_tok = toks.iter().find(|t| t.kind == TokenKind::Number).unwrap();
assert_eq!(&s[num_tok.start..num_tok.end], "42");
}
#[test]
fn tokenize_string_and_ident() {
let s = "\"hello\" world";
let toks = tokenize_display_line(s);
assert_eq!(toks[0].kind, TokenKind::String);
assert_eq!(&s[toks[0].start..toks[0].end], "\"hello\"");
assert_eq!(toks[1].kind, TokenKind::Whitespace);
assert_eq!(toks[2].kind, TokenKind::Identifier);
assert_eq!(&s[toks[2].start..toks[2].end], "world");
}
#[test]
fn tokenize_hash_comment() {
let s = " #hi";
let toks = tokenize_display_line(s);
assert_eq!(toks[0].kind, TokenKind::Whitespace);
assert_eq!(toks[1].kind, TokenKind::Comment);
assert_eq!(&s[toks[1].start..toks[1].end], "#hi");
}
}
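
Note: a minimal way to exercise the module end to end; this main is illustrative and not part of the commit (the crate name `edit` is a guess, and the sample input is ASCII so column indices double as byte offsets):

use edit::syntax::{token_kind_color, tokenize_display_line, TokenKind};

fn main() {
    let line = "x = 42  # the answer";
    for tok in tokenize_display_line(line) {
        if tok.kind == TokenKind::Whitespace {
            continue; // mirrors the renderer, which skips whitespace tokens
        }
        // In the editor, this color would drive fb.blend_fg() over the
        // token's rectangle; here we just compute it.
        let _color = token_kind_color(tok.kind);
        println!("{:?}: {:?}", tok.kind, &line[tok.start..tok.end]);
    }
}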