feat: implement basic syntax highlighting with tokenization

Ayush Muley 2025-11-25 06:56:33 +00:00
parent beb9ca69ec
commit 044e72a0f3
3 changed files with 251 additions and 2 deletions


@@ -33,6 +33,7 @@ use std::mem::{self, MaybeUninit};
use std::ops::Range;
use std::rc::Rc;
use std::str;
use std::collections::HashMap;
pub use gap_buffer::GapBuffer;
@@ -46,6 +47,7 @@ use crate::oklab::StraightRgba;
use crate::simd::memchr2;
use crate::unicode::{self, Cursor, MeasurementConfig, Utf8Chars};
use crate::{apperr, icu, simd};
use crate::syntax;
/// The margin template is used for line numbers.
/// The max. line number we should ever expect is probably 64-bit,
@@ -245,6 +247,9 @@ pub struct TextBuffer {
overtype: bool,
wants_cursor_visibility: bool,
// Cache of tokenization results keyed by the starting byte-offset of
// the displayed fragment.
token_cache: HashMap<usize, Vec<crate::syntax::Token>>,
}
impl TextBuffer {
@@ -293,6 +298,7 @@ impl TextBuffer {
overtype: false,
wants_cursor_visibility: false,
token_cache: HashMap::new(),
})
}
@@ -655,6 +661,8 @@ impl TextBuffer {
self.cursor = Default::default();
self.set_selection(None);
self.mark_as_clean();
// Clear token cache because the whole buffer changed.
self.token_cache.clear();
self.reflow();
}
@@ -1963,9 +1971,38 @@ impl TextBuffer {
visual_pos_x_max = visual_pos_x_max.max(cursor_end.visual_pos.x);
}
fb.replace_text(destination.top + y, destination.left, destination.right, &line);
// Basic generic syntax highlighting (display-line tokenizer).
// Use a per-fragment cache keyed by the starting byte offset of the
// displayed fragment (`cursor_beg.offset`). This avoids re-tokenizing
// unchanged fragments.
let start_offset = cursor_beg.offset;
let tokens = self
.token_cache
.entry(start_offset)
.or_insert_with(|| crate::syntax::tokenize_display_line(&line))
.clone();
for tok in tokens.iter() {
if matches!(tok.kind, crate::syntax::TokenKind::Whitespace) {
continue;
}
let left = destination.left + self.margin_width + tok.start as CoordType;
let right = left + (tok.end.saturating_sub(tok.start)) as CoordType;
if left >= destination.right || right <= destination.left {
continue;
}
let rect = Rect {
left: left.max(destination.left),
top: destination.top + y,
right: right.min(destination.right),
bottom: destination.top + y + 1,
};
let color = crate::syntax::token_kind_color(tok.kind);
fb.blend_fg(rect, fb.indexed(color));
}
cursor = cursor_end;
}
// Colorize the margin that we wrote above.
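Note: the lookup-or-tokenize step above is plain memoization over a HashMap. A self-contained sketch of the same pattern, with stand-in types (the names here are illustrative, not the editor's actual API):

use std::collections::HashMap;

// Stand-ins for the crate's syntax::TokenKind / syntax::Token.
#[derive(Clone, Copy, PartialEq, Eq)]
enum TokenKind { Word, Other }

#[derive(Clone)]
struct Token { kind: TokenKind, start: usize, end: usize }

// Trivial per-character tokenizer standing in for tokenize_display_line().
fn tokenize(line: &str) -> Vec<Token> {
    line.char_indices()
        .map(|(i, c)| Token {
            kind: if c.is_alphanumeric() { TokenKind::Word } else { TokenKind::Other },
            start: i,
            end: i + c.len_utf8(),
        })
        .collect()
}

struct Cache {
    // Keyed by the byte offset at which the displayed fragment starts,
    // mirroring the `token_cache` field in the diff above.
    tokens: HashMap<usize, Vec<Token>>,
}

impl Cache {
    fn tokens_for(&mut self, start_offset: usize, line: &str) -> &[Token] {
        // Tokenize only on a cache miss; repainting an unchanged
        // fragment reuses the stored tokens.
        self.tokens.entry(start_offset).or_insert_with(|| tokenize(line))
    }
}

fn main() {
    let mut cache = Cache { tokens: HashMap::new() };
    let line = "hello world";
    let n = cache.tokens_for(0, line).len(); // miss: tokenizes
    let words = cache
        .tokens_for(0, line) // hit: reuses the cached vector
        .iter()
        .filter(|t| t.kind == TokenKind::Word)
        .count();
    println!("{n} tokens, {words} word characters");
}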
@@ -2611,6 +2648,15 @@ impl TextBuffer {
fn edit_write(&mut self, text: &[u8]) {
let logical_y_before = self.cursor.logical_pos.y;
// Invalidate token cache entries for fragments that start at or after
// the line containing the active edit offset, so a small edit only
// drops the affected tail of the cache instead of clearing all of it.
let off = self.active_edit_off;
let cursor_at_off = self.cursor_move_to_offset_internal(self.cursor, off);
let start_cursor = self.goto_line_start(cursor_at_off, cursor_at_off.logical_pos.y);
let start_off = start_cursor.offset;
self.token_cache.retain(|&k, _| k < start_off);
// Copy the written portion into the undo entry.
{
let mut undo = self.undo_stack.back_mut().unwrap().borrow_mut();
@@ -2636,6 +2682,14 @@
let off = self.active_edit_off;
let mut out_off = usize::MAX;
// Invalidate token cache entries starting at/after the line that contains
// the deletion start offset (`off`). This prevents stale tokens from
// being reused after deletion.
let cursor_at_off = self.cursor_move_to_offset_internal(self.cursor, off);
let start_cursor = self.goto_line_start(cursor_at_off, cursor_at_off.logical_pos.y);
let start_off = start_cursor.offset;
self.token_cache.retain(|&k, _| k < start_off);
let mut undo = self.undo_stack.back_mut().unwrap().borrow_mut();
// If this is a continued backspace operation,
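Note: both edit_write and edit_delete above apply the same invalidation rule: drop every cache entry whose key is at or past the start of the line containing the edit. A minimal sketch of that rule in isolation (the offsets are made up):

use std::collections::HashMap;

// Drop cached token runs for every fragment that starts at or after the
// beginning of the edited line; entries before it remain valid.
fn invalidate_from(cache: &mut HashMap<usize, Vec<u32>>, edited_line_start: usize) {
    cache.retain(|&start_offset, _| start_offset < edited_line_start);
}

fn main() {
    let mut cache = HashMap::new();
    cache.insert(0, vec![1]);   // line starting at byte 0
    cache.insert(120, vec![2]); // line starting at byte 120
    cache.insert(480, vec![3]); // line starting at byte 480
    invalidate_from(&mut cache, 120); // an edit lands on the line at byte 120
    assert!(cache.contains_key(&0));
    assert!(!cache.contains_key(&120));
    assert!(!cache.contains_key(&480));
}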


@@ -39,3 +39,4 @@ pub mod sys;
pub mod tui;
pub mod unicode;
pub mod vt;
pub mod syntax;

src/syntax.rs (new file, 194 lines)

@@ -0,0 +1,194 @@
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
use crate::framebuffer::IndexedColor;
/// A token kind for the display-level generic tokenizer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
Comment,
String,
Number,
Identifier,
Punctuation,
Whitespace,
Other,
}
/// A token within a display line, with bounds counted in characters;
/// for ASCII text these coincide with byte offsets.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Token {
pub kind: TokenKind,
/// Start column (inclusive) within the display line.
pub start: usize,
/// End column (exclusive) within the display line.
pub end: usize,
}
/// Simple, fast, single-pass tokenizer that operates on the already-processed
/// display line (tabs expanded, control glyphs replaced). It intentionally
/// keeps things minimal and avoids allocations where possible.
pub fn tokenize_display_line(line: &str) -> Vec<Token> {
let mut out = Vec::new();
let mut chars = line.chars().peekable();
let mut col = 0usize;
while let Some(&ch) = chars.peek() {
// Determine token start at current column.
let start = col;
if ch.is_whitespace() {
// Whitespace run
let mut len = 0usize;
while let Some(&c) = chars.peek() {
if !c.is_whitespace() { break; }
chars.next();
len += 1;
}
out.push(Token { kind: TokenKind::Whitespace, start, end: start + len });
col += len;
continue;
}
// Line comment: starts with '#' or a '//' sequence.
if ch == '#' {
// Consume the rest of the line as the comment token.
let len = chars.by_ref().count();
out.push(Token { kind: TokenKind::Comment, start, end: start + len });
break;
}
if ch == '/' {
chars.next();
if let Some(&'/') = chars.peek() {
// A '//' comment: the two slashes plus the rest of the line.
chars.next();
let len = 2 + chars.by_ref().count();
out.push(Token { kind: TokenKind::Comment, start, end: start + len });
break;
}
// A lone '/' is plain punctuation.
out.push(Token { kind: TokenKind::Punctuation, start, end: start + 1 });
col += 1;
continue;
}
// Strings: "..." or '...'
if ch == '"' || ch == '\'' {
let quote = ch;
chars.next();
let mut len = 1usize;
let mut escaped = false;
while let Some(c) = chars.next() {
len += 1;
if escaped {
escaped = false;
continue;
}
if c == '\\' {
escaped = true;
continue;
}
if c == quote {
break;
}
}
out.push(Token { kind: TokenKind::String, start, end: start + len });
col += len;
continue;
}
// Numbers: start with a digit; '.' and '_' are consumed as part of the run.
if ch.is_ascii_digit() {
let mut len = 0usize;
while let Some(&c) = chars.peek() {
if !(c.is_ascii_digit() || c == '.' || c == '_') {
break;
}
chars.next();
len += 1;
}
out.push(Token { kind: TokenKind::Number, start, end: start + len });
col += len;
continue;
}
// Identifiers: start with a letter or underscore.
if ch.is_alphabetic() || ch == '_' {
let mut len = 0usize;
while let Some(&c) = chars.peek() {
if !(c.is_alphanumeric() || c == '_') {
break;
}
chars.next();
len += 1;
}
out.push(Token { kind: TokenKind::Identifier, start, end: start + len });
col += len;
continue;
}
// Fallback: any other single character is treated as punctuation.
chars.next();
out.push(Token { kind: TokenKind::Punctuation, start, end: start + 1 });
col += 1;
}
out
}
/// Maps token kinds to an `IndexedColor` from the basic 8-color palette.
pub fn token_kind_color(kind: TokenKind) -> IndexedColor {
match kind {
TokenKind::Comment => IndexedColor::Green,
TokenKind::String => IndexedColor::Red,
TokenKind::Number => IndexedColor::Magenta,
TokenKind::Identifier => IndexedColor::Cyan,
TokenKind::Punctuation => IndexedColor::Yellow,
TokenKind::Whitespace => IndexedColor::White,
TokenKind::Other => IndexedColor::White,
}
}
#[cfg(test)]
mod tests {
use super::*;
#[test]
fn tokenize_basic_line() {
let s = "let x = 42; // comment";
let toks = tokenize_display_line(s);
let kinds: Vec<TokenKind> = toks.iter().map(|t| t.kind).collect();
assert_eq!(kinds[0], TokenKind::Identifier); // "let"
assert_eq!(kinds[kinds.len() - 1], TokenKind::Comment);
// Verify the spans of a couple of tokens.
assert_eq!(toks[0].start, 0);
assert_eq!(toks[0].end, 3); // "let"
// number token should cover "42"
let num_tok = toks.iter().find(|t| t.kind == TokenKind::Number).unwrap();
assert_eq!(&s[num_tok.start..num_tok.end], "42");
}
#[test]
fn tokenize_string_and_ident() {
let s = "\"hello\" world";
let toks = tokenize_display_line(s);
assert_eq!(toks[0].kind, TokenKind::String);
assert_eq!(&s[toks[0].start..toks[0].end], "\"hello\"");
assert_eq!(toks[1].kind, TokenKind::Whitespace);
assert_eq!(toks[2].kind, TokenKind::Identifier);
assert_eq!(&s[toks[2].start..toks[2].end], "world");
}
#[test]
fn tokenize_hash_comment() {
let s = " #hi";
let toks = tokenize_display_line(s);
assert_eq!(toks[0].kind, TokenKind::Whitespace);
assert_eq!(toks[1].kind, TokenKind::Comment);
assert_eq!(&s[toks[1].start..toks[1].end], "#hi");
}
}
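
Note: a minimal way to exercise the module end to end; this main is illustrative and not part of the commit (the crate name `edit` is a guess, and the sample input is ASCII so column indices double as byte offsets):

use edit::syntax::{token_kind_color, tokenize_display_line, TokenKind};

fn main() {
    let line = "x = 42  # the answer";
    for tok in tokenize_display_line(line) {
        if tok.kind == TokenKind::Whitespace {
            continue; // mirrors the renderer, which skips whitespace tokens
        }
        // In the editor, this color would drive fb.blend_fg() over the
        // token's rectangle; here we just compute it.
        let _color = token_kind_color(tok.kind);
        println!("{:?}: {:?}", tok.kind, &line[tok.start..tok.end]);
    }
}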