mirror of https://github.com/microsoft/edit
feat: implement basic syntax highlighting with tokenization
This commit is contained in:
parent
beb9ca69ec
commit
044e72a0f3
|
|
@ -33,6 +33,7 @@ use std::mem::{self, MaybeUninit};
|
|||
use std::ops::Range;
|
||||
use std::rc::Rc;
|
||||
use std::str;
|
||||
use std::collections::HashMap;
|
||||
|
||||
pub use gap_buffer::GapBuffer;
|
||||
|
||||
|
|
@ -46,6 +47,7 @@ use crate::oklab::StraightRgba;
|
|||
use crate::simd::memchr2;
|
||||
use crate::unicode::{self, Cursor, MeasurementConfig, Utf8Chars};
|
||||
use crate::{apperr, icu, simd};
|
||||
use crate::syntax;
|
||||
|
||||
/// The margin template is used for line numbers.
|
||||
/// The max. line number we should ever expect is probably 64-bit,
|
||||
|
|
@ -245,6 +247,9 @@ pub struct TextBuffer {
|
|||
overtype: bool,
|
||||
|
||||
wants_cursor_visibility: bool,
|
||||
// Cache of tokenization results keyed by the starting byte-offset of
|
||||
// the displayed fragment.
|
||||
token_cache: HashMap<usize, Vec<crate::syntax::Token>>,
|
||||
}
|
||||
|
||||
impl TextBuffer {
|
||||
|
|
@ -293,6 +298,7 @@ impl TextBuffer {
|
|||
overtype: false,
|
||||
|
||||
wants_cursor_visibility: false,
|
||||
token_cache: HashMap::new(),
|
||||
})
|
||||
}
|
||||
|
||||
|
|
@ -655,6 +661,8 @@ impl TextBuffer {
|
|||
self.cursor = Default::default();
|
||||
self.set_selection(None);
|
||||
self.mark_as_clean();
|
||||
// Clear token cache because the whole buffer changed.
|
||||
self.token_cache.clear();
|
||||
self.reflow();
|
||||
}
|
||||
|
||||
|
|
@ -1963,9 +1971,38 @@ impl TextBuffer {
|
|||
visual_pos_x_max = visual_pos_x_max.max(cursor_end.visual_pos.x);
|
||||
}
|
||||
|
||||
fb.replace_text(destination.top + y, destination.left, destination.right, &line);
|
||||
fb.replace_text(destination.top + y, destination.left, destination.right, &line);
|
||||
|
||||
cursor = cursor_end;
|
||||
// Basic generic syntax highlighting (display-line tokenizer).
|
||||
// Use a per-fragment cache keyed by the starting byte offset of the
|
||||
// displayed fragment (`cursor_beg.offset`). This avoids re-tokenizing
|
||||
// unchanged fragments.
|
||||
let start_offset = cursor_beg.offset;
|
||||
let tokens = if let Some(cached) = self.token_cache.get(&start_offset) {
|
||||
cached.clone()
|
||||
} else {
|
||||
let t = crate::syntax::tokenize_display_line(&line);
|
||||
self.token_cache.insert(start_offset, t.clone());
|
||||
t
|
||||
};
|
||||
|
||||
for tok in tokens.iter() {
|
||||
if matches!(tok.kind, crate::syntax::TokenKind::Whitespace) {
|
||||
continue;
|
||||
}
|
||||
|
||||
let left = destination.left + self.margin_width + tok.start as CoordType;
|
||||
let right = left + (tok.end.saturating_sub(tok.start)) as CoordType;
|
||||
if left >= destination.right || right <= destination.left {
|
||||
continue;
|
||||
}
|
||||
|
||||
let rect = Rect { left: left.max(destination.left), top: destination.top + y, right: right.min(destination.right), bottom: destination.top + y + 1 };
|
||||
let color = crate::syntax::token_kind_color(tok.kind);
|
||||
fb.blend_fg(rect, fb.indexed(color));
|
||||
}
|
||||
|
||||
cursor = cursor_end;
|
||||
}
|
||||
|
||||
// Colorize the margin that we wrote above.
|
||||
|
|
@ -2611,6 +2648,15 @@ impl TextBuffer {
|
|||
fn edit_write(&mut self, text: &[u8]) {
|
||||
let logical_y_before = self.cursor.logical_pos.y;
|
||||
|
||||
// Invalidate token cache entries starting at/after the line that contains
|
||||
// the active edit offset. This makes the cache per-line relative to
|
||||
// fragment starting offsets and avoids full-cache clears for small edits.
|
||||
let off = self.active_edit_off;
|
||||
let cursor_at_off = self.cursor_move_to_offset_internal(self.cursor, off);
|
||||
let start_cursor = self.goto_line_start(cursor_at_off, cursor_at_off.logical_pos.y);
|
||||
let start_off = start_cursor.offset;
|
||||
self.token_cache.retain(|&k, _| k < start_off);
|
||||
|
||||
// Copy the written portion into the undo entry.
|
||||
{
|
||||
let mut undo = self.undo_stack.back_mut().unwrap().borrow_mut();
|
||||
|
|
@ -2636,6 +2682,14 @@ impl TextBuffer {
|
|||
let off = self.active_edit_off;
|
||||
let mut out_off = usize::MAX;
|
||||
|
||||
// Invalidate token cache entries starting at/after the line that contains
|
||||
// the deletion start offset (`off`). This prevents stale tokens from
|
||||
// being reused after deletion.
|
||||
let cursor_at_off = self.cursor_move_to_offset_internal(self.cursor, off);
|
||||
let start_cursor = self.goto_line_start(cursor_at_off, cursor_at_off.logical_pos.y);
|
||||
let start_off = start_cursor.offset;
|
||||
self.token_cache.retain(|&k, _| k < start_off);
|
||||
|
||||
let mut undo = self.undo_stack.back_mut().unwrap().borrow_mut();
|
||||
|
||||
// If this is a continued backspace operation,
|
||||
|
|
|
|||
|
|
@ -39,3 +39,4 @@ pub mod sys;
|
|||
pub mod tui;
|
||||
pub mod unicode;
|
||||
pub mod vt;
|
||||
pub mod syntax;
|
||||
|
|
|
|||
|
|
@ -0,0 +1,194 @@
|
|||
// Copyright (c) Microsoft Corporation.
|
||||
// Licensed under the MIT License.
|
||||
|
||||
use crate::framebuffer::IndexedColor;
|
||||
|
||||
/// A token kind for the display-level generic tokenizer.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum TokenKind {
    /// Line comment: `# …` or `// …` running to the end of the line.
    Comment,
    /// Quoted literal: `"…"` or `'…'`; a backslash escapes the next char.
    String,
    /// Numeric literal starting with an ASCII digit (`.` and `_` may continue it).
    Number,
    /// Word starting with a letter or `_`, continued by alphanumerics/`_`.
    Identifier,
    /// Any other single non-whitespace character.
    Punctuation,
    /// A run of whitespace characters.
    Whitespace,
    /// Not produced by `tokenize_display_line`; reserved for future use.
    Other,
}

/// A token within a display line measured in character columns (approximate).
///
/// NOTE(review): `start`/`end` count `char`s, not bytes. Slicing the line with
/// them (as the unit tests do) is only valid for pure-ASCII content — confirm
/// callers never byte-slice non-ASCII lines with these.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub struct Token {
    pub kind: TokenKind,
    /// Start column (inclusive) within the display line.
    pub start: usize,
    /// End column (exclusive) within the display line.
    pub end: usize,
}

/// Simple, fast, single-pass tokenizer that operates on the already-processed
/// display line (tabs expanded, control glyphs replaced). It intentionally
/// keeps things minimal and avoids allocations where possible.
///
/// Returns the tokens in line order; the token spans are contiguous and cover
/// every character of `line` (an empty line yields an empty vector).
pub fn tokenize_display_line(line: &str) -> Vec<Token> {
    let mut out = Vec::new();
    let mut chars = line.chars().peekable();
    let mut col = 0usize;

    while let Some(&ch) = chars.peek() {
        // Every token starts at the current column.
        let start = col;

        // Whitespace run.
        if ch.is_whitespace() {
            let mut len = 0usize;
            while chars.next_if(|c| c.is_whitespace()).is_some() {
                len += 1;
            }
            out.push(Token { kind: TokenKind::Whitespace, start, end: start + len });
            col += len;
            continue;
        }

        // `#` line comment: everything up to the end of the line.
        if ch == '#' {
            // `count()` via `by_ref()` consumes the rest of the iterator without
            // binding an unused loop variable.
            let len = chars.by_ref().count();
            out.push(Token { kind: TokenKind::Comment, start, end: start + len });
            break;
        }

        // `/`: either the start of a `//` line comment or a lone punctuation char.
        if ch == '/' {
            chars.next();
            if chars.next_if_eq(&'/').is_some() {
                // Two consumed slashes plus whatever remains on the line.
                let len = 2 + chars.by_ref().count();
                out.push(Token { kind: TokenKind::Comment, start, end: start + len });
                break;
            }
            // A single '/' is plain punctuation.
            out.push(Token { kind: TokenKind::Punctuation, start, end: start + 1 });
            col += 1;
            continue;
        }

        // Strings: "..." or '...'. A backslash escapes the following character;
        // an unterminated string simply runs to the end of the line.
        if ch == '"' || ch == '\'' {
            let quote = ch;
            chars.next();
            let mut len = 1usize; // the opening quote
            let mut escaped = false;
            for c in chars.by_ref() {
                len += 1;
                if escaped {
                    escaped = false;
                } else if c == '\\' {
                    escaped = true;
                } else if c == quote {
                    break;
                }
            }
            out.push(Token { kind: TokenKind::String, start, end: start + len });
            col += len;
            continue;
        }

        // Numbers: start with a digit; digits, '.' and '_' continue the token.
        if ch.is_ascii_digit() {
            let mut len = 0usize;
            while chars.next_if(|&c| c.is_ascii_digit() || c == '.' || c == '_').is_some() {
                len += 1;
            }
            out.push(Token { kind: TokenKind::Number, start, end: start + len });
            col += len;
            continue;
        }

        // Identifiers: start with a letter or underscore.
        if ch.is_alphabetic() || ch == '_' {
            let mut len = 0usize;
            while chars.next_if(|&c| c.is_alphanumeric() || c == '_').is_some() {
                len += 1;
            }
            out.push(Token { kind: TokenKind::Identifier, start, end: start + len });
            col += len;
            continue;
        }

        // Anything else: a single punctuation character.
        chars.next();
        out.push(Token { kind: TokenKind::Punctuation, start, end: start + 1 });
        col += 1;
    }

    out
}
|
||||
|
||||
/// Maps token kinds to an `IndexedColor` from the basic 8-color palette.
|
||||
pub fn token_kind_color(kind: TokenKind) -> IndexedColor {
|
||||
match kind {
|
||||
TokenKind::Comment => IndexedColor::Green,
|
||||
TokenKind::String => IndexedColor::Red,
|
||||
TokenKind::Number => IndexedColor::Magenta,
|
||||
TokenKind::Identifier => IndexedColor::Cyan,
|
||||
TokenKind::Punctuation => IndexedColor::Yellow,
|
||||
TokenKind::Whitespace => IndexedColor::White,
|
||||
TokenKind::Other => IndexedColor::White,
|
||||
}
|
||||
}
|
||||
|
||||
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn tokenize_basic_line() {
        let line = "let x = 42; // comment";
        let tokens = tokenize_display_line(line);

        // First token is the identifier "let"; the trailing `//` comment is last.
        assert_eq!(tokens.first().map(|t| t.kind), Some(TokenKind::Identifier));
        assert_eq!(tokens.last().map(|t| t.kind), Some(TokenKind::Comment));

        // "let" occupies columns [0, 3).
        assert_eq!((tokens[0].start, tokens[0].end), (0, 3));

        // The number token must cover exactly "42".
        let number = tokens.iter().find(|t| t.kind == TokenKind::Number).unwrap();
        assert_eq!(&line[number.start..number.end], "42");
    }

    #[test]
    fn tokenize_string_and_ident() {
        let line = "\"hello\" world";
        let tokens = tokenize_display_line(line);

        let kinds: Vec<TokenKind> = tokens.iter().map(|t| t.kind).collect();
        assert_eq!(kinds, [TokenKind::String, TokenKind::Whitespace, TokenKind::Identifier]);

        // Spans line up with the source text.
        assert_eq!(&line[tokens[0].start..tokens[0].end], "\"hello\"");
        assert_eq!(&line[tokens[2].start..tokens[2].end], "world");
    }

    #[test]
    fn tokenize_hash_comment() {
        let line = " #hi";
        let tokens = tokenize_display_line(line);

        // Leading whitespace token, then the `#` comment to end of line.
        assert_eq!(tokens[0].kind, TokenKind::Whitespace);
        assert_eq!(tokens[1].kind, TokenKind::Comment);
        assert_eq!(&line[tokens[1].start..tokens[1].end], "#hi");
    }
}
|
||||
Loading…
Reference in New Issue