Add LSP utf-16 coordinates to byte offset conversion

This commit is contained in:
konstin 2023-09-21 12:28:46 +02:00
parent dd8b1244fd
commit f39e8af8ae
1 changed files with 86 additions and 0 deletions

View File

@ -449,6 +449,92 @@ impl<'a> Locator<'a> {
}
}
/// Compute the byte offset from language server protocol zero-indexed row and utf-16 column
/// indices.
///
/// It's possible to negotiate the text encoding with the LSP client, but the default that must
/// always be supported and that we currently use is utf-16.
///
/// We get row and column from the LSP. E.g.
/// ```text
/// a=(1,2,)
/// b=(3,4,)
///   ^
/// c=(5,6,)
/// ```
/// has coordinates `1:2`. Note that columns are counted in utf-16 code units, not in bytes of
/// the utf-8 source and not in user-perceived characters, e.g. for
/// ```text
/// "안녕"
///    ^
/// ```
/// where the first syllable is a single precomposed character (one utf-16 code unit, i.e. two
/// bytes in utf-16), we get `0:2`, while for
/// ```text
/// "감기"
///    ^
/// ```
/// where the first syllable is stored decomposed as three jamo (`U+1100 U+1161 U+11B7`, three
/// code units of two bytes each), we get `0:4`. For
/// ```text
/// "豆腐"
///    ^
/// ```
/// we get `0:2` again: `豆` takes three bytes in utf-8, but it is a single utf-16 code unit.
/// Only characters outside the Basic Multilingual Plane (e.g. `U+1F600`) occupy two utf-16
/// code units (a surrogate pair).
///
/// # Returns
///
/// * `None` if `row` is not a line of the source.
/// * `None` if `column` points between the two halves of a surrogate pair, since that position
///   has no corresponding byte offset.
/// * Otherwise the byte offset of the position. A `column` that lies beyond the end of the line
///   is clamped to the end of the line (excluding the line break), which is the behavior the
///   LSP specification prescribes for over-long character offsets.
///
/// ```rust
/// # use ruff_text_size::{Ranged, TextLen, TextRange, TextSize};
/// # use ruff_source_file::Locator;
///
/// let source = "a=(1,2,)\nb=(3,4,)";
/// let locator = Locator::new(source);
/// let offset = locator.convert_row_and_column_utf16(1, 2).unwrap();
/// assert_eq!(&source[TextRange::new(offset, source.text_len())], "(3,4,)");
///
/// let source = "a=(1,2,)\n'안녕'";
/// let locator = Locator::new(source);
/// let offset = locator.convert_row_and_column_utf16(1, 2).unwrap();
/// assert_eq!(&source[TextRange::new(offset, source.text_len())], "녕'");
///
/// // Decomposed Hangul, spelled with escapes so that the example does not depend on the
/// // unicode normalization form this file happens to be saved in.
/// let source = "a=(1,2,)\n'\u{1100}\u{1161}\u{11B7}\u{1100}\u{1175}'";
/// let locator = Locator::new(source);
/// let offset = locator.convert_row_and_column_utf16(1, 4).unwrap();
/// assert_eq!(&source[TextRange::new(offset, source.text_len())], "\u{1100}\u{1175}'");
///
/// let source = "a=(1,2,)\n'豆腐'";
/// let locator = Locator::new(source);
/// let offset = locator.convert_row_and_column_utf16(1, 2).unwrap();
/// assert_eq!(&source[TextRange::new(offset, source.text_len())], "腐'");
///
/// // `U+1F600` is a surrogate pair in utf-16: column 2 would split it, column 3 is behind it.
/// let source = "'\u{1F600}'";
/// let locator = Locator::new(source);
/// assert!(locator.convert_row_and_column_utf16(0, 2).is_none());
/// let offset = locator.convert_row_and_column_utf16(0, 3).unwrap();
/// assert_eq!(&source[TextRange::new(offset, source.text_len())], "'");
///
/// // A column past the end of the line is clamped to the end of that line.
/// let source = "a=(1,2,)\nb=(3,4,)";
/// let locator = Locator::new(source);
/// let offset = locator.convert_row_and_column_utf16(0, 100).unwrap();
/// assert_eq!(&source[TextRange::new(offset, source.text_len())], "\nb=(3,4,)");
///
/// // A row that does not exist has no offset.
/// assert!(locator.convert_row_and_column_utf16(2, 0).is_none());
/// ```
pub fn convert_row_and_column_utf16(&self, row: usize, column: usize) -> Option<TextSize> {
    // Look the index up once; both line boundaries come from the same table.
    let line_starts = self.to_index().line_starts();
    let line_start = *line_starts.get(row)?;
    // The last line has no successor, it ends where the source ends.
    let next_line_start = line_starts
        .get(row + 1)
        .copied()
        .unwrap_or(self.contents.text_len());
    let line_contents = &self.contents[TextRange::from(line_start..next_line_start)];

    let mut offset_bytes = TextSize::default();
    let mut offset_utf16 = 0;
    for c in line_contents
        .chars()
        // Since the range goes to the next line start, `line_contents` contains the line
        // break, which must not be counted as part of the line
        .take_while(|&c| c != '\n' && c != '\r')
    {
        // This check must be first for the 0 column case
        if offset_utf16 >= column {
            break;
        }
        offset_bytes += c.text_len();
        offset_utf16 += c.len_utf16();
    }

    // We only overshoot when the last character consumed was a surrogate pair and `column`
    // pointed between its two halves; there is no byte offset for such a position.
    if offset_utf16 > column {
        return None;
    }

    // Either we hit `column` exactly, or the line ended first. In the latter case
    // `offset_bytes` is the length of the line without its line break, i.e. the column is
    // clamped to the end of the line as the LSP specification requires.
    Some(line_start + offset_bytes)
}
/// Take the source code between the given [`TextRange`].
#[inline]
pub fn slice<T: Ranged>(&self, ranged: T) -> &'a str {