Rename split methods to tokenize

This commit is contained in:
Armin Ronacher 2021-02-02 21:56:55 +01:00
parent b3ae45b118
commit 1f73e01ff1
3 changed files with 46 additions and 46 deletions

View file

@ -62,24 +62,24 @@ impl DiffableStrRef for Vec<u8> {
/// as long as they are ASCII compatible.
pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
/// Splits the value into lines with the newlines attached.
fn split_lines(&self) -> Vec<&Self>;
fn tokenize_lines(&self) -> Vec<&Self>;
/// Splits the value into lines with the newlines separated.
fn split_lines_and_newlines(&self) -> Vec<&Self>;
fn tokenize_lines_and_newlines(&self) -> Vec<&Self>;
/// Tokenizes into words.
fn split_words(&self) -> Vec<&Self>;
fn tokenize_words(&self) -> Vec<&Self>;
/// Splits the input into characters.
fn split_chars(&self) -> Vec<&Self>;
fn tokenize_chars(&self) -> Vec<&Self>;
/// Splits into unicode words.
#[cfg(feature = "unicode")]
fn split_unicode_words(&self) -> Vec<&Self>;
fn tokenize_unicode_words(&self) -> Vec<&Self>;
/// Splits into unicode graphemes.
#[cfg(feature = "unicode")]
fn split_graphemes(&self) -> Vec<&Self>;
fn tokenize_graphemes(&self) -> Vec<&Self>;
/// Decodes the string (potentially) lossy.
fn as_str(&self) -> Option<&str>;
@ -106,7 +106,7 @@ pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
}
impl DiffableStr for str {
fn split_lines(&self) -> Vec<&Self> {
fn tokenize_lines(&self) -> Vec<&Self> {
let mut iter = self.char_indices().peekable();
let mut last_pos = 0;
let mut lines = vec![];
@ -134,7 +134,7 @@ impl DiffableStr for str {
lines
}
fn split_lines_and_newlines(&self) -> Vec<&Self> {
fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
let mut rv = vec![];
let mut iter = self.char_indices().peekable();
@ -155,7 +155,7 @@ impl DiffableStr for str {
rv
}
fn split_words(&self) -> Vec<&Self> {
fn tokenize_words(&self) -> Vec<&Self> {
let mut iter = self.char_indices().peekable();
let mut rv = vec![];
@ -176,19 +176,19 @@ impl DiffableStr for str {
rv
}
fn split_chars(&self) -> Vec<&Self> {
fn tokenize_chars(&self) -> Vec<&Self> {
self.char_indices()
.map(move |(i, c)| &self[i..i + c.len_utf8()])
.collect()
}
#[cfg(feature = "unicode")]
fn split_unicode_words(&self) -> Vec<&Self> {
fn tokenize_unicode_words(&self) -> Vec<&Self> {
unicode_segmentation::UnicodeSegmentation::split_word_bounds(self).collect()
}
#[cfg(feature = "unicode")]
fn split_graphemes(&self) -> Vec<&Self> {
fn tokenize_graphemes(&self) -> Vec<&Self> {
unicode_segmentation::UnicodeSegmentation::graphemes(self, true).collect()
}
@ -219,7 +219,7 @@ impl DiffableStr for str {
#[cfg(feature = "bytes")]
impl DiffableStr for [u8] {
fn split_lines(&self) -> Vec<&Self> {
fn tokenize_lines(&self) -> Vec<&Self> {
let mut iter = self.char_indices().peekable();
let mut last_pos = 0;
let mut lines = vec![];
@ -247,7 +247,7 @@ impl DiffableStr for [u8] {
lines
}
fn split_lines_and_newlines(&self) -> Vec<&Self> {
fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
let mut rv = vec![];
let mut iter = self.char_indices().peekable();
@ -266,7 +266,7 @@ impl DiffableStr for [u8] {
rv
}
fn split_words(&self) -> Vec<&Self> {
fn tokenize_words(&self) -> Vec<&Self> {
let mut iter = self.char_indices().peekable();
let mut rv = vec![];
@ -286,16 +286,16 @@ impl DiffableStr for [u8] {
}
#[cfg(feature = "unicode")]
fn split_unicode_words(&self) -> Vec<&Self> {
fn tokenize_unicode_words(&self) -> Vec<&Self> {
self.words_with_breaks().map(|x| x.as_bytes()).collect()
}
#[cfg(feature = "unicode")]
fn split_graphemes(&self) -> Vec<&Self> {
fn tokenize_graphemes(&self) -> Vec<&Self> {
self.graphemes().map(|x| x.as_bytes()).collect()
}
fn split_chars(&self) -> Vec<&Self> {
fn tokenize_chars(&self) -> Vec<&Self> {
self.char_indices()
.map(move |(start, end, _)| &self[start..end])
.collect()
@ -329,18 +329,18 @@ impl DiffableStr for [u8] {
#[test]
fn test_split_lines() {
assert_eq!(
DiffableStr::split_lines("first\nsecond\rthird\r\nfourth\nlast"),
DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast"),
vec!["first\n", "second\r", "third\r\n", "fourth\n", "last"]
);
assert_eq!(DiffableStr::split_lines("\n\n"), vec!["\n", "\n"]);
assert_eq!(DiffableStr::split_lines("\n"), vec!["\n"]);
assert!(DiffableStr::split_lines("").is_empty());
assert_eq!(DiffableStr::tokenize_lines("\n\n"), vec!["\n", "\n"]);
assert_eq!(DiffableStr::tokenize_lines("\n"), vec!["\n"]);
assert!(DiffableStr::tokenize_lines("").is_empty());
}
#[test]
fn test_split_words() {
assert_eq!(
DiffableStr::split_words("foo bar baz\n\n aha"),
DiffableStr::tokenize_words("foo bar baz\n\n aha"),
["foo", " ", "bar", " ", "baz", "\n\n ", "aha"]
);
}
@ -348,7 +348,7 @@ fn test_split_words() {
#[test]
fn test_split_chars() {
assert_eq!(
DiffableStr::split_chars("abcfö❄"),
DiffableStr::tokenize_chars("abcfö❄"),
vec!["a", "b", "c", "f", "ö", "❄", "\u{fe0f}"]
);
}
@ -357,7 +357,7 @@ fn test_split_chars() {
#[cfg(feature = "unicode")]
fn test_split_graphemes() {
assert_eq!(
DiffableStr::split_graphemes("abcfö❄"),
DiffableStr::tokenize_graphemes("abcfö❄"),
vec!["a", "b", "c", "f", "ö", "❄️"]
);
}
@ -366,7 +366,7 @@ fn test_split_graphemes() {
#[cfg(feature = "bytes")]
fn test_split_lines_bytes() {
assert_eq!(
DiffableStr::split_lines("first\nsecond\rthird\r\nfourth\nlast".as_bytes()),
DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast".as_bytes()),
vec![
"first\n".as_bytes(),
"second\r".as_bytes(),
@ -376,21 +376,21 @@ fn test_split_lines_bytes() {
]
);
assert_eq!(
DiffableStr::split_lines("\n\n".as_bytes()),
DiffableStr::tokenize_lines("\n\n".as_bytes()),
vec!["\n".as_bytes(), "\n".as_bytes()]
);
assert_eq!(
DiffableStr::split_lines("\n".as_bytes()),
DiffableStr::tokenize_lines("\n".as_bytes()),
vec!["\n".as_bytes()]
);
assert!(DiffableStr::split_lines("".as_bytes()).is_empty());
assert!(DiffableStr::tokenize_lines("".as_bytes()).is_empty());
}
#[test]
#[cfg(feature = "bytes")]
fn test_split_words_bytes() {
assert_eq!(
DiffableStr::split_words("foo bar baz\n\n aha".as_bytes()),
DiffableStr::tokenize_words("foo bar baz\n\n aha".as_bytes()),
[
&b"foo"[..],
&b" "[..],
@ -407,7 +407,7 @@ fn test_split_words_bytes() {
#[cfg(feature = "bytes")]
fn test_split_chars_bytes() {
assert_eq!(
DiffableStr::split_chars("abcfö❄".as_bytes()),
DiffableStr::tokenize_chars("abcfö❄".as_bytes()),
vec![
&b"a"[..],
&b"b"[..],
@ -424,7 +424,7 @@ fn test_split_chars_bytes() {
#[cfg(all(feature = "bytes", feature = "unicode"))]
fn test_split_graphemes_bytes() {
assert_eq!(
DiffableStr::split_graphemes("abcfö❄".as_bytes()),
DiffableStr::tokenize_graphemes("abcfö❄".as_bytes()),
vec![
&b"a"[..],
&b"b"[..],

View file

@ -17,7 +17,7 @@ impl<'bufs, 's, T: DiffableStr + ?Sized> MultiLookup<'bufs, 's, T> {
let mut seqs = Vec::new();
for (string_idx, string) in strings.iter().enumerate() {
let mut offset = 0;
for word in string.split_unicode_words() {
for word in string.tokenize_unicode_words() {
seqs.push((word, string_idx, offset));
offset += word.len();
}
@ -81,7 +81,7 @@ fn push_values<'s, T: DiffableStr + ?Sized>(
// newlines cause all kinds of wacky stuff if they end up highlighted.
// because of this we want to unemphasize all newlines we encounter.
if emphasized {
for seg in s.split_lines_and_newlines() {
for seg in s.tokenize_lines_and_newlines() {
v[idx].push((!seg.ends_with_newline(), seg));
}
} else {

View file

@ -154,8 +154,8 @@ impl TextDiffConfig {
new: &'new T,
) -> TextDiff<'old, 'new, 'bufs, T::Output> {
self.diff(
Cow::Owned(old.as_diffable_str().split_lines()),
Cow::Owned(new.as_diffable_str().split_lines()),
Cow::Owned(old.as_diffable_str().tokenize_lines()),
Cow::Owned(new.as_diffable_str().tokenize_lines()),
true,
)
}
@ -169,8 +169,8 @@ impl TextDiffConfig {
new: &'new T,
) -> TextDiff<'old, 'new, 'bufs, T::Output> {
self.diff(
Cow::Owned(old.as_diffable_str().split_words()),
Cow::Owned(new.as_diffable_str().split_words()),
Cow::Owned(old.as_diffable_str().tokenize_words()),
Cow::Owned(new.as_diffable_str().tokenize_words()),
false,
)
}
@ -182,8 +182,8 @@ impl TextDiffConfig {
new: &'new T,
) -> TextDiff<'old, 'new, 'bufs, T::Output> {
self.diff(
Cow::Owned(old.as_diffable_str().split_chars()),
Cow::Owned(new.as_diffable_str().split_chars()),
Cow::Owned(old.as_diffable_str().tokenize_chars()),
Cow::Owned(new.as_diffable_str().tokenize_chars()),
false,
)
}
@ -202,8 +202,8 @@ impl TextDiffConfig {
new: &'new T,
) -> TextDiff<'old, 'new, 'bufs, T::Output> {
self.diff(
Cow::Owned(old.as_diffable_str().split_unicode_words()),
Cow::Owned(new.as_diffable_str().split_unicode_words()),
Cow::Owned(old.as_diffable_str().tokenize_unicode_words()),
Cow::Owned(new.as_diffable_str().tokenize_unicode_words()),
false,
)
}
@ -218,8 +218,8 @@ impl TextDiffConfig {
new: &'new T,
) -> TextDiff<'old, 'new, 'bufs, T::Output> {
self.diff(
Cow::Owned(old.as_diffable_str().split_graphemes()),
Cow::Owned(new.as_diffable_str().split_graphemes()),
Cow::Owned(old.as_diffable_str().tokenize_graphemes()),
Cow::Owned(new.as_diffable_str().tokenize_graphemes()),
false,
)
}
@ -615,11 +615,11 @@ pub fn get_close_matches<'a, T: DiffableStr + ?Sized>(
cutoff: f32,
) -> Vec<&'a T> {
let mut matches = BinaryHeap::new();
let seq1 = word.split_chars();
let seq1 = word.tokenize_chars();
let quick_ratio = QuickSeqRatio::new(&seq1);
for &possibility in possibilities {
let seq2 = possibility.split_chars();
let seq2 = possibility.tokenize_chars();
if upper_seq_ratio(&seq1, &seq2) < cutoff || quick_ratio.calc(&seq2) < cutoff {
continue;