Rename split methods to tokenize
This commit is contained in:
parent
b3ae45b118
commit
1f73e01ff1
3 changed files with 46 additions and 46 deletions
|
|
@ -62,24 +62,24 @@ impl DiffableStrRef for Vec<u8> {
|
|||
/// as long as they are ASCII compatible.
|
||||
pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
|
||||
/// Splits the value into lines with newlines attached.
|
||||
fn split_lines(&self) -> Vec<&Self>;
|
||||
fn tokenize_lines(&self) -> Vec<&Self>;
|
||||
|
||||
/// Splits the value into lines with the newlines as separate tokens.
|
||||
fn split_lines_and_newlines(&self) -> Vec<&Self>;
|
||||
fn tokenize_lines_and_newlines(&self) -> Vec<&Self>;
|
||||
|
||||
/// Tokenizes into words.
|
||||
fn split_words(&self) -> Vec<&Self>;
|
||||
fn tokenize_words(&self) -> Vec<&Self>;
|
||||
|
||||
/// Splits the input into characters.
|
||||
fn split_chars(&self) -> Vec<&Self>;
|
||||
fn tokenize_chars(&self) -> Vec<&Self>;
|
||||
|
||||
/// Splits into unicode words.
|
||||
#[cfg(feature = "unicode")]
|
||||
fn split_unicode_words(&self) -> Vec<&Self>;
|
||||
fn tokenize_unicode_words(&self) -> Vec<&Self>;
|
||||
|
||||
/// Splits into unicode graphemes.
|
||||
#[cfg(feature = "unicode")]
|
||||
fn split_graphemes(&self) -> Vec<&Self>;
|
||||
fn tokenize_graphemes(&self) -> Vec<&Self>;
|
||||
|
||||
/// Decodes the string (potentially) lossy.
|
||||
fn as_str(&self) -> Option<&str>;
|
||||
|
|
@ -106,7 +106,7 @@ pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
|
|||
}
|
||||
|
||||
impl DiffableStr for str {
|
||||
fn split_lines(&self) -> Vec<&Self> {
|
||||
fn tokenize_lines(&self) -> Vec<&Self> {
|
||||
let mut iter = self.char_indices().peekable();
|
||||
let mut last_pos = 0;
|
||||
let mut lines = vec![];
|
||||
|
|
@ -134,7 +134,7 @@ impl DiffableStr for str {
|
|||
lines
|
||||
}
|
||||
|
||||
fn split_lines_and_newlines(&self) -> Vec<&Self> {
|
||||
fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
|
||||
let mut rv = vec![];
|
||||
let mut iter = self.char_indices().peekable();
|
||||
|
||||
|
|
@ -155,7 +155,7 @@ impl DiffableStr for str {
|
|||
rv
|
||||
}
|
||||
|
||||
fn split_words(&self) -> Vec<&Self> {
|
||||
fn tokenize_words(&self) -> Vec<&Self> {
|
||||
let mut iter = self.char_indices().peekable();
|
||||
let mut rv = vec![];
|
||||
|
||||
|
|
@ -176,19 +176,19 @@ impl DiffableStr for str {
|
|||
rv
|
||||
}
|
||||
|
||||
fn split_chars(&self) -> Vec<&Self> {
|
||||
fn tokenize_chars(&self) -> Vec<&Self> {
|
||||
self.char_indices()
|
||||
.map(move |(i, c)| &self[i..i + c.len_utf8()])
|
||||
.collect()
|
||||
}
|
||||
|
||||
#[cfg(feature = "unicode")]
|
||||
fn split_unicode_words(&self) -> Vec<&Self> {
|
||||
fn tokenize_unicode_words(&self) -> Vec<&Self> {
|
||||
unicode_segmentation::UnicodeSegmentation::split_word_bounds(self).collect()
|
||||
}
|
||||
|
||||
#[cfg(feature = "unicode")]
|
||||
fn split_graphemes(&self) -> Vec<&Self> {
|
||||
fn tokenize_graphemes(&self) -> Vec<&Self> {
|
||||
unicode_segmentation::UnicodeSegmentation::graphemes(self, true).collect()
|
||||
}
|
||||
|
||||
|
|
@ -219,7 +219,7 @@ impl DiffableStr for str {
|
|||
|
||||
#[cfg(feature = "bytes")]
|
||||
impl DiffableStr for [u8] {
|
||||
fn split_lines(&self) -> Vec<&Self> {
|
||||
fn tokenize_lines(&self) -> Vec<&Self> {
|
||||
let mut iter = self.char_indices().peekable();
|
||||
let mut last_pos = 0;
|
||||
let mut lines = vec![];
|
||||
|
|
@ -247,7 +247,7 @@ impl DiffableStr for [u8] {
|
|||
lines
|
||||
}
|
||||
|
||||
fn split_lines_and_newlines(&self) -> Vec<&Self> {
|
||||
fn tokenize_lines_and_newlines(&self) -> Vec<&Self> {
|
||||
let mut rv = vec![];
|
||||
let mut iter = self.char_indices().peekable();
|
||||
|
||||
|
|
@ -266,7 +266,7 @@ impl DiffableStr for [u8] {
|
|||
rv
|
||||
}
|
||||
|
||||
fn split_words(&self) -> Vec<&Self> {
|
||||
fn tokenize_words(&self) -> Vec<&Self> {
|
||||
let mut iter = self.char_indices().peekable();
|
||||
let mut rv = vec![];
|
||||
|
||||
|
|
@ -286,16 +286,16 @@ impl DiffableStr for [u8] {
|
|||
}
|
||||
|
||||
#[cfg(feature = "unicode")]
|
||||
fn split_unicode_words(&self) -> Vec<&Self> {
|
||||
fn tokenize_unicode_words(&self) -> Vec<&Self> {
|
||||
self.words_with_breaks().map(|x| x.as_bytes()).collect()
|
||||
}
|
||||
|
||||
#[cfg(feature = "unicode")]
|
||||
fn split_graphemes(&self) -> Vec<&Self> {
|
||||
fn tokenize_graphemes(&self) -> Vec<&Self> {
|
||||
self.graphemes().map(|x| x.as_bytes()).collect()
|
||||
}
|
||||
|
||||
fn split_chars(&self) -> Vec<&Self> {
|
||||
fn tokenize_chars(&self) -> Vec<&Self> {
|
||||
self.char_indices()
|
||||
.map(move |(start, end, _)| &self[start..end])
|
||||
.collect()
|
||||
|
|
@ -329,18 +329,18 @@ impl DiffableStr for [u8] {
|
|||
#[test]
|
||||
fn test_split_lines() {
|
||||
assert_eq!(
|
||||
DiffableStr::split_lines("first\nsecond\rthird\r\nfourth\nlast"),
|
||||
DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast"),
|
||||
vec!["first\n", "second\r", "third\r\n", "fourth\n", "last"]
|
||||
);
|
||||
assert_eq!(DiffableStr::split_lines("\n\n"), vec!["\n", "\n"]);
|
||||
assert_eq!(DiffableStr::split_lines("\n"), vec!["\n"]);
|
||||
assert!(DiffableStr::split_lines("").is_empty());
|
||||
assert_eq!(DiffableStr::tokenize_lines("\n\n"), vec!["\n", "\n"]);
|
||||
assert_eq!(DiffableStr::tokenize_lines("\n"), vec!["\n"]);
|
||||
assert!(DiffableStr::tokenize_lines("").is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_split_words() {
|
||||
assert_eq!(
|
||||
DiffableStr::split_words("foo bar baz\n\n aha"),
|
||||
DiffableStr::tokenize_words("foo bar baz\n\n aha"),
|
||||
["foo", " ", "bar", " ", "baz", "\n\n ", "aha"]
|
||||
);
|
||||
}
|
||||
|
|
@ -348,7 +348,7 @@ fn test_split_words() {
|
|||
#[test]
|
||||
fn test_split_chars() {
|
||||
assert_eq!(
|
||||
DiffableStr::split_chars("abcfö❄️"),
|
||||
DiffableStr::tokenize_chars("abcfö❄️"),
|
||||
vec!["a", "b", "c", "f", "ö", "❄", "\u{fe0f}"]
|
||||
);
|
||||
}
|
||||
|
|
@ -357,7 +357,7 @@ fn test_split_chars() {
|
|||
#[cfg(feature = "unicode")]
|
||||
fn test_split_graphemes() {
|
||||
assert_eq!(
|
||||
DiffableStr::split_graphemes("abcfö❄️"),
|
||||
DiffableStr::tokenize_graphemes("abcfö❄️"),
|
||||
vec!["a", "b", "c", "f", "ö", "❄️"]
|
||||
);
|
||||
}
|
||||
|
|
@ -366,7 +366,7 @@ fn test_split_graphemes() {
|
|||
#[cfg(feature = "bytes")]
|
||||
fn test_split_lines_bytes() {
|
||||
assert_eq!(
|
||||
DiffableStr::split_lines("first\nsecond\rthird\r\nfourth\nlast".as_bytes()),
|
||||
DiffableStr::tokenize_lines("first\nsecond\rthird\r\nfourth\nlast".as_bytes()),
|
||||
vec![
|
||||
"first\n".as_bytes(),
|
||||
"second\r".as_bytes(),
|
||||
|
|
@ -376,21 +376,21 @@ fn test_split_lines_bytes() {
|
|||
]
|
||||
);
|
||||
assert_eq!(
|
||||
DiffableStr::split_lines("\n\n".as_bytes()),
|
||||
DiffableStr::tokenize_lines("\n\n".as_bytes()),
|
||||
vec!["\n".as_bytes(), "\n".as_bytes()]
|
||||
);
|
||||
assert_eq!(
|
||||
DiffableStr::split_lines("\n".as_bytes()),
|
||||
DiffableStr::tokenize_lines("\n".as_bytes()),
|
||||
vec!["\n".as_bytes()]
|
||||
);
|
||||
assert!(DiffableStr::split_lines("".as_bytes()).is_empty());
|
||||
assert!(DiffableStr::tokenize_lines("".as_bytes()).is_empty());
|
||||
}
|
||||
|
||||
#[test]
|
||||
#[cfg(feature = "bytes")]
|
||||
fn test_split_words_bytes() {
|
||||
assert_eq!(
|
||||
DiffableStr::split_words("foo bar baz\n\n aha".as_bytes()),
|
||||
DiffableStr::tokenize_words("foo bar baz\n\n aha".as_bytes()),
|
||||
[
|
||||
&b"foo"[..],
|
||||
&b" "[..],
|
||||
|
|
@ -407,7 +407,7 @@ fn test_split_words_bytes() {
|
|||
#[cfg(feature = "bytes")]
|
||||
fn test_split_chars_bytes() {
|
||||
assert_eq!(
|
||||
DiffableStr::split_chars("abcfö❄️".as_bytes()),
|
||||
DiffableStr::tokenize_chars("abcfö❄️".as_bytes()),
|
||||
vec![
|
||||
&b"a"[..],
|
||||
&b"b"[..],
|
||||
|
|
@ -424,7 +424,7 @@ fn test_split_chars_bytes() {
|
|||
#[cfg(all(feature = "bytes", feature = "unicode"))]
|
||||
fn test_split_graphemes_bytes() {
|
||||
assert_eq!(
|
||||
DiffableStr::split_graphemes("abcfö❄️".as_bytes()),
|
||||
DiffableStr::tokenize_graphemes("abcfö❄️".as_bytes()),
|
||||
vec![
|
||||
&b"a"[..],
|
||||
&b"b"[..],
|
||||
|
|
|
|||
|
|
@ -17,7 +17,7 @@ impl<'bufs, 's, T: DiffableStr + ?Sized> MultiLookup<'bufs, 's, T> {
|
|||
let mut seqs = Vec::new();
|
||||
for (string_idx, string) in strings.iter().enumerate() {
|
||||
let mut offset = 0;
|
||||
for word in string.split_unicode_words() {
|
||||
for word in string.tokenize_unicode_words() {
|
||||
seqs.push((word, string_idx, offset));
|
||||
offset += word.len();
|
||||
}
|
||||
|
|
@ -81,7 +81,7 @@ fn push_values<'s, T: DiffableStr + ?Sized>(
|
|||
// newlines cause all kinds of wacky stuff if they end up highlighted.
|
||||
// because of this we want to unemphasize all newlines we encounter.
|
||||
if emphasized {
|
||||
for seg in s.split_lines_and_newlines() {
|
||||
for seg in s.tokenize_lines_and_newlines() {
|
||||
v[idx].push((!seg.ends_with_newline(), seg));
|
||||
}
|
||||
} else {
|
||||
|
|
|
|||
|
|
@ -154,8 +154,8 @@ impl TextDiffConfig {
|
|||
new: &'new T,
|
||||
) -> TextDiff<'old, 'new, 'bufs, T::Output> {
|
||||
self.diff(
|
||||
Cow::Owned(old.as_diffable_str().split_lines()),
|
||||
Cow::Owned(new.as_diffable_str().split_lines()),
|
||||
Cow::Owned(old.as_diffable_str().tokenize_lines()),
|
||||
Cow::Owned(new.as_diffable_str().tokenize_lines()),
|
||||
true,
|
||||
)
|
||||
}
|
||||
|
|
@ -169,8 +169,8 @@ impl TextDiffConfig {
|
|||
new: &'new T,
|
||||
) -> TextDiff<'old, 'new, 'bufs, T::Output> {
|
||||
self.diff(
|
||||
Cow::Owned(old.as_diffable_str().split_words()),
|
||||
Cow::Owned(new.as_diffable_str().split_words()),
|
||||
Cow::Owned(old.as_diffable_str().tokenize_words()),
|
||||
Cow::Owned(new.as_diffable_str().tokenize_words()),
|
||||
false,
|
||||
)
|
||||
}
|
||||
|
|
@ -182,8 +182,8 @@ impl TextDiffConfig {
|
|||
new: &'new T,
|
||||
) -> TextDiff<'old, 'new, 'bufs, T::Output> {
|
||||
self.diff(
|
||||
Cow::Owned(old.as_diffable_str().split_chars()),
|
||||
Cow::Owned(new.as_diffable_str().split_chars()),
|
||||
Cow::Owned(old.as_diffable_str().tokenize_chars()),
|
||||
Cow::Owned(new.as_diffable_str().tokenize_chars()),
|
||||
false,
|
||||
)
|
||||
}
|
||||
|
|
@ -202,8 +202,8 @@ impl TextDiffConfig {
|
|||
new: &'new T,
|
||||
) -> TextDiff<'old, 'new, 'bufs, T::Output> {
|
||||
self.diff(
|
||||
Cow::Owned(old.as_diffable_str().split_unicode_words()),
|
||||
Cow::Owned(new.as_diffable_str().split_unicode_words()),
|
||||
Cow::Owned(old.as_diffable_str().tokenize_unicode_words()),
|
||||
Cow::Owned(new.as_diffable_str().tokenize_unicode_words()),
|
||||
false,
|
||||
)
|
||||
}
|
||||
|
|
@ -218,8 +218,8 @@ impl TextDiffConfig {
|
|||
new: &'new T,
|
||||
) -> TextDiff<'old, 'new, 'bufs, T::Output> {
|
||||
self.diff(
|
||||
Cow::Owned(old.as_diffable_str().split_graphemes()),
|
||||
Cow::Owned(new.as_diffable_str().split_graphemes()),
|
||||
Cow::Owned(old.as_diffable_str().tokenize_graphemes()),
|
||||
Cow::Owned(new.as_diffable_str().tokenize_graphemes()),
|
||||
false,
|
||||
)
|
||||
}
|
||||
|
|
@ -615,11 +615,11 @@ pub fn get_close_matches<'a, T: DiffableStr + ?Sized>(
|
|||
cutoff: f32,
|
||||
) -> Vec<&'a T> {
|
||||
let mut matches = BinaryHeap::new();
|
||||
let seq1 = word.split_chars();
|
||||
let seq1 = word.tokenize_chars();
|
||||
let quick_ratio = QuickSeqRatio::new(&seq1);
|
||||
|
||||
for &possibility in possibilities {
|
||||
let seq2 = possibility.split_chars();
|
||||
let seq2 = possibility.tokenize_chars();
|
||||
|
||||
if upper_seq_ratio(&seq1, &seq2) < cutoff || quick_ratio.calc(&seq2) < cutoff {
|
||||
continue;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue