Add proper multi-sequence inline highlighting

Fixes #5
This commit is contained in:
Armin Ronacher 2021-02-01 01:37:59 +01:00
parent e14b26502b
commit 37587908de
3 changed files with 221 additions and 107 deletions

View file

@ -43,5 +43,6 @@ fn main() {
- [Documentation](https://docs.rs/similar/) - [Documentation](https://docs.rs/similar/)
- [Issue Tracker](https://github.com/mitsuhiko/similar/issues) - [Issue Tracker](https://github.com/mitsuhiko/similar/issues)
- [Examples](https://github.com/mitsuhiko/similar/tree/main/examples)
- License: [Apache-2.0](https://github.com/mitsuhiko/similar/blob/main/LICENSE) - License: [Apache-2.0](https://github.com/mitsuhiko/similar/blob/main/LICENSE)

View file

@ -1,33 +1,108 @@
#![cfg(feature = "inline")] #![cfg(feature = "inline")]
use std::{fmt, iter}; use std::fmt;
use crate::algorithms::{Algorithm, DiffOp, DiffTag}; use crate::algorithms::{capture_diff, Algorithm, DiffOp, DiffTag};
use crate::text::{Change, ChangeTag, TextDiff}; use crate::text::{Change, ChangeTag, TextDiff};
use super::split_unicode_words; use super::{diff_ratio, split_unicode_words};
use std::ops::Range; use std::ops::Index;
struct MultiIndex<'a, 's> { struct MultiLookup<'bufs, 's> {
seq: &'a [&'s str], strings: &'bufs [&'s str],
value: &'s str, seqs: Vec<(&'s str, usize, usize)>,
} }
impl<'a, 's> MultiIndex<'a, 's> { impl<'bufs, 's> MultiLookup<'bufs, 's> {
pub fn new(seq: &'a [&'s str], value: &'s str) -> MultiIndex<'a, 's> { fn new(strings: &'bufs [&'s str]) -> MultiLookup<'bufs, 's> {
MultiIndex { seq, value } let mut seqs = Vec::new();
for (string_idx, string) in strings.iter().enumerate() {
let mut offset = 0;
for word in split_unicode_words(string) {
seqs.push((word, string_idx, offset));
offset += word.len();
}
}
MultiLookup { strings, seqs }
} }
pub fn get_slice(&self, rng: Range<usize>) -> &'s str { pub fn len(&self) -> usize {
let mut start = 0; self.seqs.len()
for &sseq in &self.seq[..rng.start] { }
start += sseq.len();
fn get_original_slices(&self, idx: usize, len: usize) -> Vec<(usize, &'s str)> {
let mut last = None;
let mut rv = Vec::new();
for offset in 0..len {
let (s, str_idx, char_idx) = self.seqs[idx + offset];
last = match last {
None => Some((str_idx, char_idx, s.len())),
Some((last_str_idx, start_char_idx, last_len)) => {
if last_str_idx == str_idx {
Some((str_idx, start_char_idx, last_len + s.len()))
} else {
rv.push((
last_str_idx,
&self.strings[last_str_idx][start_char_idx..start_char_idx + last_len],
));
Some((str_idx, char_idx, s.len()))
}
}
};
} }
let mut end = start;
for &sseq in &self.seq[rng.start..rng.end] { if let Some((str_idx, start_char_idx, len)) = last {
end += sseq.len(); rv.push((
str_idx,
&self.strings[str_idx][start_char_idx..start_char_idx + len],
));
} }
&self.value[start..end]
rv
}
}
/// Allows a `MultiLookup` to be indexed by word position, so that it can be
/// handed to the diffing algorithms as a flat sequence of words.
impl<'bufs, 's> Index<usize> for MultiLookup<'bufs, 's> {
    type Output = str;

    /// Returns the word stored at position `index` across all source strings.
    ///
    /// Panics if `index` is out of bounds of the internal word list.
    fn index(&self, index: usize) -> &Self::Output {
        // every entry is `(word, string index, byte offset)`; only the word
        // itself takes part in comparisons.
        let (word, _, _) = self.seqs[index];
        word
    }
}
/// Splits `s` into alternating runs of newline characters (`\r` or `\n`) and
/// runs of all other characters.
///
/// Every yielded item is `(segment, is_newline)`.  The segments are non-empty,
/// come out in order and together cover the entire input; an empty input
/// yields nothing.
fn partition_newlines(s: &str) -> impl Iterator<Item = (&str, bool)> {
    let mut chars = s.char_indices().peekable();

    std::iter::from_fn(move || {
        // the first character of a run decides which class the run belongs to.
        let (start, first) = chars.next()?;
        let run_is_newline = matches!(first, '\r' | '\n');
        let mut end = start + first.len_utf8();

        // keep consuming for as long as the upcoming character is of the same
        // class; `end` is tracked in bytes so the final slice stays on a
        // character boundary.
        loop {
            match chars.peek() {
                Some(&(_, ch)) if matches!(ch, '\r' | '\n') == run_is_newline => {
                    end += ch.len_utf8();
                    chars.next();
                }
                _ => break,
            }
        }

        Some((&s[start..end], run_is_newline))
    })
}
fn push_values<'s>(v: &mut Vec<Vec<(bool, &'s str)>>, idx: usize, emphasized: bool, s: &'s str) {
v.resize_with(v.len().max(idx + 1), Vec::new);
// newlines cause all kinds of wacky stuff if they end up highlighted.
// because of this we want to unemphasize all newlines we encounter.
if emphasized {
for (seg, is_nl) in partition_newlines(s) {
v[idx].push((!is_nl, seg));
}
} else {
v[idx].push((false, s));
} }
} }
@ -104,87 +179,122 @@ pub(crate) fn iter_inline_changes<'diff>(
diff: &'diff TextDiff, diff: &'diff TextDiff,
op: &DiffOp, op: &DiffOp,
) -> impl Iterator<Item = InlineChange<'diff>> { ) -> impl Iterator<Item = InlineChange<'diff>> {
let mut change_iter = diff.iter_changes(op).peekable();
let mut skip_next = false;
let newline_terminated = diff.newline_terminated; let newline_terminated = diff.newline_terminated;
let (tag, old_range, new_range) = op.as_tag_tuple();
iter::from_fn(move || { if let DiffTag::Equal | DiffTag::Insert | DiffTag::Delete = tag {
if skip_next { return Box::new(diff.iter_changes(op).map(|x| x.into())) as Box<dyn Iterator<Item = _>>;
change_iter.next(); }
skip_next = false;
}
if let Some(change) = change_iter.next() {
let next_change = change_iter.peek();
match (change.tag, next_change.map(|x| x.tag())) {
(ChangeTag::Delete, Some(ChangeTag::Insert)) => {
let old_value = change.value();
let new_value = next_change.unwrap().value();
let old_chars = split_unicode_words(&old_value).collect::<Vec<_>>();
let new_chars = split_unicode_words(&new_value).collect::<Vec<_>>();
let old_mindex = MultiIndex::new(&old_chars, old_value);
let new_mindex = MultiIndex::new(&new_chars, new_value);
let inline_diff = TextDiff::configure()
.algorithm(Algorithm::Patience)
.diff_slices(&old_chars, &new_chars);
if inline_diff.ratio() < 0.5 { let mut old_index = old_range.start;
return Some(None.into_iter().chain(Some(change.into()).into_iter())); let mut new_index = new_range.start;
} let old_slices = &diff.old_slices()[old_range];
let new_slices = &diff.new_slices()[new_range];
let old_lookup = MultiLookup::new(old_slices);
let new_lookup = MultiLookup::new(new_slices);
// skip the next element as we handle it here let ops = capture_diff(
skip_next = true; Algorithm::Patience,
&old_lookup,
0..old_lookup.len(),
&new_lookup,
0..new_lookup.len(),
);
let mut old_values = vec![]; if diff_ratio(&ops, old_lookup.len(), new_lookup.len()) < 0.5 {
let mut new_values = vec![]; return Box::new(diff.iter_changes(op).map(|x| x.into())) as Box<dyn Iterator<Item = _>>;
for op in inline_diff.ops() { }
match op.tag() {
DiffTag::Equal => {
old_values.push((false, old_mindex.get_slice(op.old_range())));
new_values.push((false, old_mindex.get_slice(op.old_range())));
}
DiffTag::Delete => {
old_values.push((true, old_mindex.get_slice(op.old_range())));
}
DiffTag::Insert => {
new_values.push((true, new_mindex.get_slice(op.new_range())));
}
DiffTag::Replace => {
old_values.push((true, old_mindex.get_slice(op.old_range())));
new_values.push((true, new_mindex.get_slice(op.new_range())));
}
}
}
Some( let mut old_values = Vec::<Vec<_>>::new();
Some(InlineChange { let mut new_values = Vec::<Vec<_>>::new();
tag: ChangeTag::Delete,
old_index: change.old_index(), for op in ops {
new_index: None, match op {
values: old_values, DiffOp::Equal {
missing_newline: newline_terminated old_index,
&& !old_value.ends_with(&['\r', '\n'][..]), len,
}) new_index,
.into_iter() } => {
.chain( for (idx, slice) in old_lookup.get_original_slices(old_index, len) {
Some(InlineChange { push_values(&mut old_values, idx, false, slice);
tag: ChangeTag::Insert, }
old_index: None, for (idx, slice) in new_lookup.get_original_slices(new_index, len) {
new_index: next_change.unwrap().new_index(), push_values(&mut new_values, idx, false, slice);
values: new_values, }
missing_newline: newline_terminated }
&& !new_value.ends_with(&['\r', '\n'][..]), DiffOp::Delete {
}) old_index, old_len, ..
.into_iter(), } => {
), for (idx, slice) in old_lookup.get_original_slices(old_index, old_len) {
) push_values(&mut old_values, idx, true, slice);
}
}
DiffOp::Insert {
new_index, new_len, ..
} => {
for (idx, slice) in new_lookup.get_original_slices(new_index, new_len) {
push_values(&mut new_values, idx, true, slice);
}
}
DiffOp::Replace {
old_index,
old_len,
new_index,
new_len,
} => {
for (idx, slice) in old_lookup.get_original_slices(old_index, old_len) {
push_values(&mut old_values, idx, true, slice);
}
for (idx, slice) in new_lookup.get_original_slices(new_index, new_len) {
push_values(&mut new_values, idx, true, slice);
} }
_ => Some(None.into_iter().chain(Some(change.into()).into_iter())),
} }
} else {
None
} }
}) }
.flatten()
let mut rv = Vec::new();
for values in old_values {
rv.push(InlineChange {
tag: ChangeTag::Delete,
old_index: Some(old_index),
new_index: None,
values,
missing_newline: false,
});
old_index += 1;
}
if newline_terminated
&& !old_slices.is_empty()
&& !old_slices[old_slices.len() - 1].ends_with(&['\r', '\n'][..])
{
if let Some(last) = rv.last_mut() {
last.missing_newline = true;
}
}
for values in new_values {
rv.push(InlineChange {
tag: ChangeTag::Insert,
old_index: None,
new_index: Some(new_index),
values,
missing_newline: false,
});
new_index += 1;
}
if newline_terminated
&& !new_slices.is_empty()
&& !new_slices[new_slices.len() - 1].ends_with(&['\r', '\n'][..])
{
if let Some(last) = rv.last_mut() {
last.missing_newline = true;
}
}
Box::new(rv.into_iter()) as Box<dyn Iterator<Item = _>>
} }
#[test] #[test]

View file

@ -394,23 +394,7 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> {
/// assert_eq!(diff.ratio(), 0.75); /// assert_eq!(diff.ratio(), 0.75);
/// ``` /// ```
pub fn ratio(&self) -> f32 { pub fn ratio(&self) -> f32 {
let matches = self diff_ratio(self.ops(), self.old.len(), self.new.len())
.ops()
.iter()
.map(|op| {
if let DiffOp::Equal { len, .. } = *op {
len
} else {
0
}
})
.sum::<usize>();
let len = self.old.len() + self.new.len();
if len == 0 {
1.0
} else {
2.0 * matches as f32 / len as f32
}
} }
/// Iterates over the changes the op expands to. /// Iterates over the changes the op expands to.
@ -617,6 +601,25 @@ fn split_graphemes(s: &str) -> impl Iterator<Item = &str> {
unicode_segmentation::UnicodeSegmentation::graphemes(s, true) unicode_segmentation::UnicodeSegmentation::graphemes(s, true)
} }
/// Computes a similarity ratio from a list of diff operations.
///
/// `s1_len` and `s2_len` are the lengths of the two compared sequences.  The
/// result is `2 * matches / (s1_len + s2_len)`, where `matches` is the summed
/// length of all `DiffOp::Equal` operations.  When both lengths are zero the
/// ratio is defined as `1.0`.
fn diff_ratio(ops: &[DiffOp], s1_len: usize, s2_len: usize) -> f32 {
    let total = s1_len + s2_len;
    if total == 0 {
        // two empty sequences are considered identical.
        return 1.0;
    }

    // only equal runs count as matches; every other op contributes nothing.
    let mut matched = 0usize;
    for op in ops {
        if let DiffOp::Equal { len, .. } = *op {
            matched += len;
        }
    }

    2.0 * matched as f32 / total as f32
}
// quick and dirty way to get an upper sequence ratio. // quick and dirty way to get an upper sequence ratio.
fn upper_seq_ratio<T: PartialEq>(seq1: &[T], seq2: &[T]) -> f32 { fn upper_seq_ratio<T: PartialEq>(seq1: &[T], seq2: &[T]) -> f32 {
let n = seq1.len() + seq2.len(); let n = seq1.len() + seq2.len();