parent
e14b26502b
commit
37587908de
3 changed files with 221 additions and 107 deletions
|
|
@ -43,5 +43,6 @@ fn main() {
|
||||||
|
|
||||||
- [Documentation](https://docs.rs/similar/)
|
- [Documentation](https://docs.rs/similar/)
|
||||||
- [Issue Tracker](https://github.com/mitsuhiko/similar/issues)
|
- [Issue Tracker](https://github.com/mitsuhiko/similar/issues)
|
||||||
|
- [Examples](https://github.com/mitsuhiko/similar/tree/main/examples)
|
||||||
- License: [Apache-2.0](https://github.com/mitsuhiko/similar/blob/main/LICENSE)
|
- License: [Apache-2.0](https://github.com/mitsuhiko/similar/blob/main/LICENSE)
|
||||||
|
|
||||||
|
|
|
||||||
|
|
@ -1,33 +1,108 @@
|
||||||
#![cfg(feature = "inline")]
|
#![cfg(feature = "inline")]
|
||||||
use std::{fmt, iter};
|
use std::fmt;
|
||||||
|
|
||||||
use crate::algorithms::{Algorithm, DiffOp, DiffTag};
|
use crate::algorithms::{capture_diff, Algorithm, DiffOp, DiffTag};
|
||||||
use crate::text::{Change, ChangeTag, TextDiff};
|
use crate::text::{Change, ChangeTag, TextDiff};
|
||||||
|
|
||||||
use super::split_unicode_words;
|
use super::{diff_ratio, split_unicode_words};
|
||||||
|
|
||||||
use std::ops::Range;
|
use std::ops::Index;
|
||||||
|
|
||||||
struct MultiIndex<'a, 's> {
|
struct MultiLookup<'bufs, 's> {
|
||||||
seq: &'a [&'s str],
|
strings: &'bufs [&'s str],
|
||||||
value: &'s str,
|
seqs: Vec<(&'s str, usize, usize)>,
|
||||||
}
|
}
|
||||||
|
|
||||||
impl<'a, 's> MultiIndex<'a, 's> {
|
impl<'bufs, 's> MultiLookup<'bufs, 's> {
|
||||||
pub fn new(seq: &'a [&'s str], value: &'s str) -> MultiIndex<'a, 's> {
|
fn new(strings: &'bufs [&'s str]) -> MultiLookup<'bufs, 's> {
|
||||||
MultiIndex { seq, value }
|
let mut seqs = Vec::new();
|
||||||
|
for (string_idx, string) in strings.iter().enumerate() {
|
||||||
|
let mut offset = 0;
|
||||||
|
for word in split_unicode_words(string) {
|
||||||
|
seqs.push((word, string_idx, offset));
|
||||||
|
offset += word.len();
|
||||||
|
}
|
||||||
|
}
|
||||||
|
MultiLookup { strings, seqs }
|
||||||
}
|
}
|
||||||
|
|
||||||
pub fn get_slice(&self, rng: Range<usize>) -> &'s str {
|
pub fn len(&self) -> usize {
|
||||||
let mut start = 0;
|
self.seqs.len()
|
||||||
for &sseq in &self.seq[..rng.start] {
|
}
|
||||||
start += sseq.len();
|
|
||||||
|
fn get_original_slices(&self, idx: usize, len: usize) -> Vec<(usize, &'s str)> {
|
||||||
|
let mut last = None;
|
||||||
|
let mut rv = Vec::new();
|
||||||
|
|
||||||
|
for offset in 0..len {
|
||||||
|
let (s, str_idx, char_idx) = self.seqs[idx + offset];
|
||||||
|
last = match last {
|
||||||
|
None => Some((str_idx, char_idx, s.len())),
|
||||||
|
Some((last_str_idx, start_char_idx, last_len)) => {
|
||||||
|
if last_str_idx == str_idx {
|
||||||
|
Some((str_idx, start_char_idx, last_len + s.len()))
|
||||||
|
} else {
|
||||||
|
rv.push((
|
||||||
|
last_str_idx,
|
||||||
|
&self.strings[last_str_idx][start_char_idx..start_char_idx + last_len],
|
||||||
|
));
|
||||||
|
Some((str_idx, char_idx, s.len()))
|
||||||
|
}
|
||||||
|
}
|
||||||
|
};
|
||||||
}
|
}
|
||||||
let mut end = start;
|
|
||||||
for &sseq in &self.seq[rng.start..rng.end] {
|
if let Some((str_idx, start_char_idx, len)) = last {
|
||||||
end += sseq.len();
|
rv.push((
|
||||||
|
str_idx,
|
||||||
|
&self.strings[str_idx][start_char_idx..start_char_idx + len],
|
||||||
|
));
|
||||||
}
|
}
|
||||||
&self.value[start..end]
|
|
||||||
|
rv
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
impl<'bufs, 's> Index<usize> for MultiLookup<'bufs, 's> {
|
||||||
|
type Output = str;
|
||||||
|
|
||||||
|
fn index(&self, index: usize) -> &Self::Output {
|
||||||
|
&self.seqs[index].0
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
fn partition_newlines(s: &str) -> impl Iterator<Item = (&str, bool)> {
|
||||||
|
let mut iter = s.char_indices().peekable();
|
||||||
|
|
||||||
|
std::iter::from_fn(move || {
|
||||||
|
if let Some((idx, c)) = iter.next() {
|
||||||
|
let is_newline = c == '\r' || c == '\n';
|
||||||
|
let start = idx;
|
||||||
|
let mut end = idx + c.len_utf8();
|
||||||
|
while let Some(&(_, next_char)) = iter.peek() {
|
||||||
|
if (next_char == '\r' || next_char == '\n') != is_newline {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
iter.next();
|
||||||
|
end += next_char.len_utf8();
|
||||||
|
}
|
||||||
|
Some((&s[start..end], is_newline))
|
||||||
|
} else {
|
||||||
|
None
|
||||||
|
}
|
||||||
|
})
|
||||||
|
}
|
||||||
|
|
||||||
|
fn push_values<'s>(v: &mut Vec<Vec<(bool, &'s str)>>, idx: usize, emphasized: bool, s: &'s str) {
|
||||||
|
v.resize_with(v.len().max(idx + 1), Vec::new);
|
||||||
|
// newlines cause all kinds of wacky stuff if they end up highlighted.
|
||||||
|
// because of this we want to unemphasize all newlines we encounter.
|
||||||
|
if emphasized {
|
||||||
|
for (seg, is_nl) in partition_newlines(s) {
|
||||||
|
v[idx].push((!is_nl, seg));
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
v[idx].push((false, s));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -104,87 +179,122 @@ pub(crate) fn iter_inline_changes<'diff>(
|
||||||
diff: &'diff TextDiff,
|
diff: &'diff TextDiff,
|
||||||
op: &DiffOp,
|
op: &DiffOp,
|
||||||
) -> impl Iterator<Item = InlineChange<'diff>> {
|
) -> impl Iterator<Item = InlineChange<'diff>> {
|
||||||
let mut change_iter = diff.iter_changes(op).peekable();
|
|
||||||
let mut skip_next = false;
|
|
||||||
let newline_terminated = diff.newline_terminated;
|
let newline_terminated = diff.newline_terminated;
|
||||||
|
let (tag, old_range, new_range) = op.as_tag_tuple();
|
||||||
|
|
||||||
iter::from_fn(move || {
|
if let DiffTag::Equal | DiffTag::Insert | DiffTag::Delete = tag {
|
||||||
if skip_next {
|
return Box::new(diff.iter_changes(op).map(|x| x.into())) as Box<dyn Iterator<Item = _>>;
|
||||||
change_iter.next();
|
}
|
||||||
skip_next = false;
|
|
||||||
}
|
|
||||||
if let Some(change) = change_iter.next() {
|
|
||||||
let next_change = change_iter.peek();
|
|
||||||
match (change.tag, next_change.map(|x| x.tag())) {
|
|
||||||
(ChangeTag::Delete, Some(ChangeTag::Insert)) => {
|
|
||||||
let old_value = change.value();
|
|
||||||
let new_value = next_change.unwrap().value();
|
|
||||||
let old_chars = split_unicode_words(&old_value).collect::<Vec<_>>();
|
|
||||||
let new_chars = split_unicode_words(&new_value).collect::<Vec<_>>();
|
|
||||||
let old_mindex = MultiIndex::new(&old_chars, old_value);
|
|
||||||
let new_mindex = MultiIndex::new(&new_chars, new_value);
|
|
||||||
let inline_diff = TextDiff::configure()
|
|
||||||
.algorithm(Algorithm::Patience)
|
|
||||||
.diff_slices(&old_chars, &new_chars);
|
|
||||||
|
|
||||||
if inline_diff.ratio() < 0.5 {
|
let mut old_index = old_range.start;
|
||||||
return Some(None.into_iter().chain(Some(change.into()).into_iter()));
|
let mut new_index = new_range.start;
|
||||||
}
|
let old_slices = &diff.old_slices()[old_range];
|
||||||
|
let new_slices = &diff.new_slices()[new_range];
|
||||||
|
let old_lookup = MultiLookup::new(old_slices);
|
||||||
|
let new_lookup = MultiLookup::new(new_slices);
|
||||||
|
|
||||||
// skip the next element as we handle it here
|
let ops = capture_diff(
|
||||||
skip_next = true;
|
Algorithm::Patience,
|
||||||
|
&old_lookup,
|
||||||
|
0..old_lookup.len(),
|
||||||
|
&new_lookup,
|
||||||
|
0..new_lookup.len(),
|
||||||
|
);
|
||||||
|
|
||||||
let mut old_values = vec![];
|
if diff_ratio(&ops, old_lookup.len(), new_lookup.len()) < 0.5 {
|
||||||
let mut new_values = vec![];
|
return Box::new(diff.iter_changes(op).map(|x| x.into())) as Box<dyn Iterator<Item = _>>;
|
||||||
for op in inline_diff.ops() {
|
}
|
||||||
match op.tag() {
|
|
||||||
DiffTag::Equal => {
|
|
||||||
old_values.push((false, old_mindex.get_slice(op.old_range())));
|
|
||||||
new_values.push((false, old_mindex.get_slice(op.old_range())));
|
|
||||||
}
|
|
||||||
DiffTag::Delete => {
|
|
||||||
old_values.push((true, old_mindex.get_slice(op.old_range())));
|
|
||||||
}
|
|
||||||
DiffTag::Insert => {
|
|
||||||
new_values.push((true, new_mindex.get_slice(op.new_range())));
|
|
||||||
}
|
|
||||||
DiffTag::Replace => {
|
|
||||||
old_values.push((true, old_mindex.get_slice(op.old_range())));
|
|
||||||
new_values.push((true, new_mindex.get_slice(op.new_range())));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
Some(
|
let mut old_values = Vec::<Vec<_>>::new();
|
||||||
Some(InlineChange {
|
let mut new_values = Vec::<Vec<_>>::new();
|
||||||
tag: ChangeTag::Delete,
|
|
||||||
old_index: change.old_index(),
|
for op in ops {
|
||||||
new_index: None,
|
match op {
|
||||||
values: old_values,
|
DiffOp::Equal {
|
||||||
missing_newline: newline_terminated
|
old_index,
|
||||||
&& !old_value.ends_with(&['\r', '\n'][..]),
|
len,
|
||||||
})
|
new_index,
|
||||||
.into_iter()
|
} => {
|
||||||
.chain(
|
for (idx, slice) in old_lookup.get_original_slices(old_index, len) {
|
||||||
Some(InlineChange {
|
push_values(&mut old_values, idx, false, slice);
|
||||||
tag: ChangeTag::Insert,
|
}
|
||||||
old_index: None,
|
for (idx, slice) in new_lookup.get_original_slices(new_index, len) {
|
||||||
new_index: next_change.unwrap().new_index(),
|
push_values(&mut new_values, idx, false, slice);
|
||||||
values: new_values,
|
}
|
||||||
missing_newline: newline_terminated
|
}
|
||||||
&& !new_value.ends_with(&['\r', '\n'][..]),
|
DiffOp::Delete {
|
||||||
})
|
old_index, old_len, ..
|
||||||
.into_iter(),
|
} => {
|
||||||
),
|
for (idx, slice) in old_lookup.get_original_slices(old_index, old_len) {
|
||||||
)
|
push_values(&mut old_values, idx, true, slice);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DiffOp::Insert {
|
||||||
|
new_index, new_len, ..
|
||||||
|
} => {
|
||||||
|
for (idx, slice) in new_lookup.get_original_slices(new_index, new_len) {
|
||||||
|
push_values(&mut new_values, idx, true, slice);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
DiffOp::Replace {
|
||||||
|
old_index,
|
||||||
|
old_len,
|
||||||
|
new_index,
|
||||||
|
new_len,
|
||||||
|
} => {
|
||||||
|
for (idx, slice) in old_lookup.get_original_slices(old_index, old_len) {
|
||||||
|
push_values(&mut old_values, idx, true, slice);
|
||||||
|
}
|
||||||
|
for (idx, slice) in new_lookup.get_original_slices(new_index, new_len) {
|
||||||
|
push_values(&mut new_values, idx, true, slice);
|
||||||
}
|
}
|
||||||
_ => Some(None.into_iter().chain(Some(change.into()).into_iter())),
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
None
|
|
||||||
}
|
}
|
||||||
})
|
}
|
||||||
.flatten()
|
|
||||||
|
let mut rv = Vec::new();
|
||||||
|
|
||||||
|
for values in old_values {
|
||||||
|
rv.push(InlineChange {
|
||||||
|
tag: ChangeTag::Delete,
|
||||||
|
old_index: Some(old_index),
|
||||||
|
new_index: None,
|
||||||
|
values,
|
||||||
|
missing_newline: false,
|
||||||
|
});
|
||||||
|
old_index += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if newline_terminated
|
||||||
|
&& !old_slices.is_empty()
|
||||||
|
&& !old_slices[old_slices.len() - 1].ends_with(&['\r', '\n'][..])
|
||||||
|
{
|
||||||
|
if let Some(last) = rv.last_mut() {
|
||||||
|
last.missing_newline = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
for values in new_values {
|
||||||
|
rv.push(InlineChange {
|
||||||
|
tag: ChangeTag::Insert,
|
||||||
|
old_index: None,
|
||||||
|
new_index: Some(new_index),
|
||||||
|
values,
|
||||||
|
missing_newline: false,
|
||||||
|
});
|
||||||
|
new_index += 1;
|
||||||
|
}
|
||||||
|
|
||||||
|
if newline_terminated
|
||||||
|
&& !new_slices.is_empty()
|
||||||
|
&& !new_slices[new_slices.len() - 1].ends_with(&['\r', '\n'][..])
|
||||||
|
{
|
||||||
|
if let Some(last) = rv.last_mut() {
|
||||||
|
last.missing_newline = true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
Box::new(rv.into_iter()) as Box<dyn Iterator<Item = _>>
|
||||||
}
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
|
|
|
||||||
|
|
@ -394,23 +394,7 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> {
|
||||||
/// assert_eq!(diff.ratio(), 0.75);
|
/// assert_eq!(diff.ratio(), 0.75);
|
||||||
/// ```
|
/// ```
|
||||||
pub fn ratio(&self) -> f32 {
|
pub fn ratio(&self) -> f32 {
|
||||||
let matches = self
|
diff_ratio(self.ops(), self.old.len(), self.new.len())
|
||||||
.ops()
|
|
||||||
.iter()
|
|
||||||
.map(|op| {
|
|
||||||
if let DiffOp::Equal { len, .. } = *op {
|
|
||||||
len
|
|
||||||
} else {
|
|
||||||
0
|
|
||||||
}
|
|
||||||
})
|
|
||||||
.sum::<usize>();
|
|
||||||
let len = self.old.len() + self.new.len();
|
|
||||||
if len == 0 {
|
|
||||||
1.0
|
|
||||||
} else {
|
|
||||||
2.0 * matches as f32 / len as f32
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Iterates over the changes the op expands to.
|
/// Iterates over the changes the op expands to.
|
||||||
|
|
@ -617,6 +601,25 @@ fn split_graphemes(s: &str) -> impl Iterator<Item = &str> {
|
||||||
unicode_segmentation::UnicodeSegmentation::graphemes(s, true)
|
unicode_segmentation::UnicodeSegmentation::graphemes(s, true)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
fn diff_ratio(ops: &[DiffOp], s1_len: usize, s2_len: usize) -> f32 {
|
||||||
|
let matches = ops
|
||||||
|
.iter()
|
||||||
|
.map(|op| {
|
||||||
|
if let DiffOp::Equal { len, .. } = *op {
|
||||||
|
len
|
||||||
|
} else {
|
||||||
|
0
|
||||||
|
}
|
||||||
|
})
|
||||||
|
.sum::<usize>();
|
||||||
|
let len = s1_len + s2_len;
|
||||||
|
if len == 0 {
|
||||||
|
1.0
|
||||||
|
} else {
|
||||||
|
2.0 * matches as f32 / len as f32
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// quick and dirty way to get an upper sequence ratio.
|
// quick and dirty way to get an upper sequence ratio.
|
||||||
fn upper_seq_ratio<T: PartialEq>(seq1: &[T], seq2: &[T]) -> f32 {
|
fn upper_seq_ratio<T: PartialEq>(seq1: &[T], seq2: &[T]) -> f32 {
|
||||||
let n = seq1.len() + seq2.len();
|
let n = seq1.len() + seq2.len();
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue