Add compaction support (#22)

This commit is contained in:
Armin Ronacher 2021-02-25 22:13:43 +01:00 committed by GitHub
parent ddb73d8351
commit 7e628d78d8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 712 additions and 19 deletions

355
src/algorithms/compact.rs Normal file
View file

@ -0,0 +1,355 @@
//! Implements basic compacting. This is based on the compaction logic from
//! diffy by Brandon Williams.
use std::ops::Index;
use crate::{DiffOp, DiffTag};
use super::utils::{common_prefix_len, common_suffix_len};
use super::DiffHook;
/// Performs semantic cleanup operations on a diff.
///
/// This merges similar ops together but also tries to move hunks up and
/// down the diff with the desire to connect as many hunks as possible.
/// It still needs to be combined with [`Replace`](crate::algorithms::Replace)
/// to get actual replace diff ops out.
///
/// The hook buffers every `equal`, `delete` and `insert` op it receives.
/// Only when `finish` is called are the buffered ops cleaned up and
/// replayed into the wrapped hook.
#[derive(Debug)]
pub struct Compact<'old, 'new, Old: ?Sized, New: ?Sized, D> {
// The wrapped hook that receives the cleaned-up ops on `finish`.
d: D,
// Ops collected so far; rewritten in place by the cleanup pass.
ops: Vec<DiffOp>,
// The old sequence, handed to the cleanup pass for item comparisons.
old: &'old Old,
// The new sequence, handed to the cleanup pass for item comparisons.
new: &'new New,
}
impl<'old, 'new, Old, New, D> Compact<'old, 'new, Old, New, D>
where
    D: DiffHook,
    Old: Index<usize> + ?Sized + 'old,
    New: Index<usize> + ?Sized + 'new,
    New::Output: PartialEq<Old::Output>,
{
    /// Builds a compacting hook around `d`.
    ///
    /// `old` and `new` are the sequences being diffed; they are kept so the
    /// cleanup pass can compare items. The op buffer starts out empty.
    pub fn new(d: D, old: &'old Old, new: &'new New) -> Self {
        let ops = Vec::new();
        Self { d, ops, old, new }
    }

    /// Consumes the compacting hook and hands back the wrapped hook.
    pub fn into_inner(self) -> D {
        let Self { d, .. } = self;
        d
    }
}
impl<'old, 'new, Old: ?Sized, New: ?Sized, D: DiffHook> AsRef<D>
for Compact<'old, 'new, Old, New, D>
{
fn as_ref(&self) -> &D {
&self.d
}
}
impl<'old, 'new, Old: ?Sized, New: ?Sized, D: DiffHook> AsMut<D>
for Compact<'old, 'new, Old, New, D>
{
fn as_mut(&mut self) -> &mut D {
&mut self.d
}
}
impl<'old, 'new, Old, New, D> DiffHook for Compact<'old, 'new, Old, New, D>
where
    D: DiffHook,
    Old: Index<usize> + ?Sized + 'old,
    New: Index<usize> + ?Sized + 'new,
    New::Output: PartialEq<Old::Output>,
{
    type Error = D::Error;

    /// Records an equal op; nothing is forwarded before `finish`.
    #[inline(always)]
    fn equal(&mut self, old_index: usize, new_index: usize, len: usize) -> Result<(), Self::Error> {
        let op = DiffOp::Equal {
            old_index,
            new_index,
            len,
        };
        self.ops.push(op);
        Ok(())
    }

    /// Records a delete op; nothing is forwarded before `finish`.
    #[inline(always)]
    fn delete(
        &mut self,
        old_index: usize,
        old_len: usize,
        new_index: usize,
    ) -> Result<(), Self::Error> {
        let op = DiffOp::Delete {
            old_index,
            old_len,
            new_index,
        };
        self.ops.push(op);
        Ok(())
    }

    /// Records an insert op; nothing is forwarded before `finish`.
    #[inline(always)]
    fn insert(
        &mut self,
        old_index: usize,
        new_index: usize,
        new_len: usize,
    ) -> Result<(), Self::Error> {
        let op = DiffOp::Insert {
            old_index,
            new_index,
            new_len,
        };
        self.ops.push(op);
        Ok(())
    }

    /// Cleans up the recorded ops, replays them into the wrapped hook in
    /// order and finally finishes the wrapped hook.
    ///
    /// The first error reported by the wrapped hook aborts the replay and is
    /// returned.
    fn finish(&mut self) -> Result<(), Self::Error> {
        cleanup_diff_ops(self.old, self.new, &mut self.ops);
        let inner = &mut self.d;
        for op in self.ops.iter() {
            op.apply_to_hook(&mut *inner)?;
        }
        inner.finish()
    }
}
// Visits every edit and pushes it first up and then down the op list, so
// that edits which run into a similar edit on the way can be merged.
//
// Two passes are made over `ops`: one that only moves deletions, followed by
// one that only moves insertions.
pub fn cleanup_diff_ops<Old, New>(old: &Old, new: &New, ops: &mut Vec<DiffOp>)
where
    Old: Index<usize> + ?Sized,
    New: Index<usize> + ?Sized,
    New::Output: PartialEq<Old::Output>,
{
    // Pass 1: deletions. The shift helpers may add or remove ops, so the
    // length is re-read on every iteration and the cursor is taken from them.
    let mut idx = 0;
    while idx < ops.len() {
        if let DiffTag::Delete = ops[idx].tag() {
            idx = shift_diff_ops_up(ops, old, new, idx);
            idx = shift_diff_ops_down(ops, old, new, idx);
        }
        idx += 1;
    }

    // Pass 2: insertions.
    let mut idx = 0;
    while idx < ops.len() {
        if let DiffTag::Insert = ops[idx].tag() {
            idx = shift_diff_ops_up(ops, old, new, idx);
            idx = shift_diff_ops_down(ops, old, new, idx);
        }
        idx += 1;
    }
}
/// Shifts the insert or delete op at `pointer` towards the start of `ops`.
///
/// As long as there is a preceding op, the op at `pointer` is
///
/// * moved across a preceding equal op by the length of the common suffix of
///   the equal items and the op's own items; the items that were moved over
///   are re-attached as an equal op directly behind the shifted op,
/// * swapped with a preceding op of the opposite kind, or
/// * merged into a preceding op of the same kind.
///
/// Empty equal ops encountered on the way are removed.  The loop stops when
/// the start of `ops` is reached or when a non-empty equal op shares no
/// suffix with the op.
///
/// Returns the index at which the (possibly merged) op ends up.
///
/// # Panics
///
/// Panics if `pointer` is out of bounds or if the op at `pointer` is neither
/// an insert nor a delete.
fn shift_diff_ops_up<Old, New>(
    ops: &mut Vec<DiffOp>,
    old: &Old,
    new: &New,
    mut pointer: usize,
) -> usize
where
    Old: Index<usize> + ?Sized,
    New: Index<usize> + ?Sized,
    New::Output: PartialEq<Old::Output>,
{
    while let Some(&prev_op) = pointer.checked_sub(1).and_then(|idx| ops.get(idx)) {
        let this_op = ops[pointer];
        match (this_op.tag(), prev_op.tag()) {
            // Shift Inserts Upwards
            (DiffTag::Insert, DiffTag::Equal) => {
                // The inserted items (new side) are compared with the old
                // side of the preceding equal op.
                let suffix_len =
                    common_suffix_len(old, prev_op.old_range(), new, this_op.new_range());
                if suffix_len > 0 {
                    // The tail of the insert becomes equal content behind it.
                    if let Some(DiffTag::Equal) = ops.get(pointer + 1).map(|x| x.tag()) {
                        ops[pointer + 1].grow_left(suffix_len);
                    } else {
                        ops.insert(
                            pointer + 1,
                            DiffOp::Equal {
                                old_index: prev_op.old_range().end - suffix_len,
                                new_index: this_op.new_range().end - suffix_len,
                                len: suffix_len,
                            },
                        );
                    }
                    ops[pointer].shift_left(suffix_len);
                    ops[pointer - 1].shrink_left(suffix_len);
                    if ops[pointer - 1].is_empty() {
                        ops.remove(pointer - 1);
                        pointer -= 1;
                    }
                } else if ops[pointer - 1].is_empty() {
                    ops.remove(pointer - 1);
                    pointer -= 1;
                } else {
                    // We can't shift upwards anymore
                    break;
                }
            }
            // Shift Deletions Upwards
            (DiffTag::Delete, DiffTag::Equal) => {
                // A delete carries no items on the new side, so the deleted
                // items (old side) are compared with the new side of the
                // preceding equal op, which holds the same items as its old
                // side.  This yields the amount we can shift.
                let suffix_len =
                    common_suffix_len(old, this_op.old_range(), new, prev_op.new_range());
                if suffix_len > 0 {
                    // The tail of the deleted range becomes equal content
                    // behind the shifted delete.
                    if let Some(DiffTag::Equal) = ops.get(pointer + 1).map(|x| x.tag()) {
                        ops[pointer + 1].grow_left(suffix_len);
                    } else {
                        ops.insert(
                            pointer + 1,
                            DiffOp::Equal {
                                old_index: this_op.old_range().end - suffix_len,
                                new_index: prev_op.new_range().end - suffix_len,
                                len: suffix_len,
                            },
                        );
                    }
                    ops[pointer].shift_left(suffix_len);
                    ops[pointer - 1].shrink_left(suffix_len);
                    if ops[pointer - 1].is_empty() {
                        ops.remove(pointer - 1);
                        pointer -= 1;
                    }
                } else if ops[pointer - 1].is_empty() {
                    ops.remove(pointer - 1);
                    pointer -= 1;
                } else {
                    // We can't shift upwards anymore
                    break;
                }
            }
            // Swap the Delete and Insert
            //
            // NOTE(review): the swap leaves each op's secondary index (an
            // insert's `old_index`, a delete's `new_index`) untouched;
            // confirm that downstream hooks do not rely on it.
            (DiffTag::Insert, DiffTag::Delete) | (DiffTag::Delete, DiffTag::Insert) => {
                ops.swap(pointer - 1, pointer);
                pointer -= 1;
            }
            // Merge the two ranges
            (DiffTag::Insert, DiffTag::Insert) => {
                ops[pointer - 1].grow_right(this_op.new_range().len());
                ops.remove(pointer);
                pointer -= 1;
            }
            (DiffTag::Delete, DiffTag::Delete) => {
                ops[pointer - 1].grow_right(this_op.old_range().len());
                ops.remove(pointer);
                pointer -= 1;
            }
            _ => unreachable!("unexpected tag"),
        }
    }
    pointer
}
/// Shifts the insert or delete op at `pointer` towards the end of `ops`.
///
/// As long as there is a following op, the op at `pointer` is
///
/// * moved across a following equal op by the length of the common prefix of
///   the equal items and the op's own items; the items that were moved over
///   are re-attached as an equal op directly in front of the shifted op,
/// * swapped with a following op of the opposite kind, or
/// * merged with a following op of the same kind.
///
/// Empty equal ops encountered on the way are removed.  The loop stops when
/// the end of `ops` is reached or when a non-empty equal op shares no prefix
/// with the op.
///
/// Returns the index at which the (possibly merged) op ends up.
///
/// # Panics
///
/// Panics if `pointer` is out of bounds or if the op at `pointer` is neither
/// an insert nor a delete.
fn shift_diff_ops_down<Old, New>(
    ops: &mut Vec<DiffOp>,
    old: &Old,
    new: &New,
    mut pointer: usize,
) -> usize
where
    Old: Index<usize> + ?Sized,
    New: Index<usize> + ?Sized,
    New::Output: PartialEq<Old::Output>,
{
    while let Some(&next_op) = pointer.checked_add(1).and_then(|idx| ops.get(idx)) {
        let this_op = ops[pointer];
        match (this_op.tag(), next_op.tag()) {
            // Shift Inserts Downwards
            (DiffTag::Insert, DiffTag::Equal) => {
                // The inserted items (new side) are compared with the old
                // side of the following equal op.
                let prefix_len =
                    common_prefix_len(old, next_op.old_range(), new, this_op.new_range());
                if prefix_len > 0 {
                    // The head of the insert becomes equal content in front
                    // of it.
                    if let Some(DiffTag::Equal) = pointer
                        .checked_sub(1)
                        .and_then(|x| ops.get(x))
                        .map(|x| x.tag())
                    {
                        ops[pointer - 1].grow_right(prefix_len);
                    } else {
                        ops.insert(
                            pointer,
                            DiffOp::Equal {
                                old_index: next_op.old_range().start,
                                new_index: this_op.new_range().start,
                                len: prefix_len,
                            },
                        );
                        // The op we are shifting moved one slot down.
                        pointer += 1;
                    }
                    ops[pointer].shift_right(prefix_len);
                    ops[pointer + 1].shrink_right(prefix_len);
                    if ops[pointer + 1].is_empty() {
                        ops.remove(pointer + 1);
                    }
                } else if ops[pointer + 1].is_empty() {
                    ops.remove(pointer + 1);
                } else {
                    // We can't shift downwards anymore
                    break;
                }
            }
            // Shift Deletions Downwards
            (DiffTag::Delete, DiffTag::Equal) => {
                // A delete carries no items on the new side, so the deleted
                // items (old side) are compared with the new side of the
                // following equal op, which holds the same items as its old
                // side.  The common prefix is the amount we can shift.
                let prefix_len =
                    common_prefix_len(old, this_op.old_range(), new, next_op.new_range());
                if prefix_len > 0 {
                    // The head of the deleted range becomes equal content in
                    // front of the shifted delete.
                    if let Some(DiffTag::Equal) = pointer
                        .checked_sub(1)
                        .and_then(|x| ops.get(x))
                        .map(|x| x.tag())
                    {
                        ops[pointer - 1].grow_right(prefix_len);
                    } else {
                        ops.insert(
                            pointer,
                            DiffOp::Equal {
                                old_index: this_op.old_range().start,
                                new_index: next_op.new_range().start,
                                len: prefix_len,
                            },
                        );
                        // The op we are shifting moved one slot down.
                        pointer += 1;
                    }
                    ops[pointer].shift_right(prefix_len);
                    ops[pointer + 1].shrink_right(prefix_len);
                    if ops[pointer + 1].is_empty() {
                        ops.remove(pointer + 1);
                    }
                } else if ops[pointer + 1].is_empty() {
                    ops.remove(pointer + 1);
                } else {
                    // We can't shift downwards anymore
                    break;
                }
            }
            // Swap the Delete and Insert
            //
            // NOTE(review): the swap leaves each op's secondary index (an
            // insert's `old_index`, a delete's `new_index`) untouched;
            // confirm that downstream hooks do not rely on it.
            (DiffTag::Insert, DiffTag::Delete) | (DiffTag::Delete, DiffTag::Insert) => {
                ops.swap(pointer, pointer + 1);
                pointer += 1;
            }
            // Merge the two ranges
            (DiffTag::Insert, DiffTag::Insert) => {
                ops[pointer].grow_right(next_op.new_range().len());
                ops.remove(pointer + 1);
            }
            (DiffTag::Delete, DiffTag::Delete) => {
                ops[pointer].grow_right(next_op.old_range().len());
                ops.remove(pointer + 1);
            }
            _ => unreachable!("unexpected tag"),
        }
    }
    pointer
}

View file

@ -34,17 +34,20 @@
//! [`capture_diff_slices`](crate::capture_diff_slices).
mod capture;
mod compact;
mod hook;
mod replace;
mod utils;
pub(crate) mod utils;
use std::hash::Hash;
use std::ops::{Index, Range};
use std::time::Instant;
pub use capture::Capture;
pub use compact::Compact;
pub use hook::{DiffHook, NoFinishHook};
pub use replace::Replace;
pub use utils::IdentifyDistinct;
#[doc(no_inline)]
pub use crate::Algorithm;

View file

@ -65,8 +65,8 @@ where
New::Output: PartialEq<Old::Output>,
{
let max_d = max_d(old_range.len(), new_range.len());
let mut vf = V::new(max_d);
let mut vb = V::new(max_d);
let mut vf = V::new(max_d);
conquer(
d, old, old_range, new, new_range, &mut vf, &mut vb, deadline,
)?;

View file

@ -1,8 +1,8 @@
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::fmt::Debug;
use std::hash::Hash;
use std::ops::{Index, Range};
use std::hash::{Hash, Hasher};
use std::ops::{Add, Index, Range};
/// Utility function to check if a range is empty that works on older rust versions
#[inline(always)]
@ -142,6 +142,175 @@ where
.count()
}
/// A vector that is addressed with indexes starting at `offset` instead of
/// zero: index `i` resolves to `vec[i - offset]`.
struct OffsetLookup<Int> {
    offset: usize,
    vec: Vec<Int>,
}

impl<Int> Index<usize> for OffsetLookup<Int> {
    type Output = Int;

    /// Returns the element stored for `index`.
    ///
    /// Panics when `index` does not fall into `offset..offset + vec.len()`.
    #[inline(always)]
    fn index(&self, index: usize) -> &Self::Output {
        let position = index - self.offset;
        &self.vec[position]
    }
}
/// A utility struct to convert distinct items to unique integers.
///
/// This can be helpful on larger inputs to speed up the comparisons
/// performed by doing a first pass where the data set gets reduced
/// to (small) integers.
///
/// The idea is that instead of passing two sequences to a diffing algorithm
/// you first pass them through [`IdentifyDistinct`]:
///
/// ```rust
/// use similar::capture_diff;
/// use similar::algorithms::{Algorithm, IdentifyDistinct};
///
/// let old = &["foo", "bar", "baz"][..];
/// let new = &["foo", "blah", "baz"][..];
/// let h = IdentifyDistinct::<u32>::new(old, 0..old.len(), new, 0..new.len());
/// let ops = capture_diff(
/// Algorithm::Myers,
/// h.old_lookup(),
/// h.old_range(),
/// h.new_lookup(),
/// h.new_range(),
/// );
/// ```
///
/// The indexes are the same as with the passed source ranges.
pub struct IdentifyDistinct<Int> {
// Integer ids of the old items, addressed by the original old indexes.
old: OffsetLookup<Int>,
// Integer ids of the new items, addressed by the original new indexes.
new: OffsetLookup<Int>,
}
impl<Int> IdentifyDistinct<Int>
where
    Int: Add<Output = Int> + From<u8> + Default + Copy,
{
    /// Creates an int hasher for two sequences.
    ///
    /// Every distinct item found in `old[old_range]` and `new[new_range]` is
    /// given an integer id.  Ids start at `Int::default()` and grow by
    /// `Int::from(1)` in order of first appearance, with the old range being
    /// visited before the new range.  Equal items share one id.
    pub fn new<Old, New>(
        old: &Old,
        old_range: Range<usize>,
        new: &New,
        new_range: Range<usize>,
    ) -> Self
    where
        Old: Index<usize> + ?Sized,
        Old::Output: Eq + Hash,
        New: Index<usize> + ?Sized,
        New::Output: Eq + Hash + PartialEq<Old::Output>,
    {
        // Map key that can borrow an item from either side, so that old and
        // new items can live in the same map even though their types differ.
        enum Key<'old, 'new, Old: ?Sized, New: ?Sized> {
            Old(&'old Old),
            New(&'new New),
        }

        impl<'old, 'new, Old, New> Hash for Key<'old, 'new, Old, New>
        where
            Old: Hash + ?Sized,
            New: Hash + ?Sized,
        {
            // The hash is that of the borrowed item; the side it came from
            // does not take part in it.
            fn hash<H: Hasher>(&self, state: &mut H) {
                match *self {
                    Key::Old(val) => val.hash(state),
                    Key::New(val) => val.hash(state),
                }
            }
        }

        impl<'old, 'new, Old, New> PartialEq for Key<'old, 'new, Old, New>
        where
            Old: Eq + ?Sized,
            New: Eq + PartialEq<Old> + ?Sized,
        {
            // Mixed comparisons always go through `New: PartialEq<Old>`.
            #[inline(always)]
            fn eq(&self, other: &Self) -> bool {
                match (self, other) {
                    (Key::Old(a), Key::Old(b)) => a == b,
                    (Key::New(a), Key::New(b)) => a == b,
                    (Key::Old(a), Key::New(b)) | (Key::New(b), Key::Old(a)) => b == a,
                }
            }
        }

        impl<'old, 'new, Old, New> Eq for Key<'old, 'new, Old, New>
        where
            Old: Eq + ?Sized,
            New: Eq + PartialEq<Old> + ?Sized,
        {
        }

        // The ranges are consumed below, so remember where they start.
        let old_offset = old_range.start;
        let new_offset = new_range.start;

        let one = Int::from(1);
        let mut ids = HashMap::new();
        let mut counter = Int::default();

        let old_ids: Vec<Int> = old_range
            .map(|idx| {
                *ids.entry(Key::Old(&old[idx])).or_insert_with(|| {
                    let fresh = counter;
                    counter = counter + one;
                    fresh
                })
            })
            .collect();

        let new_ids: Vec<Int> = new_range
            .map(|idx| {
                *ids.entry(Key::New(&new[idx])).or_insert_with(|| {
                    let fresh = counter;
                    counter = counter + one;
                    fresh
                })
            })
            .collect();

        IdentifyDistinct {
            old: OffsetLookup {
                offset: old_offset,
                vec: old_ids,
            },
            new: OffsetLookup {
                offset: new_offset,
                vec: new_ids,
            },
        }
    }

    /// Returns a lookup for the old side.
    ///
    /// It is indexed with the same indexes as the original old sequence.
    pub fn old_lookup(&self) -> &impl Index<usize, Output = Int> {
        let IdentifyDistinct { old, .. } = self;
        old
    }

    /// Returns a lookup for the new side.
    ///
    /// It is indexed with the same indexes as the original new sequence.
    pub fn new_lookup(&self) -> &impl Index<usize, Output = Int> {
        let IdentifyDistinct { new, .. } = self;
        new
    }

    /// Convenience method to get back the old range.
    pub fn old_range(&self) -> Range<usize> {
        let start = self.old.offset;
        start..start + self.old.vec.len()
    }

    /// Convenience method to get back the new range.
    pub fn new_range(&self) -> Range<usize> {
        let start = self.new.offset;
        start..start + self.new.vec.len()
    }
}
#[test]
fn test_unique() {
let u = unique(&vec!['a', 'b', 'c', 'd', 'd', 'b'], 0..6)
@ -151,6 +320,24 @@ fn test_unique() {
assert_eq!(u, vec![('a', 0), ('c', 2)]);
}
#[test]
fn test_int_hasher() {
    let old_items = ["", "foo", "bar", "baz"];
    let new_items = ["", "foo", "blah", "baz"];
    let ih = IdentifyDistinct::<u8>::new(&old_items[..], 1..4, &new_items[..], 1..4);

    // The old items are seen first and numbered in order of appearance.
    let old_ids = ih.old_lookup();
    assert_eq!((old_ids[1], old_ids[2], old_ids[3]), (0, 1, 2));

    // "foo" and "baz" reuse their ids, "blah" gets the next free one.
    let new_ids = ih.new_lookup();
    assert_eq!((new_ids[1], new_ids[2], new_ids[3]), (0, 3, 2));

    // The ranges handed in come back unchanged.
    assert_eq!(ih.old_range(), 1..4);
    assert_eq!(ih.new_range(), 1..4);
}
#[test]
fn test_common_prefix_len() {
assert_eq!(

View file

@ -2,14 +2,14 @@ use std::hash::Hash;
use std::ops::{Index, Range};
use std::time::Instant;
use crate::algorithms::{diff_deadline, diff_slices_deadline, Capture, Replace};
use crate::algorithms::{diff_deadline, Capture, Compact, Replace};
use crate::{Algorithm, DiffOp};
/// Creates a diff between old and new with the given algorithm capturing the ops.
///
/// This is like [`diff`](crate::algorithms::diff) but instead of using an
/// arbitrary hook this will always use [`Replace`] + [`Capture`] and return the
/// captured [`DiffOp`]s.
/// arbitrary hook this will always use [`Compact`] + [`Replace`] + [`Capture`]
/// and return the captured [`DiffOp`]s.
pub fn capture_diff<Old, New>(
alg: Algorithm,
old: &Old,
@ -43,9 +43,9 @@ where
Old::Output: Hash + Eq + Ord,
New::Output: PartialEq<Old::Output> + Hash + Eq + Ord,
{
let mut d = Replace::new(Capture::new());
let mut d = Compact::new(Replace::new(Capture::new()), old, new);
diff_deadline(alg, &mut d, old, old_range, new, new_range, deadline).unwrap();
d.into_inner().into_ops()
d.into_inner().into_inner().into_ops()
}
/// Creates a diff between old and new with the given algorithm capturing the ops.
@ -68,9 +68,7 @@ pub fn capture_diff_slices_deadline<T>(
where
T: Eq + Hash + Ord,
{
let mut d = Replace::new(Capture::new());
diff_slices_deadline(alg, &mut d, old, new, deadline).unwrap();
d.into_inner().into_ops()
capture_diff_deadline(alg, old, 0..old.len(), new, 0..new.len(), deadline)
}
/// Return a measure of similarity in the range `0..=1`.

View file

@ -14,9 +14,10 @@ pub use self::abstraction::{DiffableStr, DiffableStrRef};
pub use self::inline::InlineChange;
use self::utils::{upper_seq_ratio, QuickSeqRatio};
use crate::algorithms::IdentifyDistinct;
use crate::iter::{AllChangesIter, ChangesIter};
use crate::udiff::UnifiedDiff;
use crate::{capture_diff_slices_deadline, get_diff_ratio, group_diff_ops, Algorithm, DiffOp};
use crate::{capture_diff_deadline, get_diff_ratio, group_diff_ops, Algorithm, DiffOp};
#[derive(Debug, Clone, Copy)]
enum Deadline {
@ -327,12 +328,27 @@ impl TextDiffConfig {
new: Cow<'bufs, [&'new T]>,
newline_terminated: bool,
) -> TextDiff<'old, 'new, 'bufs, T> {
let ops = capture_diff_slices_deadline(
self.algorithm,
&old,
&new,
self.deadline.map(|x| x.into_instant()),
);
let deadline = self.deadline.map(|x| x.into_instant());
let ops = if old.len() > 100 || new.len() > 100 {
let ih = IdentifyDistinct::<u32>::new(&old[..], 0..old.len(), &new[..], 0..new.len());
capture_diff_deadline(
self.algorithm,
ih.old_lookup(),
ih.old_range(),
ih.new_lookup(),
ih.new_range(),
deadline,
)
} else {
capture_diff_deadline(
self.algorithm,
&old[..],
0..old.len(),
&new[..],
0..new.len(),
deadline,
)
};
TextDiff {
old,
new,

View file

@ -1,6 +1,7 @@
use std::fmt;
use std::ops::{Index, Range};
use crate::algorithms::utils::is_empty_range;
use crate::algorithms::DiffHook;
use crate::iter::ChangesIter;
@ -343,6 +344,87 @@ impl DiffOp {
.chain(Some((ChangeTag::Insert, &new[new_index..new_index + new_len])).into_iter()),
}
}
/// Returns `true` if neither the old nor the new range of the op covers
/// any item.
pub(crate) fn is_empty(&self) -> bool {
    let (_, old_range, new_range) = self.as_tag_tuple();
    let covers_items = !is_empty_range(&old_range) || !is_empty_range(&new_range);
    !covers_items
}
/// Moves the op towards the start: both indexes are decreased by `adjust`,
/// the lengths stay as they are.
pub(crate) fn shift_left(&mut self, adjust: usize) {
    let offset_change = (adjust, true);
    let len_change = (0, false);
    self.adjust(offset_change, len_change);
}
/// Moves the op towards the end: both indexes are increased by `adjust`,
/// the lengths stay as they are.
pub(crate) fn shift_right(&mut self, adjust: usize) {
    let offset_change = (adjust, false);
    let len_change = (0, false);
    self.adjust(offset_change, len_change);
}
/// Extends the op at its start: both indexes are decreased by `adjust` and
/// the lengths are increased by the same amount.
pub(crate) fn grow_left(&mut self, adjust: usize) {
    let offset_change = (adjust, true);
    let len_change = (adjust, false);
    self.adjust(offset_change, len_change);
}
/// Extends the op at its end: the lengths are increased by `adjust`, the
/// indexes stay as they are.
pub(crate) fn grow_right(&mut self, adjust: usize) {
    let offset_change = (0, false);
    let len_change = (adjust, false);
    self.adjust(offset_change, len_change);
}
/// Cuts `adjust` items off the end of the op: the lengths are decreased by
/// `adjust`, the indexes stay as they are.
pub(crate) fn shrink_left(&mut self, adjust: usize) {
    let offset_change = (0, false);
    let len_change = (adjust, true);
    self.adjust(offset_change, len_change);
}
/// Cuts `adjust` items off the start of the op: both indexes are increased
/// by `adjust` and the lengths are decreased by the same amount.
pub(crate) fn shrink_right(&mut self, adjust: usize) {
    let offset_change = (adjust, false);
    let len_change = (adjust, true);
    self.adjust(offset_change, len_change);
}
/// Applies a change to the indexes and a change to the lengths of the op.
///
/// Both arguments are `(amount, subtract)` pairs: the amount is subtracted
/// when the flag is `true` and added otherwise.  `adjust_offset` is applied
/// to `old_index` and `new_index`, `adjust_len` to every length field the
/// variant has.
fn adjust(&mut self, adjust_offset: (usize, bool), adjust_len: (usize, bool)) {
    #[inline(always)]
    fn apply(target: &mut usize, (amount, subtract): (usize, bool)) {
        if subtract {
            *target -= amount;
        } else {
            *target += amount;
        }
    }

    match self {
        DiffOp::Equal {
            old_index,
            new_index,
            len,
        } => {
            apply(old_index, adjust_offset);
            apply(new_index, adjust_offset);
            apply(len, adjust_len);
        }
        DiffOp::Delete {
            old_index,
            old_len,
            new_index,
        } => {
            apply(old_index, adjust_offset);
            apply(old_len, adjust_len);
            apply(new_index, adjust_offset);
        }
        DiffOp::Insert {
            old_index,
            new_index,
            new_len,
        } => {
            apply(old_index, adjust_offset);
            apply(new_index, adjust_offset);
            apply(new_len, adjust_len);
        }
        DiffOp::Replace {
            old_index,
            old_len,
            new_index,
            new_len,
        } => {
            apply(old_index, adjust_offset);
            apply(old_len, adjust_len);
            apply(new_index, adjust_offset);
            apply(new_len, adjust_len);
        }
    }
}
}
#[cfg(feature = "text")]