diff --git a/CHANGELOG.md b/CHANGELOG.md index bd38c4d..86324c4 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,7 +5,11 @@ All notable changes to similar are documented here. ## 1.2.0 * Make the unicode feature optional for inline diffing. -* Added Hunt–McIlroy LCS algorithm. +* Added Hunt–McIlroy LCS algorithm (`lcs`). +* Changed the implementation of Myers' diff. This has slightly changed the + behavior but resulted in significantly improved performance and more + readable code. +* Added `NoFinishHook` to aid composing of diff hooks. ## 1.1.0 diff --git a/Cargo.toml b/Cargo.toml index ef5e04a..879bfa8 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,7 +1,7 @@ [package] name = "similar" -version = "1.1.0" -authors = ["Armin Ronacher ", "Pierre-Étienne Meunier "] +version = "1.2.0" +authors = ["Armin Ronacher ", "Pierre-Étienne Meunier ", "Brandon Williams "] edition = "2018" license = "Apache-2.0" description = "A diff library for Rust" @@ -56,3 +56,6 @@ required-features = ["text", "bytes"] [[example]] name = "close-matches" required-features = ["text"] + +[profile.release] +debug = true diff --git a/src/algorithms/capture.rs b/src/algorithms/capture.rs index 11f3d76..49375a5 100644 --- a/src/algorithms/capture.rs +++ b/src/algorithms/capture.rs @@ -34,6 +34,7 @@ impl Capture { impl DiffHook for Capture { type Error = Infallible; + #[inline(always)] fn equal(&mut self, old_index: usize, new_index: usize, len: usize) -> Result<(), Self::Error> { self.0.push(DiffOp::Equal { old_index, @@ -43,6 +44,7 @@ impl DiffHook for Capture { Ok(()) } + #[inline(always)] fn delete( &mut self, old_index: usize, @@ -57,6 +59,7 @@ impl DiffHook for Capture { Ok(()) } + #[inline(always)] fn insert( &mut self, old_index: usize, @@ -71,6 +74,7 @@ impl DiffHook for Capture { Ok(()) } + #[inline(always)] fn replace( &mut self, old_index: usize, diff --git a/src/algorithms/hook.rs b/src/algorithms/hook.rs index 8106bcd..23dcb7f 100644 --- a/src/algorithms/hook.rs +++ 
b/src/algorithms/hook.rs @@ -51,6 +51,7 @@ pub trait DiffHook: Sized { /// /// You can use the [`Replace`](crate::algorithms::Replace) hook to /// automatically generate these. + #[inline(always)] fn replace( &mut self, old_index: usize, @@ -63,6 +64,7 @@ pub trait DiffHook: Sized { } /// Always called at the end of the algorithm. + #[inline(always)] fn finish(&mut self) -> Result<(), Self::Error> { Ok(()) } @@ -71,10 +73,12 @@ pub trait DiffHook: Sized { impl<'a, D: DiffHook + 'a> DiffHook for &'a mut D { type Error = D::Error; + #[inline(always)] fn equal(&mut self, old_index: usize, new_index: usize, len: usize) -> Result<(), Self::Error> { (*self).equal(old_index, new_index, len) } + #[inline(always)] fn delete( &mut self, old_index: usize, @@ -84,6 +88,7 @@ impl<'a, D: DiffHook + 'a> DiffHook for &'a mut D { (*self).delete(old_index, old_len, new_index) } + #[inline(always)] fn insert( &mut self, old_index: usize, @@ -93,6 +98,7 @@ impl<'a, D: DiffHook + 'a> DiffHook for &'a mut D { (*self).insert(old_index, new_index, new_len) } + #[inline(always)] fn replace( &mut self, old: usize, @@ -103,7 +109,70 @@ impl<'a, D: DiffHook + 'a> DiffHook for &'a mut D { (*self).replace(old, old_len, new, new_len) } + #[inline(always)] fn finish(&mut self) -> Result<(), Self::Error> { (*self).finish() } } + +/// Wrapper [`DiffHook`] that prevents calls to [`DiffHook::finish`]. +/// +/// This hook is useful in situations where diff hooks are composed but you +/// want to prevent the finish hook method from being called. +pub struct NoFinishHook(D); + +impl NoFinishHook { + /// Wraps another hook. + pub fn new(d: D) -> NoFinishHook { + NoFinishHook(d) + } + + /// Extracts the inner hook. 
+ pub fn into_inner(self) -> D { + self.0 + } +} + +impl DiffHook for NoFinishHook { + type Error = D::Error; + + #[inline(always)] + fn equal(&mut self, old_index: usize, new_index: usize, len: usize) -> Result<(), Self::Error> { + self.0.equal(old_index, new_index, len) + } + + #[inline(always)] + fn delete( + &mut self, + old_index: usize, + old_len: usize, + new_index: usize, + ) -> Result<(), Self::Error> { + self.0.delete(old_index, old_len, new_index) + } + + #[inline(always)] + fn insert( + &mut self, + old_index: usize, + new_index: usize, + new_len: usize, + ) -> Result<(), Self::Error> { + self.0.insert(old_index, new_index, new_len) + } + + #[inline(always)] + fn replace( + &mut self, + old_index: usize, + old_len: usize, + new_index: usize, + new_len: usize, + ) -> Result<(), Self::Error> { + self.0.replace(old_index, old_len, new_index, new_len) + } + + fn finish(&mut self) -> Result<(), Self::Error> { + Ok(()) + } +} diff --git a/src/algorithms/mod.rs b/src/algorithms/mod.rs index c9c47f0..0654142 100644 --- a/src/algorithms/mod.rs +++ b/src/algorithms/mod.rs @@ -41,7 +41,7 @@ use std::hash::Hash; use std::ops::{Index, Range}; pub use capture::Capture; -pub use hook::DiffHook; +pub use hook::{DiffHook, NoFinishHook}; pub use replace::Replace; #[doc(no_inline)] diff --git a/src/algorithms/myers.rs b/src/algorithms/myers.rs index 2b5700c..e4e98ad 100644 --- a/src/algorithms/myers.rs +++ b/src/algorithms/myers.rs @@ -5,9 +5,11 @@ //! //! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf) //! describing it. +//! +//! The implementation of this algorithm is based on the implementation by +//! Brandon Williams. 
-use std::cmp::{max, min}; -use std::ops::{Index, Range}; +use std::ops::{Index, IndexMut, Range}; use crate::algorithms::DiffHook; @@ -27,15 +29,10 @@ where D: DiffHook, New::Output: PartialEq, { - diff_offsets( - d, - old, - old_range.start, - old_range.end, - new, - new_range.start, - new_range.end, - )?; + let max_d = max_d(old_range.len(), new_range.len()); + let mut vf = V::new(max_d); + let mut vb = V::new(max_d); + conquer(d, old, old_range, new, new_range, &mut vf, &mut vb)?; d.finish() } @@ -48,135 +45,312 @@ where diff(d, old, 0..old.len(), new, 0..new.len()) } -fn modulo(a: isize, b: usize) -> usize { - a.rem_euclid(b as isize) as usize +// A D-path is a path which starts at (0,0) that has exactly D non-diagonal +// edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge +// and then a possibly empty sequence of diagonal edges called a snake. + +/// `V` contains the endpoints of the furthest reaching `D-paths`. For each +/// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x` because +/// `y` can be computed from `x - k`. In other words, `V` is an array of integers +/// where `V[k]` contains the row index of the endpoint of the furthest reaching +/// path in diagonal `k`. +/// +/// We can't use a traditional Vec to represent `V` since we use `k` as an index +/// and it can take on negative values. So instead `V` is represented as a +/// light-weight wrapper around a Vec plus an `offset` which is the maximum value +/// `k` can take on in order to map negative `k`'s back to a value >= 0. 
+#[derive(Debug)] +struct V { + offset: isize, + v: Vec, // Look into initializing this to -1 and storing isize } -pub(crate) fn diff_offsets( - diff: &mut D, +impl V { + fn new(max_d: usize) -> Self { + Self { + offset: max_d as isize, + v: vec![0; 2 * max_d], + } + } + + fn len(&self) -> usize { + self.v.len() + } +} + +impl Index for V { + type Output = usize; + + fn index(&self, index: isize) -> &Self::Output { + &self.v[(index + self.offset) as usize] + } +} + +impl IndexMut for V { + fn index_mut(&mut self, index: isize) -> &mut Self::Output { + &mut self.v[(index + self.offset) as usize] + } +} + +/// A `Snake` is a sequence of diagonal edges in the edit graph. Normally +/// a snake has a start and end point (and it is possible for a snake to have +/// a length of zero, meaning the start and end points are the same) however +/// we do not need the end point which is why it's not implemented here. +#[derive(Debug)] +struct Snake { + x_start: usize, + y_start: usize, +} + +fn max_d(len1: usize, len2: usize) -> usize { + // XXX look into reducing the need to have the additional '+ 1' + (len1 + len2 + 1) / 2 + 1 +} + +fn common_prefix_len( old: &Old, - old_current: usize, - old_end: usize, + old_range: Range, new: &New, - new_current: usize, - new_end: usize, -) -> Result<(), D::Error> + new_range: Range, +) -> usize where - D: DiffHook + ?Sized, Old: Index + ?Sized, New: Index + ?Sized, New::Output: PartialEq, { - #![allow(clippy::many_single_char_names)] - if old_end > old_current && new_end > new_current { - let old_span = old_end - old_current; - let new_span = new_end - new_current; - let total_span = (old_span + new_span) as isize; - let vec_size = (2 * min(old_span, new_span) + 2) as usize; - let w = old_span as isize - new_span as isize; - let mut vec_down = vec![0; vec_size as usize]; - let mut vec_up = vec![0; vec_size as usize]; - for i in 0..=(total_span / 2 + total_span % 2) { - for &inverse in &[true, false][..] 
{ - let (v1, v2) = if inverse { - (&mut vec_down, &mut vec_up) - } else { - (&mut vec_up, &mut vec_down) - }; - let j_start = -(i - 2 * max(0, i - new_span as isize)); - let j_end = i - 2 * max(0, i - old_span as isize) + 1; - for j in (j_start..j_end).step_by(2) { - let mut a: usize = if j == -i - || j != i && v1[modulo(j - 1, vec_size)] < v1[modulo(j + 1, vec_size)] - { - v1[modulo(j + 1, vec_size)] - } else { - v1[modulo(j - 1, vec_size)] + 1 + if old_range.is_empty() || new_range.is_empty() { + return 0; + } + new_range + .zip(old_range) + .take_while( + #[inline(always)] + |x| new[x.0] == old[x.1], + ) + .count() +} + +fn common_suffix_len( + old: &Old, + old_range: Range, + new: &New, + new_range: Range, +) -> usize +where + Old: Index + ?Sized, + New: Index + ?Sized, + New::Output: PartialEq, +{ + if old_range.is_empty() || new_range.is_empty() { + return 0; + } + new_range + .rev() + .zip(old_range.rev()) + .take_while( + #[inline(always)] + |x| new[x.0] == old[x.1], + ) + .count() +} + +#[inline(always)] +fn split_at(range: Range, at: usize) -> (Range, Range) { + (range.start..at, at..range.end) +} + +// The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes +// some of which may be empty. The divide step requires finding the ceil(D/2) + +// 1 or middle snake of an optimal D-path. The idea for doing so is to +// simultaneously run the basic algorithm in both the forward and reverse +// directions until furthest reaching forward and reverse paths starting at +// opposing corners 'overlap'. +fn find_middle_snake( + old: &Old, + old_range: Range, + new: &New, + new_range: Range, + vf: &mut V, + vb: &mut V, +) -> Snake +where + Old: Index + ?Sized, + New: Index + ?Sized, + New::Output: PartialEq, +{ + let n = old_range.len(); + let m = new_range.len(); + + // By Lemma 1 in the paper, the optimal edit script length is odd or even as + // `delta` is odd or even. 
+ let delta = n as isize - m as isize; + let odd = delta & 1 == 1; + + // The initial point at (0, -1) + vf[1] = 0; + // The initial point at (N, M+1) + vb[1] = 0; + + // We only need to explore ceil(D/2) + 1 + let d_max = max_d(n, m); + assert!(vf.len() >= d_max); + assert!(vb.len() >= d_max); + + for d in 0..d_max as isize { + // Forward path + for k in (-d..=d).rev().step_by(2) { + let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) { + vf[k + 1] + } else { + vf[k - 1] + 1 + }; + let y = (x as isize - k) as usize; + + // The coordinate of the start of a snake + let (x0, y0) = (x, y); + // While these sequences are identical, keep moving through the + // graph with no cost + if x < old_range.len() && y < new_range.len() { + let advance = common_prefix_len( + old, + old_range.start + x..old_range.end, + new, + new_range.start + y..new_range.end, + ); + x += advance; + } + + // This is the new best x value + vf[k] = x; + + // Only check for connections from the forward search when N - M is + // odd and when there is a reciprocal k line coming from the other + // direction. 
+ if odd && (k - delta).abs() <= (d - 1) { + // TODO optimize this so we don't have to compare against n + if vf[k] + vb[-(k - delta)] >= n { + // Return the snake + return Snake { + x_start: x0 + old_range.start, + y_start: y0 + new_range.start, }; - let mut b = (a as isize - j) as usize; - let (s, t) = (a, b); - while a < old_span && b < new_span && { - let (e_i, f_i) = if inverse { - (a, b) - } else { - (old_span - a - 1, new_span - b - 1) - }; - new[new_current + f_i] == old[old_current + e_i] - } { - a += 1; - b += 1; - } - v1[modulo(j, vec_size)] = a; - let bound = if inverse { i - 1 } else { i }; - if (total_span % 2 == 1) == inverse - && w - j >= -bound - && w - j <= bound - && v1[modulo(j, vec_size)] + v2[modulo(w - j, vec_size)] >= old_span - { - let (x, y, u, v) = if inverse { - (s, t, a, b) - } else { - (old_span - a, new_span - b, old_span - s, new_span - t) - }; - if i + bound > 1 || (x != u && y != v) { - diff_offsets( - diff, - old, - old_current, - old_current + x, - new, - new_current, - new_current + y, - )?; - if x != u { - diff.equal(old_current + x, new_current + y, u - x)?; - } - diff_offsets( - diff, - old, - old_current + u, - old_end, - new, - new_current + v, - new_end, - )?; - return Ok(()); - } else if new_span > old_span { - diff.equal(old_current, new_current, old_span)?; - diff.insert( - old_current + old_span, - new_current + old_span, - new_span - old_span, - )?; - return Ok(()); - } else if new_span < old_span { - diff.equal(old_current, new_current, new_span)?; - diff.delete( - old_current + new_span, - old_span - new_span, - new_current + new_span, - )?; - return Ok(()); - } else { - return Ok(()); - } - } } } } - } else if old_end > old_current { - diff.delete(old_current, old_end - old_current, new_current)? - } else if new_end > new_current { - diff.insert(old_current, new_current, new_end - new_current)? 
+ + // Backward path + for k in (-d..=d).rev().step_by(2) { + let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) { + vb[k + 1] + } else { + vb[k - 1] + 1 + }; + let mut y = (x as isize - k) as usize; + + // The coordinate of the start of a snake + if x < n && y < m { + let advance = common_suffix_len( + old, + old_range.start..old_range.start + n - x, + new, + new_range.start..new_range.start + m - y, + ); + x += advance; + y += advance; + } + + // This is the new best x value + vb[k] = x; + + if !odd && (k - delta).abs() <= d { + // TODO optimize this so we don't have to compare against n + if vb[k] + vf[-(k - delta)] >= n { + // Return the snake + return Snake { + x_start: n - x + old_range.start, + y_start: m - y + new_range.start, + }; + } + } + } + + // TODO: Maybe there's an opportunity to optimize and bail early? } + + unreachable!("unable to find a middle snake"); +} + +fn conquer( + d: &mut D, + old: &Old, + mut old_range: Range, + new: &New, + mut new_range: Range, + vf: &mut V, + vb: &mut V, +) -> Result<(), D::Error> +where + Old: Index + ?Sized, + New: Index + ?Sized, + D: DiffHook, + New::Output: PartialEq, +{ + // Check for common prefix + let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone()); + if common_prefix_len > 0 { + d.equal(old_range.start, new_range.start, common_prefix_len)?; + } + old_range.start += common_prefix_len; + new_range.start += common_prefix_len; + + // Check for common suffix + let common_suffix_len = common_suffix_len(old, old_range.clone(), new, new_range.clone()); + let common_suffix = ( + old_range.end - common_suffix_len, + new_range.end - common_suffix_len, + ); + old_range.end -= common_suffix_len; + new_range.end -= common_suffix_len; + + if old_range.is_empty() && new_range.is_empty() { + // Do nothing + } else if new_range.is_empty() { + d.delete( + old_range.start, + old_range.end - old_range.start, + new_range.start, + )?; + } else if old_range.is_empty() { + d.insert( + 
old_range.start, + new_range.start, + new_range.end - new_range.start, + )?; + } else { + let snake = find_middle_snake(old, old_range.clone(), new, new_range.clone(), vf, vb); + let (old_a, old_b) = split_at(old_range, snake.x_start); + let (new_a, new_b) = split_at(new_range, snake.y_start); + conquer(d, old, old_a, new, new_a, vf, vb)?; + conquer(d, old, old_b, new, new_b, vf, vb)?; + } + + if common_suffix_len > 0 { + d.equal(common_suffix.0, common_suffix.1, common_suffix_len)?; + } + Ok(()) } #[test] -fn test_modulo() { - assert_eq!(modulo(-11, 10), 9); - assert_eq!(modulo(23, 7), 2); - assert_eq!(modulo(-12, 6), 0); +fn test_find_middle_snake() { + let a = &b"ABCABBA"[..]; + let b = &b"CBABAC"[..]; + let max_d = max_d(a.len(), b.len()); + let mut vf = V::new(max_d); + let mut vb = V::new(max_d); + let snake = find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb); + assert_eq!(snake.x_start, 4); + assert_eq!(snake.y_start, 1); } #[test] diff --git a/src/algorithms/patience.rs b/src/algorithms/patience.rs index b70b8b2..a442316 100644 --- a/src/algorithms/patience.rs +++ b/src/algorithms/patience.rs @@ -5,12 +5,15 @@ //! //! Tends to give more human-readable outputs. See [Bram Cohen's blog //! post](https://bramcohen.livejournal.com/73318.html) describing it. +//! +//! This is based on the patience implementation of [pijul](https://pijul.org/) +//! by Pierre-Étienne Meunier. use std::collections::hash_map::Entry; use std::collections::HashMap; use std::hash::Hash; use std::ops::{Index, Range}; -use crate::algorithms::{myers, DiffHook, Replace}; +use crate::algorithms::{myers, DiffHook, NoFinishHook, Replace}; /// Patience diff algorithm. /// @@ -147,16 +150,15 @@ where self.new_current += 1; } if self.old_current > a0 { - self.d.equal(a0, b0, self.old_current - a0)? 
+ self.d.equal(a0, b0, self.old_current - a0)?; } - myers::diff_offsets( - self.d, + let mut no_finish_d = NoFinishHook::new(&mut self.d); + myers::diff( + &mut no_finish_d, self.old, - self.old_current, - self.old_indexes[old].index, + self.old_current..self.old_indexes[old].index, self.new, - self.new_current, - self.new_indexes[new].index, + self.new_current..self.new_indexes[new].index, )?; self.old_current = self.old_indexes[old].index; self.new_current = self.new_indexes[new].index; diff --git a/src/algorithms/snapshots/similar__algorithms__myers__contiguous.snap b/src/algorithms/snapshots/similar__algorithms__myers__contiguous.snap index 05b117c..38cca88 100644 --- a/src/algorithms/snapshots/similar__algorithms__myers__contiguous.snap +++ b/src/algorithms/snapshots/similar__algorithms__myers__contiguous.snap @@ -10,18 +10,18 @@ expression: d.into_inner().ops() }, Replace { old_index: 3, - old_len: 2, + old_len: 1, new_index: 3, new_len: 2, }, Equal { - old_index: 5, + old_index: 4, new_index: 5, len: 2, }, Replace { - old_index: 7, - old_len: 1, + old_index: 6, + old_len: 2, new_index: 7, new_len: 1, }, diff --git a/src/algorithms/snapshots/similar__algorithms__patience__patience.snap b/src/algorithms/snapshots/similar__algorithms__patience__patience.snap index d86a588..3ebd78a 100644 --- a/src/algorithms/snapshots/similar__algorithms__patience__patience.snap +++ b/src/algorithms/snapshots/similar__algorithms__patience__patience.snap @@ -16,18 +16,18 @@ expression: d.into_inner().ops() }, Replace { old_index: 4, - old_len: 2, + old_len: 1, new_index: 4, new_len: 2, }, Equal { - old_index: 6, + old_index: 5, new_index: 6, len: 2, }, Replace { - old_index: 8, - old_len: 1, + old_index: 7, + old_len: 2, new_index: 8, new_len: 1, }, diff --git a/src/lib.rs b/src/lib.rs index 19a4fec..fc7f9a3 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ //! This crate implements diffing utilities. It attempts to provide an abstraction -//! 
interface over different types of diffing algorithms. It's based on the -//! the diff algorithm implementations of [pijul](https://pijul.org/). +//! interface over different types of diffing algorithms. The design of the +//! library is inspired by pijul's diff library by Pierre-Étienne Meunier and +//! also inherits the patience diff algorithm from there. //! //! The API of the crate is split into high and low level functionality. Most //! of what you probably want to use is available top level. Additionally the diff --git a/src/text/snapshots/similar__text__captured_word_ops.snap b/src/text/snapshots/similar__text__captured_word_ops.snap index 203ee83..9232c8d 100644 --- a/src/text/snapshots/similar__text__captured_word_ops.snap +++ b/src/text/snapshots/similar__text__captured_word_ops.snap @@ -54,8 +54,10 @@ expression: "&changes" value: "some", }, Change { - tag: Insert, - old_index: None, + tag: Equal, + old_index: Some( + 5, + ), new_index: Some( 5, ), @@ -70,10 +72,8 @@ expression: "&changes" value: "amazing", }, Change { - tag: Equal, - old_index: Some( - 5, - ), + tag: Insert, + old_index: None, new_index: Some( 7, ),