commit 3308d7b4d67b859f563cdb4528ba3d4e1fff5f06
Author: Armin Ronacher
Date:   Sun Jan 17 22:18:57 2021 +0100

    Initial implementation

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..96ef6c0
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,2 @@
+/target
+Cargo.lock
diff --git a/.vscode/settings.json b/.vscode/settings.json
new file mode 100644
index 0000000..7649dc6
--- /dev/null
+++ b/.vscode/settings.json
@@ -0,0 +1,4 @@
+{
+    "rust-analyzer.cargo.allFeatures": true,
+    "rust-analyzer.assist.importMergeBehavior": "last"
+}
\ No newline at end of file
diff --git a/Cargo.toml b/Cargo.toml
new file mode 100644
index 0000000..18f59c0
--- /dev/null
+++ b/Cargo.toml
@@ -0,0 +1,14 @@
+[package]
+name = "similar"
+version = "0.1.0"
+authors = ["Armin Ronacher ", "Pierre-Étienne Meunier "]
+edition = "2018"
+license = "MIT/Apache-2.0"
+
+# See more keys and their definitions at https://doc.rust-lang.org/cargo/reference/manifest.html
+
+[dependencies]
+diffs = "0.4.0"
+
+[dev-dependencies]
+insta = "1.5.2"
diff --git a/src/algorithms/hook.rs b/src/algorithms/hook.rs
new file mode 100644
index 0000000..4e38f2c
--- /dev/null
+++ b/src/algorithms/hook.rs
@@ -0,0 +1,85 @@
+/// A trait for reacting to an edit script from the "old" version to
+/// the "new" version.
+pub trait DiffHook: Sized {
+    type Error;
+
+    /// Called when lines with indices `old` (in the old version) and
+    /// `new` (in the new version) start a section equal in both
+    /// versions, of length `len`.
+    fn equal(&mut self, old: usize, new: usize, len: usize) -> Result<(), Self::Error> {
+        let _old = old;
+        let _new = new;
+        let _len = len;
+        Ok(())
+    }
+
+    /// Called when a section of length `old_len`, starting at `old`,
+    /// needs to be deleted from the old version.
+    fn delete(&mut self, old: usize, old_len: usize, new: usize) -> Result<(), Self::Error> {
+        let _old = old;
+        let _old_len = old_len;
+        let _new = new;
+        Ok(())
+    }
+
+    /// Called when a section of the new version, of length `new_len`
+    /// and starting at `new`, needs to be inserted at position `old`.
+    fn insert(&mut self, old: usize, new: usize, new_len: usize) -> Result<(), Self::Error> {
+        let _old = old;
+        let _new = new;
+        let _new_len = new_len;
+        Ok(())
+    }
+
+    /// Called when a section of the old version, starting at index
+    /// `old` and of length `old_len`, needs to be replaced with a
+    /// section of length `new_len`, starting at `new`, of the new
+    /// version.
+    ///
+    /// The default implementation invokes `delete` and then `insert`.
+    fn replace(
+        &mut self,
+        old: usize,
+        old_len: usize,
+        new: usize,
+        new_len: usize,
+    ) -> Result<(), Self::Error> {
+        self.delete(old, old_len, new)?;
+        self.insert(old, new, new_len)
+    }
+
+    /// Always called at the end of the algorithm.
+    fn finish(&mut self) -> Result<(), Self::Error> {
+        Ok(())
+    }
+}
+
+impl<'a, D: DiffHook + 'a> DiffHook for &'a mut D {
+    type Error = D::Error;
+
+    fn equal(&mut self, old: usize, new: usize, len: usize) -> Result<(), Self::Error> {
+        (*self).equal(old, new, len)
+    }
+
+    fn delete(&mut self, old: usize, len: usize, new: usize) -> Result<(), Self::Error> {
+        (*self).delete(old, len, new)
+    }
+
+    fn insert(&mut self, old: usize, new: usize, new_len: usize) -> Result<(), Self::Error> {
+        (*self).insert(old, new, new_len)
+    }
+
+    fn replace(
+        &mut self,
+        old: usize,
+        old_len: usize,
+        new: usize,
+        new_len: usize,
+    ) -> Result<(), Self::Error> {
+        (*self).replace(old, old_len, new, new_len)
+    }
+
+    fn finish(&mut self) -> Result<(), Self::Error> {
+        (*self).finish()
+    }
+}
diff --git a/src/algorithms/mod.rs b/src/algorithms/mod.rs
new file mode 100644
index 0000000..f9d7019
--- /dev/null
+++ b/src/algorithms/mod.rs
@@ -0,0 +1,22 @@
+//! Various diff (longest common subsequence) algorithms.
+//!
+//! The implementations of the algorithms in this module are relatively low
+//! level and expose the most generic bounds possible for the algorithm. To
+//! use them you would typically use the higher level API if possible but
+//! direct access to these algorithms can be useful in some cases.
+//!
+//! All these algorithms provide a `diff` function which takes two indexable
+//! objects (for instance slices) and a [`DiffHook`]. As the diff is generated
+//! the diff hook is invoked. Note that the diff hook does not get access to
+//! the actual values but only the indexes. This is why the diff hook is not
+//! used outside of the raw algorithm implementations as for most situations
+//! access to the values is useful or required.
+
+mod hook;
+mod replace;
+
+pub use hook::*;
+pub use replace::*;
+
+pub mod myers;
+pub mod patience;
diff --git a/src/algorithms/myers.rs b/src/algorithms/myers.rs
new file mode 100644
index 0000000..1a01916
--- /dev/null
+++ b/src/algorithms/myers.rs
@@ -0,0 +1,251 @@
+//! Myers' diff algorithm.
+//!
+//! * time: `O((N+M)D)`
+//! * space: `O(N+M)`
+//!
+//! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf)
+//! describing it.
+
+use std::cmp::{max, min};
+use std::ops::{Index, Range};
+
+use crate::algorithms::DiffHook;
+
+fn modulo(a: isize, b: usize) -> usize {
+    a.rem_euclid(b as isize) as usize
+}
+
+/// Myers' diff algorithm.
+///
+/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
+pub fn diff<Old, New, D>(
+    d: &mut D,
+    old: &Old,
+    old_range: Range<usize>,
+    new: &New,
+    new_range: Range<usize>,
+) -> Result<(), D::Error>
+where
+    Old: Index<usize> + ?Sized,
+    New: Index<usize> + ?Sized,
+    D: DiffHook,
+    New::Output: PartialEq<Old::Output>,
+{
+    diff_offsets(
+        d,
+        old,
+        old_range.start,
+        old_range.end,
+        new,
+        new_range.start,
+        new_range.end,
+    )?;
+    d.finish()
+}
+
+/// Shortcut for diffing slices.
+pub fn diff_slices<D, T>(d: &mut D, old: &[T], new: &[T]) -> Result<(), D::Error>
+where
+    D: DiffHook,
+    T: PartialEq,
+{
+    diff(d, old, 0..old.len(), new, 0..new.len())
+}
+
+pub(crate) fn diff_offsets<D, S, T>(
+    diff: &mut D,
+    e: &S,
+    i: usize,
+    i_: usize,
+    f: &T,
+    j: usize,
+    j_: usize,
+) -> Result<(), D::Error>
+where
+    D: DiffHook + ?Sized,
+    S: Index<usize> + ?Sized,
+    T: Index<usize> + ?Sized,
+    T::Output: PartialEq<S::Output>,
+{
+    if i_ > i && j_ > j {
+        let n = i_ - i;
+        let m = j_ - j;
+        let l = (n + m) as isize;
+        let z = (2 * min(n, m) + 2) as usize;
+        let w = n as isize - m as isize;
+        let mut g = vec![0; z as usize];
+        let mut p = vec![0; z as usize];
+        for h in 0..=(l / 2 + l % 2) {
+            macro_rules! search {
+                ($e: expr, $c: expr, $d: expr) => {
+                    let (k0, k1) = {
+                        let (m, n) = (m as isize, n as isize);
+                        (-(h - 2*max(0, h - m)), h-2*max(0, h-n)+1)
+                    };
+                    for k in (k0..k1).step_by(2) {
+                        let mut a: usize = if k == -h || k != h && $c[modulo(k-1, z)] < $c[modulo(k+1, z)] {
+                            $c[modulo(k+1, z)]
+                        } else {
+                            $c[modulo(k-1, z)] + 1
+                        };
+                        let mut b = (a as isize - k) as usize;
+                        let (s, t) = (a, b);
+                        while a < n && b < m && {
+                            let (e_i, f_i) = if $e { (a, b) } else { (n - a - 1, m - b - 1) };
+                            f[j + f_i] == e[i + e_i]
+                        } {
+                            a += 1;
+                            b += 1;
+                        }
+                        $c[modulo(k, z)] = a;
+                        let bound = if $e { h-1 } else { h };
+                        if (l%2 == 1) == $e
+                            && w-k >= -bound && w-k <= bound
+                            && $c[modulo(k, z)]+$d[modulo(w-k, z)] >= n
+                        {
+                            let (x, y, u, v) = if $e {
+                                (s, t, a, b)
+                            } else {
+                                (n-a, m-b, n-s, m-t)
+                            };
+                            if h + bound > 1 || (x != u && y != v) {
+                                diff_offsets(diff, e, i, i+x, f, j, j+y)?;
+                                if x != u {
+                                    diff.equal(i + x, j + y, u-x)?;
+                                }
+                                diff_offsets(diff, e, i+u, i_, f, j+v, j_)?;
+                                return Ok(())
+                            } else if m > n {
+                                diff.equal(i, j, n)?;
+                                diff.insert(i+n, j+n, m-n)?;
+                                return Ok(())
+                            } else if m < n {
+                                diff.equal(i, j, m)?;
+                                diff.delete(i+m, n-m, j+m)?;
+                                return Ok(())
+                            } else {
+                                return Ok(())
+                            }
+                        }
+                    }
+                }
+            }
+            search!(true, g, p);
+            search!(false, p, g);
+        }
+    } else if i_ > i {
+        diff.delete(i, i_ - i, j)?
+    } else if j_ > j {
+        diff.insert(i, j, j_ - j)?
+    }
+    Ok(())
+}
+
+#[test]
+fn test_modulo() {
+    assert_eq!(modulo(-11, 10), 9);
+    assert_eq!(modulo(23, 7), 2);
+    assert_eq!(modulo(-12, 6), 0);
+}
+
+#[test]
+fn test_diff() {
+    let a: &[usize] = &[0, 1, 2, 3, 4];
+    let b: &[usize] = &[0, 1, 2, 9, 4];
+
+    struct D;
+    impl DiffHook for D {
+        type Error = ();
+        fn delete(&mut self, o: usize, len: usize, new: usize) -> Result<(), ()> {
+            assert_eq!(o, 3);
+            assert_eq!(len, 1);
+            assert_eq!(new, 3);
+            println!("delete");
+            Ok(())
+        }
+        fn insert(&mut self, o: usize, n: usize, len: usize) -> Result<(), ()> {
+            assert_eq!(o, 3);
+            assert_eq!(n, 3);
+            assert_eq!(len, 1);
+            println!("insert");
+            Ok(())
+        }
+    }
+
+    let mut d = crate::algorithms::Replace::new(D);
+    diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap()
+}
+
+#[test]
+fn test_contiguous() {
+    let a: &[usize] = &[0, 1, 2, 3, 4, 4, 4, 5];
+    let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7];
+    struct D;
+
+    impl DiffHook for D {
+        type Error = ();
+        fn delete(&mut self, _o: usize, _len: usize, _new: usize) -> Result<(), ()> {
+            panic!("Should not delete")
+        }
+        fn insert(&mut self, _o: usize, _n: usize, _len: usize) -> Result<(), ()> {
+            panic!("Should not insert")
+        }
+        fn replace(&mut self, o: usize, l: usize, n: usize, nl: usize) -> Result<(), ()> {
+            assert!(o != 3 || (l == 2 && nl == 2));
+            assert!(o != 7 || (l == 1 && nl == 1));
+            println!("replace {:?} {:?} {:?} {:?}", o, l, n, nl);
+            Ok(())
+        }
+    }
+
+    let mut d = crate::algorithms::Replace::new(D);
+    diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
+}
+
+#[test]
+fn test_replace() {
+    let a: &[usize] = &[0, 1, 2, 3, 4];
+    let b: &[usize] = &[0, 1, 2, 7, 8, 9];
+
+    struct D;
+    impl DiffHook for D {
+        type Error = ();
+        fn delete(&mut self, _o: usize, _len: usize, _new: usize) -> Result<(), ()> {
+            panic!("should not delete")
+        }
+        fn insert(&mut self, _o: usize, _n: usize, _len: usize) -> Result<(), ()> {
+            panic!("should not insert")
+        }
+        fn replace(&mut self, _o: usize, _l: usize, _n: usize, _nl: usize) -> Result<(), ()> {
+            Ok(())
+        }
+    }
+    let mut d = crate::algorithms::Replace::new(D);
+    diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
+}
+
+#[test]
+fn test_pat() {
+    let a: &[usize] = &[0, 1, 3, 4, 5];
+    let b: &[usize] = &[0, 1, 4, 5, 8, 9];
+
+    struct D;
+    impl DiffHook for D {
+        type Error = ();
+        fn delete(&mut self, _o: usize, _len: usize, _new: usize) -> Result<(), ()> {
+            println!("delete {:?} {:?} {:?}", _o, _len, _new);
+            Ok(())
+        }
+        fn insert(&mut self, _o: usize, _n: usize, _len: usize) -> Result<(), ()> {
+            println!("insert {:?} {:?} {:?}", _o, _n, _len);
+            Ok(())
+        }
+        fn replace(&mut self, _o: usize, _l: usize, _n: usize, _nl: usize) -> Result<(), ()> {
+            println!("replace {:?} {:?} {:?} {:?}", _o, _l, _n, _nl);
+            Ok(())
+        }
+    }
+
+    let mut d = crate::algorithms::Replace::new(D);
+    diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
+}
diff --git a/src/algorithms/patience.rs b/src/algorithms/patience.rs
new file mode 100644
index 0000000..5470c41
--- /dev/null
+++ b/src/algorithms/patience.rs
@@ -0,0 +1,234 @@
+//! Patience diff algorithm.
+//!
+//! * time: `O(N log N + M log M + (N+M)D)`
+//! * space: `O(N+M)`
+//!
+//! Tends to give more human-readable outputs. See [Bram Cohen's blog
+//! post](https://bramcohen.livejournal.com/73318.html) describing it.
+use std::collections::hash_map::Entry;
+use std::collections::HashMap;
+use std::hash::Hash;
+use std::ops::{Index, Range};
+
+use crate::algorithms::{myers, DiffHook, Replace};
+
+struct Indexable<'a, T: ?Sized> {
+    p: &'a T,
+    i: usize,
+}
+
+impl<'a, T: Index<usize> + 'a> std::fmt::Debug for Indexable<'a, T>
+where
+    T::Output: std::fmt::Debug,
+{
+    fn fmt(&self, fmt: &mut std::fmt::Formatter) -> std::fmt::Result {
+        write!(fmt, "{:?}", &self.p[self.i])
+    }
+}
+
+impl<'a, 'b, A, B> PartialEq<Indexable<'a, A>> for Indexable<'b, B>
+where
+    A: Index<usize> + 'b + ?Sized,
+    B: Index<usize> + 'b + ?Sized,
+    B::Output: PartialEq<A::Output>,
+{
+    fn eq(&self, b: &Indexable<'a, A>) -> bool {
+        self.p[self.i] == b.p[b.i]
+    }
+}
+
+fn unique<T>(p: &T, e0: usize, e1: usize) -> Vec<Indexable<T>>
+where
+    T: Index<usize> + ?Sized,
+    T::Output: Hash + Eq,
+{
+    let mut aa = HashMap::new();
+    for i in e0..e1 {
+        match aa.entry(&p[i]) {
+            Entry::Vacant(e) => {
+                e.insert(Some(i));
+            }
+            Entry::Occupied(mut e) => {
+                let e = e.get_mut();
+                if e.is_some() {
+                    *e = None
+                }
+            }
+        }
+    }
+    let mut v: Vec<_> = aa
+        .into_iter()
+        .filter_map(|(_, x)| x)
+        .map(|i| Indexable { p, i })
+        .collect();
+    v.sort_by(|a, b| a.i.cmp(&b.i));
+    v
+}
+
+/// Patience diff algorithm.
+///
+/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
+pub fn diff<Old, New, D>(
+    d: &mut D,
+    old: &Old,
+    old_range: Range<usize>,
+    new: &New,
+    new_range: Range<usize>,
+) -> Result<(), D::Error>
+where
+    Old: Index<usize> + ?Sized,
+    New: Index<usize> + ?Sized,
+    Old::Output: Hash + Eq,
+    New::Output: PartialEq<Old::Output> + Hash + Eq,
+    D: DiffHook,
+{
+    let au = unique(old, old_range.start, old_range.end);
+    let bu = unique(new, new_range.start, new_range.end);
+
+    struct Patience<
+        'a,
+        'b,
+        'd,
+        S: 'a + Index<usize> + ?Sized,
+        T: 'b + Index<usize> + ?Sized,
+        D: DiffHook + 'd,
+    > {
+        current_a: usize,
+        current_b: usize,
+        a1: usize,
+        b1: usize,
+        a: &'a S,
+        b: &'b T,
+        d: &'d mut D,
+        au: &'a [Indexable<'a, S>],
+        bu: &'b [Indexable<'b, T>],
+    }
+    impl<
+            'a,
+            'b,
+            'd,
+            S: 'a + Index<usize> + ?Sized,
+            T: 'b + Index<usize> + ?Sized,
+            D: DiffHook + 'd,
+        > DiffHook for Patience<'a, 'b, 'd, S, T, D>
+    where
+        T::Output: PartialEq<S::Output>,
+    {
+        type Error = D::Error;
+        fn equal(&mut self, old: usize, new: usize, len: usize) -> Result<(), D::Error> {
+            for (old, new) in (old..old + len).zip(new..new + len) {
+                let a0 = self.current_a;
+                let b0 = self.current_b;
+                while self.current_a < self.au[old].i
+                    && self.current_b < self.bu[new].i
+                    && self.b[self.current_b] == self.a[self.current_a]
+                {
+                    self.current_a += 1;
+                    self.current_b += 1;
+                }
+                if self.current_a > a0 {
+                    self.d.equal(a0, b0, self.current_a - a0)?
+                }
+                myers::diff_offsets(
+                    self.d,
+                    self.a,
+                    self.current_a,
+                    self.au[old].i,
+                    self.b,
+                    self.current_b,
+                    self.bu[new].i,
+                )?;
+                self.current_a = self.au[old].i;
+                self.current_b = self.bu[new].i;
+            }
+            Ok(())
+        }
+
+        fn finish(&mut self) -> Result<(), D::Error> {
+            myers::diff(
+                self.d,
+                self.a,
+                self.current_a..self.a1,
+                self.b,
+                self.current_b..self.b1,
+            )
+        }
+    }
+    let mut d = Replace::new(Patience {
+        current_a: old_range.start,
+        current_b: new_range.start,
+        a: old,
+        a1: old_range.end,
+        b: new,
+        b1: new_range.end,
+        d,
+        au: &au,
+        bu: &bu,
+    });
+    myers::diff(&mut d, &au, 0..au.len(), &bu, 0..bu.len())?;
+    Ok(())
+}
+
+/// Shortcut for diffing slices.
+pub fn diff_slices<D, T>(d: &mut D, old: &[T], new: &[T]) -> Result<(), D::Error>
+where
+    D: DiffHook,
+    T: Eq + Hash,
+{
+    diff(d, old, 0..old.len(), new, 0..new.len())
+}
+
+#[test]
+fn test_patience() {
+    let a: &[usize] = &[11, 1, 2, 2, 3, 4, 4, 4, 5, 47, 19];
+    let b: &[usize] = &[10, 1, 2, 2, 8, 9, 4, 4, 7, 47, 18];
+
+    struct D(Vec<(usize, usize, usize, usize)>);
+    impl DiffHook for D {
+        type Error = ();
+        fn delete(&mut self, o: usize, len: usize, new: usize) -> Result<(), ()> {
+            self.0.push((o, len, new, 0));
+            Ok(())
+        }
+        fn insert(&mut self, o: usize, n: usize, len: usize) -> Result<(), ()> {
+            self.0.push((o, 0, n, len));
+            Ok(())
+        }
+        fn replace(&mut self, o: usize, l: usize, n: usize, nl: usize) -> Result<(), ()> {
+            self.0.push((o, l, n, nl));
+            Ok(())
+        }
+    }
+    let mut d = Replace::new(D(Vec::new()));
+    diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
+    let d: D = d.into_inner();
+
+    insta::assert_json_snapshot!(&d.0.as_slice(), @r###"
+    [
+      [
+        0,
+        1,
+        0,
+        1
+      ],
+      [
+        4,
+        2,
+        4,
+        2
+      ],
+      [
+        8,
+        1,
+        8,
+        1
+      ],
+      [
+        10,
+        1,
+        10,
+        1
+      ]
+    ]
+    "###);
+}
diff --git a/src/algorithms/replace.rs b/src/algorithms/replace.rs
new file mode 100644
index 0000000..996cb7a
--- /dev/null
+++ b/src/algorithms/replace.rs
@@ -0,0 +1,172 @@
+use crate::algorithms::DiffHook;
+
+/// A [`DiffHook`] that combines deletions and insertions to give blocks
+/// of maximal length, and replacements when appropriate.
+pub struct Replace<D: DiffHook> {
+    d: D,
+    del: Option<(usize, usize, usize)>,
+    ins: Option<(usize, usize, usize)>,
+    eq: Option<(usize, usize, usize)>,
+}
+
+impl<D: DiffHook> Replace<D> {
+    pub fn new(d: D) -> Self {
+        Replace {
+            d,
+            del: None,
+            ins: None,
+            eq: None,
+        }
+    }
+    pub fn into_inner(self) -> D {
+        self.d
+    }
+}
+
+impl<D: DiffHook> AsRef<D> for Replace<D> {
+    fn as_ref(&self) -> &D {
+        &self.d
+    }
+}
+
+impl<D: DiffHook> AsMut<D> for Replace<D> {
+    fn as_mut(&mut self) -> &mut D {
+        &mut self.d
+    }
+}
+
+impl<D: DiffHook> DiffHook for Replace<D> {
+    type Error = D::Error;
+    fn equal(&mut self, old: usize, new: usize, len: usize) -> Result<(), D::Error> {
+        if let Some((old0, len0, new0)) = self.del.take() {
+            if let Some((_, new1, new_len1)) = self.ins.take() {
+                self.d.replace(old0, len0, new1, new_len1)?
+            } else {
+                self.d.delete(old0, len0, new0)?
+            }
+        } else if let Some((old0, new0, new_len0)) = self.ins.take() {
+            self.d.insert(old0, new0, new_len0)?
+        }
+
+        if let Some((a, b, c)) = self.eq.take() {
+            self.eq = Some((a, b, c + len))
+        } else {
+            self.eq = Some((old, new, len))
+        }
+        Ok(())
+    }
+    fn delete(&mut self, old: usize, len: usize, new: usize) -> Result<(), D::Error> {
+        if let Some((a, b, c)) = self.eq.take() {
+            self.d.equal(a, b, c)?
+        }
+        if let Some((old0, len0, new0)) = self.del.take() {
+            assert_eq!(old, old0 + len0);
+            self.del = Some((old0, len0 + len, new0))
+        } else {
+            self.del = Some((old, len, new))
+        }
+        Ok(())
+    }
+
+    fn insert(&mut self, old: usize, new: usize, new_len: usize) -> Result<(), D::Error> {
+        if let Some((a, b, c)) = self.eq.take() {
+            self.d.equal(a, b, c)?
+        }
+        if let Some((old1, new1, new_len1)) = self.ins.take() {
+            assert_eq!(new1 + new_len1, new);
+            self.ins = Some((old1, new1, new_len + new_len1))
+        } else {
+            self.ins = Some((old, new, new_len))
+        }
+        Ok(())
+    }
+
+    fn replace(
+        &mut self,
+        old: usize,
+        old_len: usize,
+        new: usize,
+        new_len: usize,
+    ) -> Result<(), D::Error> {
+        if let Some((a, b, c)) = self.eq.take() {
+            self.d.equal(a, b, c)?
+        }
+        self.d.replace(old, old_len, new, new_len)
+    }
+
+    fn finish(&mut self) -> Result<(), D::Error> {
+        if let Some((a, b, c)) = self.eq.take() {
+            self.d.equal(a, b, c)?
+        }
+        if let Some((old0, len0, new0)) = self.del.take() {
+            if let Some((_, new1, new_len1)) = self.ins.take() {
+                self.d.replace(old0, len0, new1, new_len1)?
+            } else {
+                self.d.delete(old0, len0, new0)?
+            }
+        } else if let Some((old0, new0, new_len0)) = self.ins.take() {
+            self.d.insert(old0, new0, new_len0)?
+        }
+        self.d.finish()
+    }
+}
+
+#[test]
+fn myers() {
+    use crate::algorithms::myers;
+    let a: &[&str] = &[
+        ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n",
+        "a\n",
+        "b\n",
+        "c\n",
+        "================================\n",
+        "d\n",
+        "e\n",
+        "f\n",
+        "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n",
+    ];
+    let b: &[&str] = &[
+        ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n",
+        "x\n",
+        "b\n",
+        "c\n",
+        "================================\n",
+        "y\n",
+        "e\n",
+        "f\n",
+        "<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<\n",
+    ];
+
+    struct D(Vec<String>);
+    impl DiffHook for D {
+        type Error = ();
+        fn equal(&mut self, o: usize, n: usize, len: usize) -> Result<(), ()> {
+            self.0.push(format!("equal {:?} {:?} {:?}", o, n, len));
+            Ok(())
+        }
+        fn delete(&mut self, o: usize, len: usize, new: usize) -> Result<(), ()> {
+            self.0.push(format!("delete {:?} {:?} {:?}", o, len, new));
+            Ok(())
+        }
+        fn insert(&mut self, o: usize, n: usize, len: usize) -> Result<(), ()> {
+            self.0.push(format!("insert {:?} {:?} {:?}", o, n, len));
+            Ok(())
+        }
+        fn replace(&mut self, o: usize, l: usize, n: usize, nl: usize) -> Result<(), ()> {
+            self.0
+                .push(format!("replace {:?} {:?} {:?} {:?}", o, l, n, nl));
+            Ok(())
+        }
+    }
+    let mut d = Replace::new(D(Vec::new()));
+    myers::diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
+
+    insta::assert_yaml_snapshot!(&d.into_inner().0, @r###"
+    ---
+    - equal 0 0 1
+    - replace 1 1 1 1
+    - equal 2 2 3
+    - replace 5 1 5 1
+    - equal 6 6 3
+    "###);
+}
diff --git a/src/lib.rs b/src/lib.rs
new file mode 100644
index 0000000..d7d68f7
--- /dev/null
+++ b/src/lib.rs
@@ -0,0 +1 @@
+pub mod algorithms;