//! Myers' diff algorithm. //! //! * time: `O((N+M)D)` //! * space `O(N+M)` //! //! See [the original article by Eugene W. Myers](http://www.xmailserver.org/diff2.pdf) //! describing it. //! //! The implementation of this algorithm is based on the implementation by //! Brandon Williams. use std::ops::{Index, IndexMut, Range}; use crate::algorithms::DiffHook; /// Myers' diff algorithm. /// /// Diff `old`, between indices `old_range` and `new` between indices `new_range`. pub fn diff( d: &mut D, old: &Old, old_range: Range, new: &New, new_range: Range, ) -> Result<(), D::Error> where Old: Index + ?Sized, New: Index + ?Sized, D: DiffHook, New::Output: PartialEq, { let max_d = max_d(old_range.len(), new_range.len()); let mut vf = V::new(max_d); let mut vb = V::new(max_d); conquer(d, old, old_range, new, new_range, &mut vf, &mut vb)?; d.finish() } /// Shortcut for diffing slices. pub fn diff_slices(d: &mut D, old: &[T], new: &[T]) -> Result<(), D::Error> where D: DiffHook, T: PartialEq, { diff(d, old, 0..old.len(), new, 0..new.len()) } // A D-path is a path which starts at (0,0) that has exactly D non-diagonal // edges. All D-paths consist of a (D - 1)-path followed by a non-diagonal edge // and then a possibly empty sequence of diagonal edges called a snake. /// `V` contains the endpoints of the furthest reaching `D-paths`. For each /// recorded endpoint `(x,y)` in diagonal `k`, we only need to retain `x` because /// `y` can be computed from `x - k`. In other words, `V` is an array of integers /// where `V[k]` contains the row index of the endpoint of the furthest reaching /// path in diagonal `k`. /// /// We can't use a traditional Vec to represent `V` since we use `k` as an index /// and it can take on negative values. So instead `V` is represented as a /// light-weight wrapper around a Vec plus an `offset` which is the maximum value /// `k` can take on in order to map negative `k`'s back to a value >= 0. #[derive(Debug)] struct V { offset: isize, v: Vec, // Look into initializing this to -1 and storing isize } impl V { fn new(max_d: usize) -> Self { Self { offset: max_d as isize, v: vec![0; 2 * max_d], } } fn len(&self) -> usize { self.v.len() } } impl Index for V { type Output = usize; fn index(&self, index: isize) -> &Self::Output { &self.v[(index + self.offset) as usize] } } impl IndexMut for V { fn index_mut(&mut self, index: isize) -> &mut Self::Output { &mut self.v[(index + self.offset) as usize] } } /// A `Snake` is a sequence of diagonal edges in the edit graph. Normally /// a snake has a start end end point (and it is possible for a snake to have /// a length of zero, meaning the start and end points are the same) however /// we do not need the end point which is why it's not implemented here. #[derive(Debug)] struct Snake { x_start: usize, y_start: usize, } fn max_d(len1: usize, len2: usize) -> usize { // XXX look into reducing the need to have the additional '+ 1' (len1 + len2 + 1) / 2 + 1 } fn common_prefix_len( old: &Old, old_range: Range, new: &New, new_range: Range, ) -> usize where Old: Index + ?Sized, New: Index + ?Sized, New::Output: PartialEq, { if old_range.is_empty() || new_range.is_empty() { return 0; } new_range .zip(old_range) .take_while( #[inline(always)] |x| new[x.0] == old[x.1], ) .count() } fn common_suffix_len( old: &Old, old_range: Range, new: &New, new_range: Range, ) -> usize where Old: Index + ?Sized, New: Index + ?Sized, New::Output: PartialEq, { if old_range.is_empty() || new_range.is_empty() { return 0; } new_range .rev() .zip(old_range.rev()) .take_while( #[inline(always)] |x| new[x.0] == old[x.1], ) .count() } #[inline(always)] fn split_at(range: Range, at: usize) -> (Range, Range) { (range.start..at, at..range.end) } // The divide part of a divide-and-conquer strategy. A D-path has D+1 snakes // some of which may be empty. The divide step requires finding the ceil(D/2) + // 1 or middle snake of an optimal D-path. The idea for doing so is to // simultaneously run the basic algorithm in both the forward and reverse // directions until furthest reaching forward and reverse paths starting at // opposing corners 'overlap'. fn find_middle_snake( old: &Old, old_range: Range, new: &New, new_range: Range, vf: &mut V, vb: &mut V, ) -> Snake where Old: Index + ?Sized, New: Index + ?Sized, New::Output: PartialEq, { let n = old_range.len(); let m = new_range.len(); // By Lemma 1 in the paper, the optimal edit script length is odd or even as // `delta` is odd or even. let delta = n as isize - m as isize; let odd = delta & 1 == 1; // The initial point at (0, -1) vf[1] = 0; // The initial point at (N, M+1) vb[1] = 0; // We only need to explore ceil(D/2) + 1 let d_max = max_d(n, m); assert!(vf.len() >= d_max); assert!(vb.len() >= d_max); for d in 0..d_max as isize { // Forward path for k in (-d..=d).rev().step_by(2) { let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) { vf[k + 1] } else { vf[k - 1] + 1 }; let y = (x as isize - k) as usize; // The coordinate of the start of a snake let (x0, y0) = (x, y); // While these sequences are identical, keep moving through the // graph with no cost if x < old_range.len() && y < new_range.len() { let advance = common_prefix_len( old, old_range.start + x..old_range.end, new, new_range.start + y..new_range.end, ); x += advance; } // This is the new best x value vf[k] = x; // Only check for connections from the forward search when N - M is // odd and when there is a reciprocal k line coming from the other // direction. if odd && (k - delta).abs() <= (d - 1) { // TODO optimize this so we don't have to compare against n if vf[k] + vb[-(k - delta)] >= n { // Return the snake return Snake { x_start: x0 + old_range.start, y_start: y0 + new_range.start, }; } } } // Backward path for k in (-d..=d).rev().step_by(2) { let mut x = if k == -d || (k != d && vb[k - 1] < vb[k + 1]) { vb[k + 1] } else { vb[k - 1] + 1 }; let mut y = (x as isize - k) as usize; // The coordinate of the start of a snake if x < n && y < m { let advance = common_suffix_len( old, old_range.start..old_range.start + n - x, new, new_range.start..new_range.start + m - y, ); x += advance; y += advance; } // This is the new best x value vb[k] = x; if !odd && (k - delta).abs() <= d { // TODO optimize this so we don't have to compare against n if vb[k] + vf[-(k - delta)] >= n { // Return the snake return Snake { x_start: n - x + old_range.start, y_start: m - y + new_range.start, }; } } } // TODO: Maybe there's an opportunity to optimize and bail early? } unreachable!("unable to find a middle snake"); } fn conquer( d: &mut D, old: &Old, mut old_range: Range, new: &New, mut new_range: Range, vf: &mut V, vb: &mut V, ) -> Result<(), D::Error> where Old: Index + ?Sized, New: Index + ?Sized, D: DiffHook, New::Output: PartialEq, { // Check for common prefix let common_prefix_len = common_prefix_len(old, old_range.clone(), new, new_range.clone()); if common_prefix_len > 0 { d.equal(old_range.start, new_range.start, common_prefix_len)?; } old_range.start += common_prefix_len; new_range.start += common_prefix_len; // Check for common suffix let common_suffix_len = common_suffix_len(old, old_range.clone(), new, new_range.clone()); let common_suffix = ( old_range.end - common_suffix_len, new_range.end - common_suffix_len, ); old_range.end -= common_suffix_len; new_range.end -= common_suffix_len; if old_range.is_empty() && new_range.is_empty() { // Do nothing } else if new_range.is_empty() { d.delete( old_range.start, old_range.end - old_range.start, new_range.start, )?; } else if old_range.is_empty() { d.insert( old_range.start, new_range.start, new_range.end - new_range.start, )?; } else { let snake = find_middle_snake(old, old_range.clone(), new, new_range.clone(), vf, vb); let (old_a, old_b) = split_at(old_range, snake.x_start); let (new_a, new_b) = split_at(new_range, snake.y_start); conquer(d, old, old_a, new, new_a, vf, vb)?; conquer(d, old, old_b, new, new_b, vf, vb)?; } if common_suffix_len > 0 { d.equal(common_suffix.0, common_suffix.1, common_suffix_len)?; } Ok(()) } #[test] fn test_find_middle_snake() { let a = &b"ABCABBA"[..]; let b = &b"CBABAC"[..]; let max_d = max_d(a.len(), b.len()); let mut vf = V::new(max_d); let mut vb = V::new(max_d); let snake = find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb); assert_eq!(snake.x_start, 4); assert_eq!(snake.y_start, 1); } #[test] fn test_diff() { let a: &[usize] = &[0, 1, 2, 3, 4]; let b: &[usize] = &[0, 1, 2, 9, 4]; let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); diff_slices(&mut d, a, b).unwrap(); insta::assert_debug_snapshot!(d.into_inner().ops()); } #[test] fn test_contiguous() { let a: &[usize] = &[0, 1, 2, 3, 4, 4, 4, 5]; let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7]; let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); diff_slices(&mut d, a, b).unwrap(); insta::assert_debug_snapshot!(d.into_inner().ops()); } #[test] fn test_pat() { let a: &[usize] = &[0, 1, 3, 4, 5]; let b: &[usize] = &[0, 1, 4, 5, 8, 9]; let mut d = crate::algorithms::Capture::new(); diff_slices(&mut d, a, b).unwrap(); insta::assert_debug_snapshot!(d.ops()); }