From 74e2805a959c85517579dd88f86a7f04c4118652 Mon Sep 17 00:00:00 2001 From: Armin Ronacher Date: Sat, 20 Feb 2021 10:12:06 +0100 Subject: [PATCH] Added deadlines to all algorithms (#18) This adds a deadline to all algorithms which lets one bail in a fixed amount of time to avoid pathological cases. --- CHANGELOG.md | 2 + src/algorithms/capture.rs | 4 +- src/algorithms/lcs.rs | 93 +++++++++--- src/algorithms/mod.rs | 50 ++++++- src/algorithms/myers.rs | 138 ++++++++++++++++-- src/algorithms/patience.rs | 44 +++++- src/algorithms/replace.rs | 8 +- ...__algorithms__myers__deadline_reached.snap | 22 +++ src/common.rs | 47 +++++- src/lib.rs | 16 ++ src/text/mod.rs | 45 +++++- 11 files changed, 412 insertions(+), 57 deletions(-) create mode 100644 src/algorithms/snapshots/similar__algorithms__myers__deadline_reached.snap diff --git a/CHANGELOG.md b/CHANGELOG.md index 653e8d3..517920d 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,6 +6,8 @@ All notable changes to similar are documented here. * Performance improvements for the LCS algorithm. * Small performance improvments by adding an early opt-out for and inline highlighting. +* Added deadlines to all diffing algorithms to bail early. +* Deprecated slice diffing methods in the individual algorithm modules. 
## 1.2.2 diff --git a/src/algorithms/capture.rs b/src/algorithms/capture.rs index 49375a5..4f4990d 100644 --- a/src/algorithms/capture.rs +++ b/src/algorithms/capture.rs @@ -94,7 +94,7 @@ impl DiffHook for Capture { #[test] fn test_capture_hook_grouping() { - use crate::algorithms::{myers, Replace}; + use crate::algorithms::{diff_slices, Algorithm, Replace}; let rng = (1..100).collect::>(); let mut rng_new = rng.clone(); @@ -104,7 +104,7 @@ fn test_capture_hook_grouping() { rng_new[34] = 1000; let mut d = Replace::new(Capture::new()); - myers::diff_slices(&mut d, &rng, &rng_new).unwrap(); + diff_slices(Algorithm::Myers, &mut d, &rng, &rng_new).unwrap(); let ops = d.into_inner().into_grouped_ops(3); let tags = ops diff --git a/src/algorithms/lcs.rs b/src/algorithms/lcs.rs index 8dd7fe9..206e115 100644 --- a/src/algorithms/lcs.rs +++ b/src/algorithms/lcs.rs @@ -4,12 +4,18 @@ //! * space `O(MN)` use std::collections::BTreeMap; use std::ops::{Index, Range}; +use std::time::Instant; use crate::algorithms::DiffHook; /// Hunt–McIlroy / Hunt–Szymanski LCS diff algorithm. /// /// Diff `old`, between indices `old_range` and `new` between indices `new_range`. +/// +/// This diff is done with an optional deadline that defines the maximal +/// execution time permitted before it bails and falls back to a very bad +/// approximation. Deadlines with LCS do not make a lot of sense and should +/// not be used. pub fn diff( d: &mut D, old: &Old, @@ -17,6 +23,29 @@ pub fn diff( new: &New, new_range: Range, ) -> Result<(), D::Error> +where + Old: Index + ?Sized, + New: Index + ?Sized, + D: DiffHook, + New::Output: PartialEq, +{ + diff_deadline(d, old, old_range, new, new_range, None) +} + +/// Hunt–McIlroy / Hunt–Szymanski LCS diff algorithm. +/// +/// Diff `old`, between indices `old_range` and `new` between indices `new_range`. 
+/// +/// This diff is done with an optional deadline that defines the maximal +/// execution time permitted before it bails and falls back to an approximation. +pub fn diff_deadline( + d: &mut D, + old: &Old, + old_range: Range, + new: &New, + new_range: Range, + deadline: Option, +) -> Result<(), D::Error> where Old: Index + ?Sized, New: Index + ?Sized, @@ -44,11 +73,12 @@ where .take_while(|x| new[x.1] == old[x.0]) .count(); - let table = make_table( + let maybe_table = make_table( old, prefix_len..(old_range.len() - suffix_len), new, prefix_len..(new_range.len() - suffix_len), + deadline, ); let mut old_idx = 0; let mut new_idx = 0; @@ -59,23 +89,30 @@ where d.equal(old_range.start, new_range.start, prefix_len)?; } - while new_idx < new_len && old_idx < old_len { + if let Some(table) = maybe_table { + while new_idx < new_len && old_idx < old_len { + let old_orig_idx = old_range.start + prefix_len + old_idx; + let new_orig_idx = new_range.start + prefix_len + new_idx; + + if new[new_orig_idx] == old[old_orig_idx] { + d.equal(old_orig_idx, new_orig_idx, 1)?; + old_idx += 1; + new_idx += 1; + } else if table.get(&(new_idx, old_idx + 1)).map_or(0, |&x| x) + >= table.get(&(new_idx + 1, old_idx)).map_or(0, |&x| x) + { + d.delete(old_orig_idx, 1, new_orig_idx)?; + old_idx += 1; + } else { + d.insert(old_orig_idx, new_orig_idx, 1)?; + new_idx += 1; + } + } + } else { let old_orig_idx = old_range.start + prefix_len + old_idx; let new_orig_idx = new_range.start + prefix_len + new_idx; - - if new[new_orig_idx] == old[old_orig_idx] { - d.equal(old_orig_idx, new_orig_idx, 1)?; - old_idx += 1; - new_idx += 1; - } else if table.get(&(new_idx, old_idx + 1)).map_or(0, |&x| x) - >= table.get(&(new_idx + 1, old_idx)).map_or(0, |&x| x) - { - d.delete(old_orig_idx, 1, new_orig_idx)?; - old_idx += 1; - } else { - d.insert(old_orig_idx, new_orig_idx, 1)?; - new_idx += 1; - } + d.delete(old_orig_idx, old_len, new_orig_idx)?; + d.insert(old_orig_idx, new_orig_idx, new_len)?; } if 
old_idx < old_len { @@ -107,6 +144,10 @@ where } /// Shortcut for diffing slices. +#[deprecated( + since = "1.4.0", + note = "slice utility function is now only available via similar::algorithms::diff_slices" +)] pub fn diff_slices(d: &mut D, old: &[T], new: &[T]) -> Result<(), D::Error> where D: DiffHook, @@ -120,7 +161,8 @@ fn make_table( old_range: Range, new: &New, new_range: Range, -) -> BTreeMap<(usize, usize), u32> + deadline: Option, +) -> Option> where Old: Index + ?Sized, New: Index + ?Sized, @@ -131,6 +173,13 @@ where let mut table = BTreeMap::new(); for i in (0..new_len).rev() { + // are we running for too long? give up on the table + if let Some(deadline) = deadline { + if Instant::now() > deadline { + return None; + } + } + for j in (0..old_len).rev() { let val = if new[i] == old[j] { table.get(&(i + 1, j + 1)).map_or(0, |&x| x) + 1 @@ -146,12 +195,12 @@ where } } - table + Some(table) } #[test] fn test_table() { - let table = make_table(&vec![2, 3], 0..2, &vec![0, 1, 2], 0..3); + let table = make_table(&vec![2, 3], 0..2, &vec![0, 1, 2], 0..3, None).unwrap(); let expected = { let mut m = BTreeMap::new(); m.insert((1, 0), 1); @@ -168,7 +217,7 @@ fn test_diff() { let b: &[usize] = &[0, 1, 2, 9, 4]; let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); - diff_slices(&mut d, a, b).unwrap(); + diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); insta::assert_debug_snapshot!(d.into_inner().ops()); } @@ -178,7 +227,7 @@ fn test_contiguous() { let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7]; let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); - diff_slices(&mut d, a, b).unwrap(); + diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); insta::assert_debug_snapshot!(d.into_inner().ops()); } @@ -188,6 +237,6 @@ fn test_pat() { let b: &[usize] = &[0, 1, 4, 5, 8, 9]; let mut d = crate::algorithms::Capture::new(); - diff_slices(&mut d, a, b).unwrap(); + diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); 
insta::assert_debug_snapshot!(d.ops()); } diff --git a/src/algorithms/mod.rs b/src/algorithms/mod.rs index 0654142..20136c4 100644 --- a/src/algorithms/mod.rs +++ b/src/algorithms/mod.rs @@ -39,6 +39,7 @@ mod replace; use std::hash::Hash; use std::ops::{Index, Range}; +use std::time::Instant; pub use capture::Capture; pub use hook::{DiffHook, NoFinishHook}; @@ -62,6 +63,34 @@ pub fn diff( new: &New, new_range: Range, ) -> Result<(), D::Error> +where + Old: Index + ?Sized, + New: Index + ?Sized, + D: DiffHook, + Old::Output: Hash + Eq + Ord, + New::Output: PartialEq + Hash + Eq + Ord, +{ + diff_deadline(alg, d, old, old_range, new, new_range, None) +} + +/// Creates a diff between old and new with the given algorithm with deadline. +/// +/// Diffs `old`, between indices `old_range` and `new` between indices `new_range`. +/// +/// This diff is done with an optional deadline that defines the maximal +/// execution time permitted before it bails and falls back to an approximation. +/// Note that not all algorithms behave well if they reach the deadline (LCS +/// for instance produces a very simplistic diff when the deadline is reached +/// in all cases). 
+pub fn diff_deadline( + alg: Algorithm, + d: &mut D, + old: &Old, + old_range: Range, + new: &New, + new_range: Range, + deadline: Option, +) -> Result<(), D::Error> where Old: Index + ?Sized, New: Index + ?Sized, @@ -70,9 +99,9 @@ where New::Output: PartialEq + Hash + Eq + Ord, { match alg { - Algorithm::Myers => myers::diff(d, old, old_range, new, new_range), - Algorithm::Patience => patience::diff(d, old, old_range, new, new_range), - Algorithm::Lcs => lcs::diff(d, old, old_range, new, new_range), + Algorithm::Myers => myers::diff_deadline(d, old, old_range, new, new_range, deadline), + Algorithm::Patience => patience::diff_deadline(d, old, old_range, new, new_range, deadline), + Algorithm::Lcs => lcs::diff_deadline(d, old, old_range, new, new_range, deadline), } } @@ -84,3 +113,18 @@ where { diff(alg, d, old, 0..old.len(), new, 0..new.len()) } + +/// Shortcut for diffing slices with a specific algorithm. +pub fn diff_slices_deadline( + alg: Algorithm, + d: &mut D, + old: &[T], + new: &[T], + deadline: Option, +) -> Result<(), D::Error> +where + D: DiffHook, + T: Eq + Hash + Ord, +{ + diff_deadline(alg, d, old, 0..old.len(), new, 0..new.len(), deadline) +} diff --git a/src/algorithms/myers.rs b/src/algorithms/myers.rs index ef38520..6b3124a 100644 --- a/src/algorithms/myers.rs +++ b/src/algorithms/myers.rs @@ -8,8 +8,19 @@ //! //! The implementation of this algorithm is based on the implementation by //! Brandon Williams. +//! +//! # Heuristics +//! +//! At present this implementation of Myers' does not implement any more advanced +//! heuristics that would solve some pathological cases. For instance, passing two +//! large and completely distinct sequences to the algorithm will make it spin +//! without making reasonable progress. Currently the only protection in the +//! library against this is to pass a deadline to the diffing algorithm. +//! +//! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15). 
use std::ops::{Index, IndexMut, Range}; +use std::time::Instant; use crate::algorithms::DiffHook; @@ -23,6 +34,29 @@ pub fn diff( new: &New, new_range: Range, ) -> Result<(), D::Error> +where + Old: Index + ?Sized, + New: Index + ?Sized, + D: DiffHook, + New::Output: PartialEq, +{ + diff_deadline(d, old, old_range, new, new_range, None) +} + +/// Myers' diff algorithm with deadline. +/// +/// Diff `old`, between indices `old_range` and `new` between indices `new_range`. +/// +/// This diff is done with an optional deadline that defines the maximal +/// execution time permitted before it bails and falls back to an approximation. +pub fn diff_deadline( + d: &mut D, + old: &Old, + old_range: Range, + new: &New, + new_range: Range, + deadline: Option, +) -> Result<(), D::Error> where Old: Index + ?Sized, New: Index + ?Sized, @@ -32,11 +66,17 @@ where let max_d = max_d(old_range.len(), new_range.len()); let mut vf = V::new(max_d); let mut vb = V::new(max_d); - conquer(d, old, old_range, new, new_range, &mut vf, &mut vb)?; + conquer( + d, old, old_range, new, new_range, &mut vf, &mut vb, deadline, + )?; d.finish() } /// Shortcut for diffing slices. +#[deprecated( + since = "1.4.0", + note = "slice utility function is now only available via similar::algorithms::diff_slices" +)] pub fn diff_slices(d: &mut D, old: &[T], new: &[T]) -> Result<(), D::Error> where D: DiffHook, @@ -172,7 +212,8 @@ fn find_middle_snake( new_range: Range, vf: &mut V, vb: &mut V, -) -> Snake + deadline: Option, +) -> Option where Old: Index + ?Sized, New: Index + ?Sized, @@ -197,6 +238,13 @@ where assert!(vb.len() >= d_max); for d in 0..d_max as isize { + // are we running for too long? 
+ if let Some(deadline) = deadline { + if Instant::now() > deadline { + break; + } + } + // Forward path for k in (-d..=d).rev().step_by(2) { let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) { @@ -230,10 +278,10 @@ where // TODO optimize this so we don't have to compare against n if vf[k] + vb[-(k - delta)] >= n { // Return the snake - return Snake { + return Some(Snake { x_start: x0 + old_range.start, y_start: y0 + new_range.start, - }; + }); } } } @@ -266,10 +314,10 @@ where // TODO optimize this so we don't have to compare against n if vb[k] + vf[-(k - delta)] >= n { // Return the snake - return Snake { + return Some(Snake { x_start: n - x + old_range.start, y_start: m - y + new_range.start, - }; + }); } } } @@ -277,9 +325,11 @@ where // TODO: Maybe there's an opportunity to optimize and bail early? } - unreachable!("unable to find a middle snake"); + // deadline reached + None } +#[allow(clippy::too_many_arguments)] fn conquer( d: &mut D, old: &Old, @@ -288,6 +338,7 @@ fn conquer( mut new_range: Range, vf: &mut V, vb: &mut V, + deadline: Option, ) -> Result<(), D::Error> where Old: Index + ?Sized, @@ -326,12 +377,30 @@ where new_range.start, new_range.end - new_range.start, )?; - } else { - let snake = find_middle_snake(old, old_range.clone(), new, new_range.clone(), vf, vb); + } else if let Some(snake) = find_middle_snake( + old, + old_range.clone(), + new, + new_range.clone(), + vf, + vb, + deadline, + ) { let (old_a, old_b) = split_at(old_range, snake.x_start); let (new_a, new_b) = split_at(new_range, snake.y_start); - conquer(d, old, old_a, new, new_a, vf, vb)?; - conquer(d, old, old_b, new, new_b, vf, vb)?; + conquer(d, old, old_a, new, new_a, vf, vb, deadline)?; + conquer(d, old, old_b, new, new_b, vf, vb, deadline)?; + } else { + d.delete( + old_range.start, + old_range.end - old_range.start, + new_range.start, + )?; + d.insert( + old_range.start, + new_range.start, + new_range.end - new_range.start, + )?; } if common_suffix_len > 0 { @@ 
-348,7 +417,7 @@ fn test_find_middle_snake() { let max_d = max_d(a.len(), b.len()); let mut vf = V::new(max_d); let mut vb = V::new(max_d); - let snake = find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb); + let snake = find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb, None).unwrap(); assert_eq!(snake.x_start, 4); assert_eq!(snake.y_start, 1); } @@ -359,7 +428,7 @@ fn test_diff() { let b: &[usize] = &[0, 1, 2, 9, 4]; let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); - diff_slices(&mut d, a, b).unwrap(); + diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); insta::assert_debug_snapshot!(d.into_inner().ops()); } @@ -369,7 +438,7 @@ fn test_contiguous() { let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7]; let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); - diff_slices(&mut d, a, b).unwrap(); + diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); insta::assert_debug_snapshot!(d.into_inner().ops()); } @@ -379,6 +448,45 @@ fn test_pat() { let b: &[usize] = &[0, 1, 4, 5, 8, 9]; let mut d = crate::algorithms::Capture::new(); - diff_slices(&mut d, a, b).unwrap(); + diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); insta::assert_debug_snapshot!(d.ops()); } + +#[test] +fn test_deadline_reached() { + use std::ops::Index; + use std::time::Duration; + + let a = (0..100).collect::>(); + let mut b = (0..100).collect::>(); + b[10] = 99; + b[50] = 99; + b[25] = 99; + + struct SlowIndex<'a>(&'a [usize]); + + impl<'a> Index for SlowIndex<'a> { + type Output = usize; + + fn index(&self, index: usize) -> &Self::Output { + std::thread::sleep(Duration::from_millis(1)); + &self.0[index] + } + } + + let slow_a = SlowIndex(&a); + let slow_b = SlowIndex(&b); + + // don't give it enough time to do anything interesting + let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); + diff_deadline( + &mut d, + &slow_a, + 0..a.len(), + &slow_b, + 0..b.len(), + Some(Instant::now() + 
Duration::from_millis(50)), + ) + .unwrap(); + insta::assert_debug_snapshot!(d.into_inner().ops()); +} diff --git a/src/algorithms/patience.rs b/src/algorithms/patience.rs index a442316..c2f6e1a 100644 --- a/src/algorithms/patience.rs +++ b/src/algorithms/patience.rs @@ -12,6 +12,7 @@ use std::collections::hash_map::Entry; use std::collections::HashMap; use std::hash::Hash; use std::ops::{Index, Range}; +use std::time::Instant; use crate::algorithms::{myers, DiffHook, NoFinishHook, Replace}; @@ -25,6 +26,30 @@ pub fn diff( new: &New, new_range: Range, ) -> Result<(), D::Error> +where + Old: Index + ?Sized, + New: Index + ?Sized, + Old::Output: Hash + Eq, + New::Output: PartialEq + Hash + Eq, + D: DiffHook, +{ + diff_deadline(d, old, old_range, new, new_range, None) +} + +/// Patience diff algorithm with deadline. +/// +/// Diff `old`, between indices `old_range` and `new` between indices `new_range`. +/// +/// This diff is done with an optional deadline that defines the maximal +/// execution time permitted before it bails and falls back to an approximation. +pub fn diff_deadline( + d: &mut D, + old: &Old, + old_range: Range, + new: &New, + new_range: Range, + deadline: Option, +) -> Result<(), D::Error> where Old: Index + ?Sized, New: Index + ?Sized, @@ -45,18 +70,24 @@ where new_current: new_range.start, new_end: new_range.end, new_indexes: &new_indexes, + deadline, }); - myers::diff( + myers::diff_deadline( &mut d, &old_indexes, 0..old_indexes.len(), &new_indexes, 0..new_indexes.len(), + deadline, )?; Ok(()) } /// Shortcut for diffing slices. 
+#[deprecated( + since = "1.4.0", + note = "slice utility function is now only available via similar::algorithms::diff_slices" +)] pub fn diff_slices(d: &mut D, old: &[T], new: &[T]) -> Result<(), D::Error> where D: DiffHook, @@ -128,6 +159,7 @@ struct Patience<'old, 'new, 'd, Old: ?Sized, New: ?Sized, D> { new_current: usize, new_end: usize, new_indexes: &'new [Indexable<'new, New>], + deadline: Option, } impl<'old, 'new, 'd, Old, New, D> DiffHook for Patience<'old, 'new, 'd, Old, New, D> @@ -153,12 +185,13 @@ where self.d.equal(a0, b0, self.old_current - a0)?; } let mut no_finish_d = NoFinishHook::new(&mut self.d); - myers::diff( + myers::diff_deadline( &mut no_finish_d, self.old, self.old_current..self.old_indexes[old].index, self.new, self.new_current..self.new_indexes[new].index, + self.deadline, )?; self.old_current = self.old_indexes[old].index; self.new_current = self.new_indexes[new].index; @@ -167,12 +200,13 @@ where } fn finish(&mut self) -> Result<(), D::Error> { - myers::diff( + myers::diff_deadline( self.d, self.old, self.old_current..self.old_end, self.new, self.new_current..self.new_end, + self.deadline, ) } } @@ -183,7 +217,7 @@ fn test_patience() { let b: &[usize] = &[10, 1, 2, 2, 8, 9, 4, 4, 7, 47, 18]; let mut d = Replace::new(crate::algorithms::Capture::new()); - diff_slices(&mut d, a, b).unwrap(); + diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); insta::assert_debug_snapshot!(d.into_inner().ops()); } @@ -195,7 +229,7 @@ fn test_patience_out_of_bounds_bug() { let b: &[usize] = &[1, 2, 3]; let mut d = Replace::new(crate::algorithms::Capture::new()); - diff_slices(&mut d, a, b).unwrap(); + diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap(); insta::assert_debug_snapshot!(d.into_inner().ops()); } diff --git a/src/algorithms/replace.rs b/src/algorithms/replace.rs index 9e70047..309ba05 100644 --- a/src/algorithms/replace.rs +++ b/src/algorithms/replace.rs @@ -134,7 +134,7 @@ impl DiffHook for Replace { #[test] fn test_mayers_replace() { - use 
crate::algorithms::myers; + use crate::algorithms::{diff_slices, Algorithm}; let a: &[&str] = &[ ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n", "a\n", @@ -159,7 +159,7 @@ fn test_mayers_replace() { ]; let mut d = Replace::new(crate::algorithms::Capture::new()); - myers::diff_slices(&mut d, a, b).unwrap(); + diff_slices(Algorithm::Myers, &mut d, a, b).unwrap(); insta::assert_debug_snapshot!(&d.into_inner().ops(), @r###" [ @@ -196,11 +196,13 @@ fn test_mayers_replace() { #[test] fn test_replace() { + use crate::algorithms::{diff_slices, Algorithm}; + let a: &[usize] = &[0, 1, 2, 3, 4]; let b: &[usize] = &[0, 1, 2, 7, 8, 9]; let mut d = Replace::new(crate::algorithms::Capture::new()); - crate::algorithms::myers::diff_slices(&mut d, a, b).unwrap(); + diff_slices(Algorithm::Myers, &mut d, a, b).unwrap(); insta::assert_debug_snapshot!(d.into_inner().ops(), @r###" [ Equal { diff --git a/src/algorithms/snapshots/similar__algorithms__myers__deadline_reached.snap b/src/algorithms/snapshots/similar__algorithms__myers__deadline_reached.snap new file mode 100644 index 0000000..f972cca --- /dev/null +++ b/src/algorithms/snapshots/similar__algorithms__myers__deadline_reached.snap @@ -0,0 +1,22 @@ +--- +source: src/algorithms/myers.rs +expression: d.into_inner().ops() +--- +[ + Equal { + old_index: 0, + new_index: 0, + len: 10, + }, + Replace { + old_index: 10, + old_len: 41, + new_index: 10, + new_len: 41, + }, + Equal { + old_index: 51, + new_index: 51, + len: 49, + }, +] diff --git a/src/common.rs b/src/common.rs index 0fb052c..6e4d6a4 100644 --- a/src/common.rs +++ b/src/common.rs @@ -1,13 +1,15 @@ use std::hash::Hash; use std::ops::{Index, Range}; +use std::time::Instant; -use crate::algorithms::{diff, diff_slices, Capture, Replace}; +use crate::algorithms::{diff_deadline, diff_slices_deadline, Capture, Replace}; use crate::{Algorithm, DiffOp}; /// Creates a diff between old and new with the given algorithm capturing the ops. 
/// -/// This is like [`diff`] but instead of using an arbitrary hook this will -/// always use [`Replace`] + [`Capture`] and return the captured [`DiffOp`]s. +/// This is like [`diff`](crate::algorithms::diff) but instead of using an +/// arbitrary hook this will always use [`Replace`] + [`Capture`] and return the +/// captured [`DiffOp`]s. pub fn capture_diff( alg: Algorithm, old: &Old, @@ -15,6 +17,26 @@ pub fn capture_diff( new: &New, new_range: Range, ) -> Vec +where + Old: Index + ?Sized, + New: Index + ?Sized, + Old::Output: Hash + Eq + Ord, + New::Output: PartialEq + Hash + Eq + Ord, +{ + capture_diff_deadline(alg, old, old_range, new, new_range, None) +} + +/// Creates a diff between old and new with the given algorithm capturing the ops. +/// +/// Works like [`capture_diff`] but with an optional deadline. +pub fn capture_diff_deadline( + alg: Algorithm, + old: &Old, + old_range: Range, + new: &New, + new_range: Range, + deadline: Option, +) -> Vec where Old: Index + ?Sized, New: Index + ?Sized, @@ -22,17 +44,32 @@ where New::Output: PartialEq + Hash + Eq + Ord, { let mut d = Replace::new(Capture::new()); - diff(alg, &mut d, old, old_range, new, new_range).unwrap(); + diff_deadline(alg, &mut d, old, old_range, new, new_range, deadline).unwrap(); d.into_inner().into_ops() } /// Creates a diff between old and new with the given algorithm capturing the ops. pub fn capture_diff_slices(alg: Algorithm, old: &[T], new: &[T]) -> Vec +where + T: Eq + Hash + Ord, +{ + capture_diff_slices_deadline(alg, old, new, None) +} + +/// Creates a diff between old and new with the given algorithm capturing the ops. +/// +/// Works like [`capture_diff_slices`] but with an optional deadline. 
+pub fn capture_diff_slices_deadline( + alg: Algorithm, + old: &[T], + new: &[T], + deadline: Option, +) -> Vec where T: Eq + Hash + Ord, { let mut d = Replace::new(Capture::new()); - diff_slices(alg, &mut d, old, new).unwrap(); + diff_slices_deadline(alg, &mut d, old, new, deadline).unwrap(); d.into_inner().into_ops() } diff --git a/src/lib.rs b/src/lib.rs index 69257dd..57b63c2 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -106,6 +106,22 @@ //! As the [`TextDiff::grouped_ops`] method can isolate clusters of changes //! this even works for very long files if paired with this method. //! +//! # Deadlines and Performance +//! +//! For large and very distinct inputs the algorithms as implemented can take +//! a very, very long time to execute. Too long to make sense in practice. +//! To work around this issue all diffing algorithms also provide a version +//! that accepts a deadline which is the point in time as defined by an +//! [`Instant`](std::time::Instant) after which the algorithm should give up. +//! What giving up means depends on the algorithm. For instance due to the +//! recursive, divide and conquer nature of Myers' diff you will still get a +//! pretty decent diff in many cases when a deadline is reached. Whereas on the +//! other hand the LCS diff is unlikely to give any decent results in such a +//! situation. +//! +//! The [`TextDiff`] type also lets you configure a deadline and/or timeout +//! when performing a text diff. +//! //! # Feature Flags //! //! 
The crate by default does not have any dependencies however for some use diff --git a/src/text/mod.rs b/src/text/mod.rs index 6ff7c36..dd9cf13 100644 --- a/src/text/mod.rs +++ b/src/text/mod.rs @@ -2,6 +2,7 @@ use std::borrow::Cow; use std::cmp::Reverse; use std::collections::BinaryHeap; +use std::time::{Duration, Instant}; mod abstraction; #[cfg(feature = "inline")] @@ -15,7 +16,22 @@ pub use self::inline::InlineChange; use self::utils::{upper_seq_ratio, QuickSeqRatio}; use crate::iter::{AllChangesIter, ChangesIter}; use crate::udiff::UnifiedDiff; -use crate::{capture_diff_slices, get_diff_ratio, group_diff_ops, Algorithm, DiffOp}; +use crate::{capture_diff_slices_deadline, get_diff_ratio, group_diff_ops, Algorithm, DiffOp}; + +#[derive(Debug, Clone, Copy)] +enum Deadline { + Absolute(Instant), + Relative(Duration), +} + +impl Deadline { + fn into_instant(self) -> Instant { + match self { + Deadline::Absolute(instant) => instant, + Deadline::Relative(duration) => Instant::now() + duration, + } + } +} /// A builder type config for more complex uses of [`TextDiff`]. /// @@ -24,6 +40,7 @@ use crate::{capture_diff_slices, get_diff_ratio, group_diff_ops, Algorithm, Diff pub struct TextDiffConfig { algorithm: Algorithm, newline_terminated: Option, + deadline: Option, } impl Default for TextDiffConfig { @@ -31,6 +48,7 @@ impl Default for TextDiffConfig { TextDiffConfig { algorithm: Algorithm::default(), newline_terminated: None, + deadline: None, } } } @@ -44,6 +62,24 @@ impl TextDiffConfig { self } + /// Sets a deadline for the diff operation. + /// + /// By default a diff will take as long as it takes. For certain diff + /// algorithms like Myers' and Patience a maximum running time can be + /// defined after which the algorithm gives up and approximates. + pub fn deadline(&mut self, deadline: Instant) -> &mut Self { + self.deadline = Some(Deadline::Absolute(deadline)); + self + } + + /// Sets a timeout for the diff operation. 
+ /// + /// This is like [`deadline`](Self::deadline) but accepts a duration. + pub fn timeout(&mut self, timeout: Duration) -> &mut Self { + self.deadline = Some(Deadline::Relative(timeout)); + self + } + /// Changes the newline termination flag. /// /// The default is automatic based on input. This flag controls the @@ -291,7 +327,12 @@ impl TextDiffConfig { new: Cow<'bufs, [&'new T]>, newline_terminated: bool, ) -> TextDiff<'old, 'new, 'bufs, T> { - let ops = capture_diff_slices(self.algorithm, &old, &new); + let ops = capture_diff_slices_deadline( + self.algorithm, + &old, + &new, + self.deadline.map(|x| x.into_instant()), + ); TextDiff { old, new,