Added deadlines to all algorithms (#18)

This adds a deadline to all algorithms which lets one bail in a fixed amount of
time to avoid pathological cases.
This commit is contained in:
Armin Ronacher 2021-02-20 10:12:06 +01:00 committed by GitHub
parent 99386e8106
commit 74e2805a95
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
11 changed files with 412 additions and 57 deletions

View file

@ -6,6 +6,8 @@ All notable changes to similar are documented here.
* Performance improvements for the LCS algorithm. * Performance improvements for the LCS algorithm.
* Small performance improvements by adding an early opt-out for and inline highlighting. * Small performance improvements by adding an early opt-out for and inline highlighting.
* Added deadlines to all diffing algorithms to bail early.
* Deprecated slice diffing methods in the individual algorithm modules.
## 1.2.2 ## 1.2.2

View file

@ -94,7 +94,7 @@ impl DiffHook for Capture {
#[test] #[test]
fn test_capture_hook_grouping() { fn test_capture_hook_grouping() {
use crate::algorithms::{myers, Replace}; use crate::algorithms::{diff_slices, Algorithm, Replace};
let rng = (1..100).collect::<Vec<_>>(); let rng = (1..100).collect::<Vec<_>>();
let mut rng_new = rng.clone(); let mut rng_new = rng.clone();
@ -104,7 +104,7 @@ fn test_capture_hook_grouping() {
rng_new[34] = 1000; rng_new[34] = 1000;
let mut d = Replace::new(Capture::new()); let mut d = Replace::new(Capture::new());
myers::diff_slices(&mut d, &rng, &rng_new).unwrap(); diff_slices(Algorithm::Myers, &mut d, &rng, &rng_new).unwrap();
let ops = d.into_inner().into_grouped_ops(3); let ops = d.into_inner().into_grouped_ops(3);
let tags = ops let tags = ops

View file

@ -4,12 +4,18 @@
//! * space `O(MN)` //! * space `O(MN)`
use std::collections::BTreeMap; use std::collections::BTreeMap;
use std::ops::{Index, Range}; use std::ops::{Index, Range};
use std::time::Instant;
use crate::algorithms::DiffHook; use crate::algorithms::DiffHook;
/// HuntMcIlroy / HuntSzymanski LCS diff algorithm. /// HuntMcIlroy / HuntSzymanski LCS diff algorithm.
/// ///
/// Diff `old`, between indices `old_range` and `new` between indices `new_range`. /// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
///
/// This diff is done with an optional deadline that defines the maximal
/// execution time permitted before it bails and falls back to a very bad
/// approximation. Deadlines with LCS do not make a lot of sense and should
/// not be used.
pub fn diff<Old, New, D>( pub fn diff<Old, New, D>(
d: &mut D, d: &mut D,
old: &Old, old: &Old,
@ -17,6 +23,29 @@ pub fn diff<Old, New, D>(
new: &New, new: &New,
new_range: Range<usize>, new_range: Range<usize>,
) -> Result<(), D::Error> ) -> Result<(), D::Error>
where
Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized,
D: DiffHook,
New::Output: PartialEq<Old::Output>,
{
diff_deadline(d, old, old_range, new, new_range, None)
}
/// HuntMcIlroy / HuntSzymanski LCS diff algorithm.
///
/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
///
/// This diff is done with an optional deadline that defines the maximal
/// execution time permitted before it bails and falls back to an approximation.
pub fn diff_deadline<Old, New, D>(
d: &mut D,
old: &Old,
old_range: Range<usize>,
new: &New,
new_range: Range<usize>,
deadline: Option<Instant>,
) -> Result<(), D::Error>
where where
Old: Index<usize> + ?Sized, Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized, New: Index<usize> + ?Sized,
@ -44,11 +73,12 @@ where
.take_while(|x| new[x.1] == old[x.0]) .take_while(|x| new[x.1] == old[x.0])
.count(); .count();
let table = make_table( let maybe_table = make_table(
old, old,
prefix_len..(old_range.len() - suffix_len), prefix_len..(old_range.len() - suffix_len),
new, new,
prefix_len..(new_range.len() - suffix_len), prefix_len..(new_range.len() - suffix_len),
deadline,
); );
let mut old_idx = 0; let mut old_idx = 0;
let mut new_idx = 0; let mut new_idx = 0;
@ -59,23 +89,30 @@ where
d.equal(old_range.start, new_range.start, prefix_len)?; d.equal(old_range.start, new_range.start, prefix_len)?;
} }
while new_idx < new_len && old_idx < old_len { if let Some(table) = maybe_table {
while new_idx < new_len && old_idx < old_len {
let old_orig_idx = old_range.start + prefix_len + old_idx;
let new_orig_idx = new_range.start + prefix_len + new_idx;
if new[new_orig_idx] == old[old_orig_idx] {
d.equal(old_orig_idx, new_orig_idx, 1)?;
old_idx += 1;
new_idx += 1;
} else if table.get(&(new_idx, old_idx + 1)).map_or(0, |&x| x)
>= table.get(&(new_idx + 1, old_idx)).map_or(0, |&x| x)
{
d.delete(old_orig_idx, 1, new_orig_idx)?;
old_idx += 1;
} else {
d.insert(old_orig_idx, new_orig_idx, 1)?;
new_idx += 1;
}
}
} else {
let old_orig_idx = old_range.start + prefix_len + old_idx; let old_orig_idx = old_range.start + prefix_len + old_idx;
let new_orig_idx = new_range.start + prefix_len + new_idx; let new_orig_idx = new_range.start + prefix_len + new_idx;
d.delete(old_orig_idx, old_len, new_orig_idx)?;
if new[new_orig_idx] == old[old_orig_idx] { d.insert(old_orig_idx, new_orig_idx, new_len)?;
d.equal(old_orig_idx, new_orig_idx, 1)?;
old_idx += 1;
new_idx += 1;
} else if table.get(&(new_idx, old_idx + 1)).map_or(0, |&x| x)
>= table.get(&(new_idx + 1, old_idx)).map_or(0, |&x| x)
{
d.delete(old_orig_idx, 1, new_orig_idx)?;
old_idx += 1;
} else {
d.insert(old_orig_idx, new_orig_idx, 1)?;
new_idx += 1;
}
} }
if old_idx < old_len { if old_idx < old_len {
@ -107,6 +144,10 @@ where
} }
/// Shortcut for diffing slices. /// Shortcut for diffing slices.
#[deprecated(
since = "1.4.0",
note = "slice utility function is now only available via similar::algorithms::diff_slices"
)]
pub fn diff_slices<D, T>(d: &mut D, old: &[T], new: &[T]) -> Result<(), D::Error> pub fn diff_slices<D, T>(d: &mut D, old: &[T], new: &[T]) -> Result<(), D::Error>
where where
D: DiffHook, D: DiffHook,
@ -120,7 +161,8 @@ fn make_table<Old, New>(
old_range: Range<usize>, old_range: Range<usize>,
new: &New, new: &New,
new_range: Range<usize>, new_range: Range<usize>,
) -> BTreeMap<(usize, usize), u32> deadline: Option<Instant>,
) -> Option<BTreeMap<(usize, usize), u32>>
where where
Old: Index<usize> + ?Sized, Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized, New: Index<usize> + ?Sized,
@ -131,6 +173,13 @@ where
let mut table = BTreeMap::new(); let mut table = BTreeMap::new();
for i in (0..new_len).rev() { for i in (0..new_len).rev() {
// are we running for too long? give up on the table
if let Some(deadline) = deadline {
if Instant::now() > deadline {
return None;
}
}
for j in (0..old_len).rev() { for j in (0..old_len).rev() {
let val = if new[i] == old[j] { let val = if new[i] == old[j] {
table.get(&(i + 1, j + 1)).map_or(0, |&x| x) + 1 table.get(&(i + 1, j + 1)).map_or(0, |&x| x) + 1
@ -146,12 +195,12 @@ where
} }
} }
table Some(table)
} }
#[test] #[test]
fn test_table() { fn test_table() {
let table = make_table(&vec![2, 3], 0..2, &vec![0, 1, 2], 0..3); let table = make_table(&vec![2, 3], 0..2, &vec![0, 1, 2], 0..3, None).unwrap();
let expected = { let expected = {
let mut m = BTreeMap::new(); let mut m = BTreeMap::new();
m.insert((1, 0), 1); m.insert((1, 0), 1);
@ -168,7 +217,7 @@ fn test_diff() {
let b: &[usize] = &[0, 1, 2, 9, 4]; let b: &[usize] = &[0, 1, 2, 9, 4];
let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
diff_slices(&mut d, a, b).unwrap(); diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.into_inner().ops()); insta::assert_debug_snapshot!(d.into_inner().ops());
} }
@ -178,7 +227,7 @@ fn test_contiguous() {
let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7]; let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7];
let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
diff_slices(&mut d, a, b).unwrap(); diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.into_inner().ops()); insta::assert_debug_snapshot!(d.into_inner().ops());
} }
@ -188,6 +237,6 @@ fn test_pat() {
let b: &[usize] = &[0, 1, 4, 5, 8, 9]; let b: &[usize] = &[0, 1, 4, 5, 8, 9];
let mut d = crate::algorithms::Capture::new(); let mut d = crate::algorithms::Capture::new();
diff_slices(&mut d, a, b).unwrap(); diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.ops()); insta::assert_debug_snapshot!(d.ops());
} }

View file

@ -39,6 +39,7 @@ mod replace;
use std::hash::Hash; use std::hash::Hash;
use std::ops::{Index, Range}; use std::ops::{Index, Range};
use std::time::Instant;
pub use capture::Capture; pub use capture::Capture;
pub use hook::{DiffHook, NoFinishHook}; pub use hook::{DiffHook, NoFinishHook};
@ -62,6 +63,34 @@ pub fn diff<Old, New, D>(
new: &New, new: &New,
new_range: Range<usize>, new_range: Range<usize>,
) -> Result<(), D::Error> ) -> Result<(), D::Error>
where
Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized,
D: DiffHook,
Old::Output: Hash + Eq + Ord,
New::Output: PartialEq<Old::Output> + Hash + Eq + Ord,
{
diff_deadline(alg, d, old, old_range, new, new_range, None)
}
/// Creates a diff between old and new with the given algorithm with deadline.
///
/// Diffs `old`, between indices `old_range` and `new` between indices `new_range`.
///
/// This diff is done with an optional deadline that defines the maximal
/// execution time permitted before it bails and falls back to an approximation.
/// Note that not all algorithms behave well if they reach the deadline (LCS
/// for instance produces a very simplistic diff when the deadline is reached
/// in all cases).
pub fn diff_deadline<Old, New, D>(
alg: Algorithm,
d: &mut D,
old: &Old,
old_range: Range<usize>,
new: &New,
new_range: Range<usize>,
deadline: Option<Instant>,
) -> Result<(), D::Error>
where where
Old: Index<usize> + ?Sized, Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized, New: Index<usize> + ?Sized,
@ -70,9 +99,9 @@ where
New::Output: PartialEq<Old::Output> + Hash + Eq + Ord, New::Output: PartialEq<Old::Output> + Hash + Eq + Ord,
{ {
match alg { match alg {
Algorithm::Myers => myers::diff(d, old, old_range, new, new_range), Algorithm::Myers => myers::diff_deadline(d, old, old_range, new, new_range, deadline),
Algorithm::Patience => patience::diff(d, old, old_range, new, new_range), Algorithm::Patience => patience::diff_deadline(d, old, old_range, new, new_range, deadline),
Algorithm::Lcs => lcs::diff(d, old, old_range, new, new_range), Algorithm::Lcs => lcs::diff_deadline(d, old, old_range, new, new_range, deadline),
} }
} }
@ -84,3 +113,18 @@ where
{ {
diff(alg, d, old, 0..old.len(), new, 0..new.len()) diff(alg, d, old, 0..old.len(), new, 0..new.len())
} }
/// Shortcut for diffing two whole slices with a specific algorithm and an
/// optional deadline.
///
/// The full index range of `old` and of `new` is forwarded to
/// [`diff_deadline`] together with `alg`, the hook `d` and `deadline`.
pub fn diff_slices_deadline<D, T>(
    alg: Algorithm,
    d: &mut D,
    old: &[T],
    new: &[T],
    deadline: Option<Instant>,
) -> Result<(), D::Error>
where
    D: DiffHook,
    T: Eq + Hash + Ord,
{
    // Slices are always diffed in their entirety.
    let old_range = 0..old.len();
    let new_range = 0..new.len();
    diff_deadline(alg, d, old, old_range, new, new_range, deadline)
}

View file

@ -8,8 +8,19 @@
//! //!
//! The implementation of this algorithm is based on the implementation by //! The implementation of this algorithm is based on the implementation by
//! Brandon Williams. //! Brandon Williams.
//!
//! # Heuristics
//!
//! At present this implementation of Myers' does not implement any more advanced
//! heuristics that would solve some pathological cases. For instance, passing two
//! large and completely distinct sequences to the algorithm will make it spin
//! without making reasonable progress. Currently the only protection in the
//! library against this is to pass a deadline to the diffing algorithm.
//!
//! For potential improvements here see [similar#15](https://github.com/mitsuhiko/similar/issues/15).
use std::ops::{Index, IndexMut, Range}; use std::ops::{Index, IndexMut, Range};
use std::time::Instant;
use crate::algorithms::DiffHook; use crate::algorithms::DiffHook;
@ -23,6 +34,29 @@ pub fn diff<Old, New, D>(
new: &New, new: &New,
new_range: Range<usize>, new_range: Range<usize>,
) -> Result<(), D::Error> ) -> Result<(), D::Error>
where
Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized,
D: DiffHook,
New::Output: PartialEq<Old::Output>,
{
diff_deadline(d, old, old_range, new, new_range, None)
}
/// Myers' diff algorithm with deadline.
///
/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
///
/// This diff is done with an optional deadline that defines the maximal
/// execution time permitted before it bails and falls back to an approximation.
pub fn diff_deadline<Old, New, D>(
d: &mut D,
old: &Old,
old_range: Range<usize>,
new: &New,
new_range: Range<usize>,
deadline: Option<Instant>,
) -> Result<(), D::Error>
where where
Old: Index<usize> + ?Sized, Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized, New: Index<usize> + ?Sized,
@ -32,11 +66,17 @@ where
let max_d = max_d(old_range.len(), new_range.len()); let max_d = max_d(old_range.len(), new_range.len());
let mut vf = V::new(max_d); let mut vf = V::new(max_d);
let mut vb = V::new(max_d); let mut vb = V::new(max_d);
conquer(d, old, old_range, new, new_range, &mut vf, &mut vb)?; conquer(
d, old, old_range, new, new_range, &mut vf, &mut vb, deadline,
)?;
d.finish() d.finish()
} }
/// Shortcut for diffing slices. /// Shortcut for diffing slices.
#[deprecated(
since = "1.4.0",
note = "slice utility function is now only available via similar::algorithms::diff_slices"
)]
pub fn diff_slices<D, T>(d: &mut D, old: &[T], new: &[T]) -> Result<(), D::Error> pub fn diff_slices<D, T>(d: &mut D, old: &[T], new: &[T]) -> Result<(), D::Error>
where where
D: DiffHook, D: DiffHook,
@ -172,7 +212,8 @@ fn find_middle_snake<Old, New>(
new_range: Range<usize>, new_range: Range<usize>,
vf: &mut V, vf: &mut V,
vb: &mut V, vb: &mut V,
) -> Snake deadline: Option<Instant>,
) -> Option<Snake>
where where
Old: Index<usize> + ?Sized, Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized, New: Index<usize> + ?Sized,
@ -197,6 +238,13 @@ where
assert!(vb.len() >= d_max); assert!(vb.len() >= d_max);
for d in 0..d_max as isize { for d in 0..d_max as isize {
// are we running for too long?
if let Some(deadline) = deadline {
if Instant::now() > deadline {
break;
}
}
// Forward path // Forward path
for k in (-d..=d).rev().step_by(2) { for k in (-d..=d).rev().step_by(2) {
let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) { let mut x = if k == -d || (k != d && vf[k - 1] < vf[k + 1]) {
@ -230,10 +278,10 @@ where
// TODO optimize this so we don't have to compare against n // TODO optimize this so we don't have to compare against n
if vf[k] + vb[-(k - delta)] >= n { if vf[k] + vb[-(k - delta)] >= n {
// Return the snake // Return the snake
return Snake { return Some(Snake {
x_start: x0 + old_range.start, x_start: x0 + old_range.start,
y_start: y0 + new_range.start, y_start: y0 + new_range.start,
}; });
} }
} }
} }
@ -266,10 +314,10 @@ where
// TODO optimize this so we don't have to compare against n // TODO optimize this so we don't have to compare against n
if vb[k] + vf[-(k - delta)] >= n { if vb[k] + vf[-(k - delta)] >= n {
// Return the snake // Return the snake
return Snake { return Some(Snake {
x_start: n - x + old_range.start, x_start: n - x + old_range.start,
y_start: m - y + new_range.start, y_start: m - y + new_range.start,
}; });
} }
} }
} }
@ -277,9 +325,11 @@ where
// TODO: Maybe there's an opportunity to optimize and bail early? // TODO: Maybe there's an opportunity to optimize and bail early?
} }
unreachable!("unable to find a middle snake"); // deadline reached
None
} }
#[allow(clippy::too_many_arguments)]
fn conquer<Old, New, D>( fn conquer<Old, New, D>(
d: &mut D, d: &mut D,
old: &Old, old: &Old,
@ -288,6 +338,7 @@ fn conquer<Old, New, D>(
mut new_range: Range<usize>, mut new_range: Range<usize>,
vf: &mut V, vf: &mut V,
vb: &mut V, vb: &mut V,
deadline: Option<Instant>,
) -> Result<(), D::Error> ) -> Result<(), D::Error>
where where
Old: Index<usize> + ?Sized, Old: Index<usize> + ?Sized,
@ -326,12 +377,30 @@ where
new_range.start, new_range.start,
new_range.end - new_range.start, new_range.end - new_range.start,
)?; )?;
} else { } else if let Some(snake) = find_middle_snake(
let snake = find_middle_snake(old, old_range.clone(), new, new_range.clone(), vf, vb); old,
old_range.clone(),
new,
new_range.clone(),
vf,
vb,
deadline,
) {
let (old_a, old_b) = split_at(old_range, snake.x_start); let (old_a, old_b) = split_at(old_range, snake.x_start);
let (new_a, new_b) = split_at(new_range, snake.y_start); let (new_a, new_b) = split_at(new_range, snake.y_start);
conquer(d, old, old_a, new, new_a, vf, vb)?; conquer(d, old, old_a, new, new_a, vf, vb, deadline)?;
conquer(d, old, old_b, new, new_b, vf, vb)?; conquer(d, old, old_b, new, new_b, vf, vb, deadline)?;
} else {
d.delete(
old_range.start,
old_range.end - old_range.start,
new_range.start,
)?;
d.insert(
old_range.start,
new_range.start,
new_range.end - new_range.start,
)?;
} }
if common_suffix_len > 0 { if common_suffix_len > 0 {
@ -348,7 +417,7 @@ fn test_find_middle_snake() {
let max_d = max_d(a.len(), b.len()); let max_d = max_d(a.len(), b.len());
let mut vf = V::new(max_d); let mut vf = V::new(max_d);
let mut vb = V::new(max_d); let mut vb = V::new(max_d);
let snake = find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb); let snake = find_middle_snake(a, 0..a.len(), b, 0..b.len(), &mut vf, &mut vb, None).unwrap();
assert_eq!(snake.x_start, 4); assert_eq!(snake.x_start, 4);
assert_eq!(snake.y_start, 1); assert_eq!(snake.y_start, 1);
} }
@ -359,7 +428,7 @@ fn test_diff() {
let b: &[usize] = &[0, 1, 2, 9, 4]; let b: &[usize] = &[0, 1, 2, 9, 4];
let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
diff_slices(&mut d, a, b).unwrap(); diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.into_inner().ops()); insta::assert_debug_snapshot!(d.into_inner().ops());
} }
@ -369,7 +438,7 @@ fn test_contiguous() {
let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7]; let b: &[usize] = &[0, 1, 2, 8, 9, 4, 4, 7];
let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new()); let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
diff_slices(&mut d, a, b).unwrap(); diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.into_inner().ops()); insta::assert_debug_snapshot!(d.into_inner().ops());
} }
@ -379,6 +448,45 @@ fn test_pat() {
let b: &[usize] = &[0, 1, 4, 5, 8, 9]; let b: &[usize] = &[0, 1, 4, 5, 8, 9];
let mut d = crate::algorithms::Capture::new(); let mut d = crate::algorithms::Capture::new();
diff_slices(&mut d, a, b).unwrap(); diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.ops()); insta::assert_debug_snapshot!(d.ops());
} }
// Exercises `diff_deadline` with a deadline that is expected to expire while
// the diff is still running, and snapshots whatever ops are produced.
#[test]
fn test_deadline_reached() {
use std::ops::Index;
use std::time::Duration;
// `a` is 0..100; `b` is the same sequence except at indices 10, 25 and 50,
// which are overwritten with 99.
let a = (0..100).collect::<Vec<_>>();
let mut b = (0..100).collect::<Vec<_>>();
b[10] = 99;
b[50] = 99;
b[25] = 99;
// Slice wrapper whose `Index` impl sleeps for 1ms on every element access.
// NOTE(review): presumably this makes element comparisons slow enough that
// the 50ms deadline below is exceeded -- timing dependent, confirm on slow CI.
struct SlowIndex<'a>(&'a [usize]);
impl<'a> Index<usize> for SlowIndex<'a> {
type Output = usize;
fn index(&self, index: usize) -> &Self::Output {
std::thread::sleep(Duration::from_millis(1));
&self.0[index]
}
}
let slow_a = SlowIndex(&a);
let slow_b = SlowIndex(&b);
// don't give it enough time to do anything interesting
let mut d = crate::algorithms::Replace::new(crate::algorithms::Capture::new());
// The deadline is 50ms after the call is set up; the diff must still return
// `Ok` rather than fail.
diff_deadline(
&mut d,
&slow_a,
0..a.len(),
&slow_b,
0..b.len(),
Some(Instant::now() + Duration::from_millis(50)),
)
.unwrap();
// The captured ops are compared against the stored insta snapshot.
insta::assert_debug_snapshot!(d.into_inner().ops());
}

View file

@ -12,6 +12,7 @@ use std::collections::hash_map::Entry;
use std::collections::HashMap; use std::collections::HashMap;
use std::hash::Hash; use std::hash::Hash;
use std::ops::{Index, Range}; use std::ops::{Index, Range};
use std::time::Instant;
use crate::algorithms::{myers, DiffHook, NoFinishHook, Replace}; use crate::algorithms::{myers, DiffHook, NoFinishHook, Replace};
@ -25,6 +26,30 @@ pub fn diff<Old, New, D>(
new: &New, new: &New,
new_range: Range<usize>, new_range: Range<usize>,
) -> Result<(), D::Error> ) -> Result<(), D::Error>
where
Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized,
Old::Output: Hash + Eq,
New::Output: PartialEq<Old::Output> + Hash + Eq,
D: DiffHook,
{
diff_deadline(d, old, old_range, new, new_range, None)
}
/// Patience diff algorithm with deadline.
///
/// Diff `old`, between indices `old_range` and `new` between indices `new_range`.
///
/// This diff is done with an optional deadline that defines the maximal
/// execution time permitted before it bails and falls back to an approximation.
pub fn diff_deadline<Old, New, D>(
d: &mut D,
old: &Old,
old_range: Range<usize>,
new: &New,
new_range: Range<usize>,
deadline: Option<Instant>,
) -> Result<(), D::Error>
where where
Old: Index<usize> + ?Sized, Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized, New: Index<usize> + ?Sized,
@ -45,18 +70,24 @@ where
new_current: new_range.start, new_current: new_range.start,
new_end: new_range.end, new_end: new_range.end,
new_indexes: &new_indexes, new_indexes: &new_indexes,
deadline,
}); });
myers::diff( myers::diff_deadline(
&mut d, &mut d,
&old_indexes, &old_indexes,
0..old_indexes.len(), 0..old_indexes.len(),
&new_indexes, &new_indexes,
0..new_indexes.len(), 0..new_indexes.len(),
deadline,
)?; )?;
Ok(()) Ok(())
} }
/// Shortcut for diffing slices. /// Shortcut for diffing slices.
#[deprecated(
since = "1.4.0",
note = "slice utility function is now only available via similar::algorithms::diff_slices"
)]
pub fn diff_slices<D, T>(d: &mut D, old: &[T], new: &[T]) -> Result<(), D::Error> pub fn diff_slices<D, T>(d: &mut D, old: &[T], new: &[T]) -> Result<(), D::Error>
where where
D: DiffHook, D: DiffHook,
@ -128,6 +159,7 @@ struct Patience<'old, 'new, 'd, Old: ?Sized, New: ?Sized, D> {
new_current: usize, new_current: usize,
new_end: usize, new_end: usize,
new_indexes: &'new [Indexable<'new, New>], new_indexes: &'new [Indexable<'new, New>],
deadline: Option<Instant>,
} }
impl<'old, 'new, 'd, Old, New, D> DiffHook for Patience<'old, 'new, 'd, Old, New, D> impl<'old, 'new, 'd, Old, New, D> DiffHook for Patience<'old, 'new, 'd, Old, New, D>
@ -153,12 +185,13 @@ where
self.d.equal(a0, b0, self.old_current - a0)?; self.d.equal(a0, b0, self.old_current - a0)?;
} }
let mut no_finish_d = NoFinishHook::new(&mut self.d); let mut no_finish_d = NoFinishHook::new(&mut self.d);
myers::diff( myers::diff_deadline(
&mut no_finish_d, &mut no_finish_d,
self.old, self.old,
self.old_current..self.old_indexes[old].index, self.old_current..self.old_indexes[old].index,
self.new, self.new,
self.new_current..self.new_indexes[new].index, self.new_current..self.new_indexes[new].index,
self.deadline,
)?; )?;
self.old_current = self.old_indexes[old].index; self.old_current = self.old_indexes[old].index;
self.new_current = self.new_indexes[new].index; self.new_current = self.new_indexes[new].index;
@ -167,12 +200,13 @@ where
} }
fn finish(&mut self) -> Result<(), D::Error> { fn finish(&mut self) -> Result<(), D::Error> {
myers::diff( myers::diff_deadline(
self.d, self.d,
self.old, self.old,
self.old_current..self.old_end, self.old_current..self.old_end,
self.new, self.new,
self.new_current..self.new_end, self.new_current..self.new_end,
self.deadline,
) )
} }
} }
@ -183,7 +217,7 @@ fn test_patience() {
let b: &[usize] = &[10, 1, 2, 2, 8, 9, 4, 4, 7, 47, 18]; let b: &[usize] = &[10, 1, 2, 2, 8, 9, 4, 4, 7, 47, 18];
let mut d = Replace::new(crate::algorithms::Capture::new()); let mut d = Replace::new(crate::algorithms::Capture::new());
diff_slices(&mut d, a, b).unwrap(); diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.into_inner().ops()); insta::assert_debug_snapshot!(d.into_inner().ops());
} }
@ -195,7 +229,7 @@ fn test_patience_out_of_bounds_bug() {
let b: &[usize] = &[1, 2, 3]; let b: &[usize] = &[1, 2, 3];
let mut d = Replace::new(crate::algorithms::Capture::new()); let mut d = Replace::new(crate::algorithms::Capture::new());
diff_slices(&mut d, a, b).unwrap(); diff(&mut d, a, 0..a.len(), b, 0..b.len()).unwrap();
insta::assert_debug_snapshot!(d.into_inner().ops()); insta::assert_debug_snapshot!(d.into_inner().ops());
} }

View file

@ -134,7 +134,7 @@ impl<D: DiffHook> DiffHook for Replace<D> {
#[test] #[test]
fn test_mayers_replace() { fn test_mayers_replace() {
use crate::algorithms::myers; use crate::algorithms::{diff_slices, Algorithm};
let a: &[&str] = &[ let a: &[&str] = &[
">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n", ">>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>\n",
"a\n", "a\n",
@ -159,7 +159,7 @@ fn test_mayers_replace() {
]; ];
let mut d = Replace::new(crate::algorithms::Capture::new()); let mut d = Replace::new(crate::algorithms::Capture::new());
myers::diff_slices(&mut d, a, b).unwrap(); diff_slices(Algorithm::Myers, &mut d, a, b).unwrap();
insta::assert_debug_snapshot!(&d.into_inner().ops(), @r###" insta::assert_debug_snapshot!(&d.into_inner().ops(), @r###"
[ [
@ -196,11 +196,13 @@ fn test_mayers_replace() {
#[test] #[test]
fn test_replace() { fn test_replace() {
use crate::algorithms::{diff_slices, Algorithm};
let a: &[usize] = &[0, 1, 2, 3, 4]; let a: &[usize] = &[0, 1, 2, 3, 4];
let b: &[usize] = &[0, 1, 2, 7, 8, 9]; let b: &[usize] = &[0, 1, 2, 7, 8, 9];
let mut d = Replace::new(crate::algorithms::Capture::new()); let mut d = Replace::new(crate::algorithms::Capture::new());
crate::algorithms::myers::diff_slices(&mut d, a, b).unwrap(); diff_slices(Algorithm::Myers, &mut d, a, b).unwrap();
insta::assert_debug_snapshot!(d.into_inner().ops(), @r###" insta::assert_debug_snapshot!(d.into_inner().ops(), @r###"
[ [
Equal { Equal {

View file

@ -0,0 +1,22 @@
---
source: src/algorithms/myers.rs
expression: d.into_inner().ops()
---
[
Equal {
old_index: 0,
new_index: 0,
len: 10,
},
Replace {
old_index: 10,
old_len: 41,
new_index: 10,
new_len: 41,
},
Equal {
old_index: 51,
new_index: 51,
len: 49,
},
]

View file

@ -1,13 +1,15 @@
use std::hash::Hash; use std::hash::Hash;
use std::ops::{Index, Range}; use std::ops::{Index, Range};
use std::time::Instant;
use crate::algorithms::{diff, diff_slices, Capture, Replace}; use crate::algorithms::{diff_deadline, diff_slices_deadline, Capture, Replace};
use crate::{Algorithm, DiffOp}; use crate::{Algorithm, DiffOp};
/// Creates a diff between old and new with the given algorithm capturing the ops. /// Creates a diff between old and new with the given algorithm capturing the ops.
/// ///
/// This is like [`diff`] but instead of using an arbitrary hook this will /// This is like [`diff`](crate::algorithms::diff) but instead of using an
/// always use [`Replace`] + [`Capture`] and return the captured [`DiffOp`]s. /// arbitrary hook this will always use [`Replace`] + [`Capture`] and return the
/// captured [`DiffOp`]s.
pub fn capture_diff<Old, New>( pub fn capture_diff<Old, New>(
alg: Algorithm, alg: Algorithm,
old: &Old, old: &Old,
@ -15,6 +17,26 @@ pub fn capture_diff<Old, New>(
new: &New, new: &New,
new_range: Range<usize>, new_range: Range<usize>,
) -> Vec<DiffOp> ) -> Vec<DiffOp>
where
Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized,
Old::Output: Hash + Eq + Ord,
New::Output: PartialEq<Old::Output> + Hash + Eq + Ord,
{
capture_diff_deadline(alg, old, old_range, new, new_range, None)
}
/// Creates a diff between old and new with the given algorithm capturing the ops.
///
/// Works like [`capture_diff`] but with an optional deadline.
pub fn capture_diff_deadline<Old, New>(
alg: Algorithm,
old: &Old,
old_range: Range<usize>,
new: &New,
new_range: Range<usize>,
deadline: Option<Instant>,
) -> Vec<DiffOp>
where where
Old: Index<usize> + ?Sized, Old: Index<usize> + ?Sized,
New: Index<usize> + ?Sized, New: Index<usize> + ?Sized,
@ -22,17 +44,32 @@ where
New::Output: PartialEq<Old::Output> + Hash + Eq + Ord, New::Output: PartialEq<Old::Output> + Hash + Eq + Ord,
{ {
let mut d = Replace::new(Capture::new()); let mut d = Replace::new(Capture::new());
diff(alg, &mut d, old, old_range, new, new_range).unwrap(); diff_deadline(alg, &mut d, old, old_range, new, new_range, deadline).unwrap();
d.into_inner().into_ops() d.into_inner().into_ops()
} }
/// Creates a diff between old and new with the given algorithm capturing the ops. /// Creates a diff between old and new with the given algorithm capturing the ops.
pub fn capture_diff_slices<T>(alg: Algorithm, old: &[T], new: &[T]) -> Vec<DiffOp> pub fn capture_diff_slices<T>(alg: Algorithm, old: &[T], new: &[T]) -> Vec<DiffOp>
where
T: Eq + Hash + Ord,
{
capture_diff_slices_deadline(alg, old, new, None)
}
/// Creates a diff between old and new with the given algorithm capturing the ops.
///
/// Works like [`capture_diff_slices`] but with an optional deadline.
pub fn capture_diff_slices_deadline<T>(
alg: Algorithm,
old: &[T],
new: &[T],
deadline: Option<Instant>,
) -> Vec<DiffOp>
where where
T: Eq + Hash + Ord, T: Eq + Hash + Ord,
{ {
let mut d = Replace::new(Capture::new()); let mut d = Replace::new(Capture::new());
diff_slices(alg, &mut d, old, new).unwrap(); diff_slices_deadline(alg, &mut d, old, new, deadline).unwrap();
d.into_inner().into_ops() d.into_inner().into_ops()
} }

View file

@ -106,6 +106,22 @@
//! As the [`TextDiff::grouped_ops`] method can isolate clusters of changes //! As the [`TextDiff::grouped_ops`] method can isolate clusters of changes
//! this even works for very long files if paired with this method. //! this even works for very long files if paired with this method.
//! //!
//! # Deadlines and Performance
//!
//! For large and very distinct inputs the algorithms as implemented can take
//! a very, very long time to execute. Too long to make sense in practice.
//! To work around this issue all diffing algorithms also provide a version
//! that accepts a deadline which is the point in time as defined by an
//! [`Instant`](std::time::Instant) after which the algorithm should give up.
//! What giving up means depends on the algorithm. For instance due to the
//! recursive, divide and conquer nature of Myers' diff you will still get a
//! pretty decent diff in many cases when a deadline is reached. Whereas on the
//! other hand the LCS diff is unlikely to give any decent results in such a
//! situation.
//!
//! The [`TextDiff`] type also lets you configure a deadline and/or timeout
//! when performing a text diff.
//!
//! # Feature Flags //! # Feature Flags
//! //!
//! The crate by default does not have any dependencies however for some use //! The crate by default does not have any dependencies however for some use

View file

@ -2,6 +2,7 @@
use std::borrow::Cow; use std::borrow::Cow;
use std::cmp::Reverse; use std::cmp::Reverse;
use std::collections::BinaryHeap; use std::collections::BinaryHeap;
use std::time::{Duration, Instant};
mod abstraction; mod abstraction;
#[cfg(feature = "inline")] #[cfg(feature = "inline")]
@ -15,7 +16,22 @@ pub use self::inline::InlineChange;
use self::utils::{upper_seq_ratio, QuickSeqRatio}; use self::utils::{upper_seq_ratio, QuickSeqRatio};
use crate::iter::{AllChangesIter, ChangesIter}; use crate::iter::{AllChangesIter, ChangesIter};
use crate::udiff::UnifiedDiff; use crate::udiff::UnifiedDiff;
use crate::{capture_diff_slices, get_diff_ratio, group_diff_ops, Algorithm, DiffOp}; use crate::{capture_diff_slices_deadline, get_diff_ratio, group_diff_ops, Algorithm, DiffOp};
/// Point in time at which a diff should give up, as configured by the caller.
///
/// `Absolute` carries a fixed `Instant`; `Relative` carries a `Duration` that
/// is only turned into an instant by `into_instant`.
#[derive(Debug, Clone, Copy)]
enum Deadline {
    Absolute(Instant),
    Relative(Duration),
}

impl Deadline {
    /// Resolves the deadline to a concrete `Instant`.
    ///
    /// An absolute deadline is returned unchanged, while a relative one is
    /// measured from the moment this method is called.
    fn into_instant(self) -> Instant {
        let timeout = match self {
            Deadline::Absolute(at) => return at,
            Deadline::Relative(timeout) => timeout,
        };
        Instant::now() + timeout
    }
}
/// A builder type config for more complex uses of [`TextDiff`]. /// A builder type config for more complex uses of [`TextDiff`].
/// ///
@ -24,6 +40,7 @@ use crate::{capture_diff_slices, get_diff_ratio, group_diff_ops, Algorithm, Diff
pub struct TextDiffConfig { pub struct TextDiffConfig {
algorithm: Algorithm, algorithm: Algorithm,
newline_terminated: Option<bool>, newline_terminated: Option<bool>,
deadline: Option<Deadline>,
} }
impl Default for TextDiffConfig { impl Default for TextDiffConfig {
@ -31,6 +48,7 @@ impl Default for TextDiffConfig {
TextDiffConfig { TextDiffConfig {
algorithm: Algorithm::default(), algorithm: Algorithm::default(),
newline_terminated: None, newline_terminated: None,
deadline: None,
} }
} }
} }
@ -44,6 +62,24 @@ impl TextDiffConfig {
self self
} }
/// Sets a deadline for the diff operation.
///
/// By default a diff will take as long as it takes. For certain diff
/// algorithms like Myers' and Patience a maximum running time can be
/// defined after which the algorithm gives up and approximates.
///
/// The given instant is stored as an absolute deadline and replaces any
/// deadline or timeout configured earlier on this config.
pub fn deadline(&mut self, deadline: Instant) -> &mut Self {
    let absolute = Deadline::Absolute(deadline);
    self.deadline = Some(absolute);
    self
}
/// Sets a timeout for the diff operation.
///
/// This is like [`deadline`](Self::deadline) but accepts a duration.
///
/// The duration is stored as a relative deadline and replaces any deadline
/// or timeout configured earlier on this config.
pub fn timeout(&mut self, timeout: Duration) -> &mut Self {
    let relative = Deadline::Relative(timeout);
    self.deadline = Some(relative);
    self
}
/// Changes the newline termination flag. /// Changes the newline termination flag.
/// ///
/// The default is automatic based on input. This flag controls the /// The default is automatic based on input. This flag controls the
@ -291,7 +327,12 @@ impl TextDiffConfig {
new: Cow<'bufs, [&'new T]>, new: Cow<'bufs, [&'new T]>,
newline_terminated: bool, newline_terminated: bool,
) -> TextDiff<'old, 'new, 'bufs, T> { ) -> TextDiff<'old, 'new, 'bufs, T> {
let ops = capture_diff_slices(self.algorithm, &old, &new); let ops = capture_diff_slices_deadline(
self.algorithm,
&old,
&new,
self.deadline.map(|x| x.into_instant()),
);
TextDiff { TextDiff {
old, old,
new, new,