// likewise/src/algorithms/utils.rs

use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::fmt::Debug;
use std::hash::{Hash, Hasher};
use std::ops::{Add, Index, Range};

/// Reports whether `range` contains no elements.
///
/// A range counts as empty unless `start < end` holds, so reversed ranges
/// and ranges whose bounds cannot be ordered against each other (for
/// example a float `NaN` bound) are empty as well.  This helper exists so
/// the check also works on older Rust versions.
#[inline(always)]
#[allow(clippy::neg_cmp_op_on_partial_ord)]
pub fn is_empty_range<T: PartialOrd<T>>(range: &Range<T>) -> bool {
    let Range { start, end } = range;
    // Deliberately `!(a < b)` instead of `a >= b`: the two differ for
    // values that are only partially ordered.
    !(start < end)
}

/// Represents an item in the vector returned by [`unique`].
///
/// It compares like the underlying item it was created from, but also
/// carries the index that item had in the original sequence.
pub struct UniqueItem<'a, Idx: ?Sized> {
    // The sequence the item is looked up in.
    lookup: &'a Idx,
    // Position of the item within `lookup`.
    index: usize,
}

impl<'a, Idx> UniqueItem<'a, Idx>
where
    Idx: Index<usize> + ?Sized,
{
    /// Borrows the item this entry refers to.
    #[inline(always)]
    pub fn value(&self) -> &Idx::Output {
        Index::index(self.lookup, self.index)
    }

    /// Position of the item in the sequence it was taken from.
    #[inline(always)]
    pub fn original_index(&self) -> usize {
        self.index
    }
}

/// Formats the item as `UniqueItem { value: .., original_index: .. }`.
///
/// `Idx` may be unsized, so items created from slices can be debug-printed
/// just like items created from sized containers.
impl<'a, Idx> Debug for UniqueItem<'a, Idx>
where
    Idx: Index<usize> + ?Sized + 'a,
    Idx::Output: Debug,
{
    fn fmt(&self, f: &mut std::fmt::Formatter) -> std::fmt::Result {
        f.debug_struct("UniqueItem")
            .field("value", &self.value())
            .field("original_index", &self.original_index())
            .finish()
    }
}

/// Two items are equal when the values they refer to are equal; the
/// original indexes take no part in the comparison.
// NOTE(review): `A` is bounded by `'b` although it is borrowed for `'a`;
// presumably `'a` was intended -- confirm before changing.
impl<'a, 'b, A, B> PartialEq<UniqueItem<'a, A>> for UniqueItem<'b, B>
where
    A: Index<usize> + 'b + ?Sized,
    B: Index<usize> + 'b + ?Sized,
    B::Output: PartialEq<A::Output>,
{
    #[inline(always)]
    fn eq(&self, other: &UniqueItem<'a, A>) -> bool {
        let (lhs, rhs) = (self.value(), other.value());
        <B::Output as PartialEq<A::Output>>::eq(lhs, rhs)
    }
}

/// Returns only unique items in the sequence as vector.
///
/// Each item is wrapped in a [`UniqueItem`] so that both the value and the
/// index can be extracted.
pub fn unique<Idx>(lookup: &Idx, range: Range<usize>) -> Vec<UniqueItem<Idx>>
where
    Idx: Index<usize> + ?Sized,
    Idx::Output: Hash + Eq,
{
    let mut by_item = HashMap::new();
    for index in range {
        match by_item.entry(&lookup[index]) {
            Entry::Vacant(entry) => {
                entry.insert(Some(index));
            }
            Entry::Occupied(mut entry) => {
                let entry = entry.get_mut();
                if entry.is_some() {
                    *entry = None
                }
            }
        }
    }
    let mut rv = by_item
        .into_iter()
        .filter_map(|(_, x)| x)
        .map(|index| UniqueItem { lookup, index })
        .collect::<Vec<_>>();
    rv.sort_by_key(|a| a.original_index());
    rv
}

/// Counts how many leading items of the two ranges are pairwise equal.
///
/// The items of `old` in `old_range` and of `new` in `new_range` are walked
/// from the front in lock-step; counting stops at the first mismatch or as
/// soon as the shorter range is exhausted.
pub fn common_prefix_len<Old, New>(
    old: &Old,
    old_range: Range<usize>,
    new: &New,
    new_range: Range<usize>,
) -> usize
where
    Old: Index<usize> + ?Sized,
    New: Index<usize> + ?Sized,
    New::Output: PartialEq<Old::Output>,
{
    if is_empty_range(&old_range) || is_empty_range(&new_range) {
        return 0;
    }

    let mut matched = 0;
    for (new_idx, old_idx) in new_range.zip(old_range) {
        if new[new_idx] == old[old_idx] {
            matched += 1;
        } else {
            break;
        }
    }
    matched
}

/// Counts how many trailing items of the two ranges are pairwise equal.
///
/// The items of `old` in `old_range` and of `new` in `new_range` are walked
/// from the back in lock-step; counting stops at the first mismatch or as
/// soon as the shorter range is exhausted.
pub fn common_suffix_len<Old, New>(
    old: &Old,
    old_range: Range<usize>,
    new: &New,
    new_range: Range<usize>,
) -> usize
where
    Old: Index<usize> + ?Sized,
    New: Index<usize> + ?Sized,
    New::Output: PartialEq<Old::Output>,
{
    if is_empty_range(&old_range) || is_empty_range(&new_range) {
        return 0;
    }

    let mut matched = 0;
    for (new_idx, old_idx) in new_range.rev().zip(old_range.rev()) {
        if new[new_idx] == old[old_idx] {
            matched += 1;
        } else {
            break;
        }
    }
    matched
}

/// A vector of values that is addressed with shifted indexes.
///
/// The `Index` implementation subtracts `offset` from the requested index
/// before accessing `vec`, so index `offset` refers to the first element.
struct OffsetLookup<Int> {
    // Index that maps to the first element of `vec`.
    offset: usize,
    // The stored values, in order.
    vec: Vec<Int>,
}

impl<Int> Index<usize> for OffsetLookup<Int> {
    type Output = Int;

    /// Translates `index` into a position in the backing vector by
    /// subtracting the stored offset, then borrows the value found there.
    #[inline(always)]
    fn index(&self, index: usize) -> &Self::Output {
        let position = index - self.offset;
        &self.vec[position]
    }
}

/// A utility struct to convert distinct items to unique integers.
///
/// This can be helpful on larger inputs to speed up the comparisons
/// performed by doing a first pass where the data set gets reduced
/// to (small) integers.
///
/// The idea is that instead of passing two sequences to a diffing algorithm
/// you first pass it via [`IdentifyDistinct`]:
///
/// ```rust
/// use similar::capture_diff;
/// use similar::algorithms::{Algorithm, IdentifyDistinct};
///
/// let old = &["foo", "bar", "baz"][..];
/// let new = &["foo", "blah", "baz"][..];
/// let h = IdentifyDistinct::<u32>::new(old, 0..old.len(), new, 0..new.len());
/// let ops = capture_diff(
///     Algorithm::Myers,
///     h.old_lookup(),
///     h.old_range(),
///     h.new_lookup(),
///     h.new_range(),
/// );
/// ```
///
/// The indexes are the same as with the passed source ranges.
pub struct IdentifyDistinct<Int> {
    // Ids of the old items, addressable with the original old indexes.
    old: OffsetLookup<Int>,
    // Ids of the new items, addressable with the original new indexes.
    new: OffsetLookup<Int>,
}

impl<Int> IdentifyDistinct<Int>
where
    Int: Add<Output = Int> + From<u8> + Default + Copy,
{
    /// Creates an int hasher for two sequences.
    ///
    /// Every distinct item found in `old` within `old_range` and in `new`
    /// within `new_range` is assigned an id.  Ids start at `Int::default()`
    /// and grow by `Int::from(1)` in order of first appearance, with the old
    /// side processed first.  Items that compare equal receive the same id,
    /// also across the two sides.
    ///
    /// Items are looked up through a hash map, so an old and a new item
    /// that compare equal must also produce the same hash.
    ///
    /// The id counter is only advanced when a new id is actually needed, so
    /// `Int` merely has to be able to represent as many values as there are
    /// distinct items.  Beyond that the addition overflows; what happens
    /// then is up to `Int`'s `Add` implementation (primitive integers panic
    /// when overflow checks are enabled and wrap around otherwise).
    pub fn new<Old, New>(
        old: &Old,
        old_range: Range<usize>,
        new: &New,
        new_range: Range<usize>,
    ) -> Self
    where
        Old: Index<usize> + ?Sized,
        Old::Output: Eq + Hash,
        New: Index<usize> + ?Sized,
        New::Output: Eq + Hash + PartialEq<Old::Output>,
    {
        // A borrowed item from either side.  Wrapping both item types in
        // one key type lets a single hash map hold old and new items.
        enum Key<'old, 'new, O: ?Sized, N: ?Sized> {
            Old(&'old O),
            New(&'new N),
        }

        impl<'old, 'new, O, N> Hash for Key<'old, 'new, O, N>
        where
            O: Hash + ?Sized,
            N: Hash + ?Sized,
        {
            fn hash<H: Hasher>(&self, state: &mut H) {
                // Only the item is hashed, not the side it came from, so
                // equal items from both sides can land in the same bucket.
                match *self {
                    Key::Old(val) => val.hash(state),
                    Key::New(val) => val.hash(state),
                }
            }
        }

        impl<'old, 'new, O, N> PartialEq for Key<'old, 'new, O, N>
        where
            O: Eq + ?Sized,
            N: Eq + PartialEq<O> + ?Sized,
        {
            #[inline(always)]
            fn eq(&self, other: &Self) -> bool {
                match (self, other) {
                    (Key::Old(a), Key::Old(b)) => a == b,
                    (Key::New(a), Key::New(b)) => a == b,
                    // Mixed comparisons always go through `N: PartialEq<O>`.
                    (Key::Old(a), Key::New(b)) | (Key::New(b), Key::Old(a)) => b == a,
                }
            }
        }

        impl<'old, 'new, O, N> Eq for Key<'old, 'new, O, N>
        where
            O: Eq + ?Sized,
            N: Eq + PartialEq<O> + ?Sized,
        {
        }

        // Produces the id following `last_id` (or the very first id) and
        // records it.  Computing the id lazily avoids overflowing `I` right
        // after its largest value has been handed out.
        fn allocate_id<I>(last_id: &mut Option<I>) -> I
        where
            I: Add<Output = I> + From<u8> + Default + Copy,
        {
            let id = match *last_id {
                Some(prev) => prev + I::from(1u8),
                None => I::default(),
            };
            *last_id = Some(id);
            id
        }

        let old_start = old_range.start;
        let new_start = new_range.start;

        let mut map: HashMap<Key<'_, '_, Old::Output, New::Output>, Int> = HashMap::new();
        // The number of ids per side is known up front.
        let mut old_seq = Vec::with_capacity(old_range.len());
        let mut new_seq = Vec::with_capacity(new_range.len());
        let mut last_id: Option<Int> = None;

        for idx in old_range {
            let id = match map.entry(Key::Old(&old[idx])) {
                Entry::Occupied(slot) => *slot.get(),
                Entry::Vacant(slot) => *slot.insert(allocate_id(&mut last_id)),
            };
            old_seq.push(id);
        }

        for idx in new_range {
            let id = match map.entry(Key::New(&new[idx])) {
                Entry::Occupied(slot) => *slot.get(),
                Entry::Vacant(slot) => *slot.insert(allocate_id(&mut last_id)),
            };
            new_seq.push(id);
        }

        IdentifyDistinct {
            old: OffsetLookup {
                offset: old_start,
                vec: old_seq,
            },
            new: OffsetLookup {
                offset: new_start,
                vec: new_seq,
            },
        }
    }

    /// Returns a lookup for the old side.
    ///
    /// It is indexed with the same indexes as the original old sequence.
    pub fn old_lookup(&self) -> &impl Index<usize, Output = Int> {
        &self.old
    }

    /// Returns a lookup for the new side.
    ///
    /// It is indexed with the same indexes as the original new sequence.
    pub fn new_lookup(&self) -> &impl Index<usize, Output = Int> {
        &self.new
    }

    /// Convenience method to get back the old range.
    pub fn old_range(&self) -> Range<usize> {
        self.old.offset..self.old.offset + self.old.vec.len()
    }

    /// Convenience method to get back the new range.
    pub fn new_range(&self) -> Range<usize> {
        self.new.offset..self.new.offset + self.new.vec.len()
    }
}

// `unique` must drop every value that occurs more than once and report the
// survivors together with their original positions, in input order.
#[test]
fn test_unique() {
    let input = vec!['a', 'b', 'c', 'd', 'd', 'b'];
    let mut found = Vec::new();
    for item in unique(&input, 0..6) {
        found.push((*item.value(), item.original_index()));
    }
    assert_eq!(found, vec![('a', 0), ('c', 2)]);
}

// Equal strings on both sides must share an id; ids are handed out in order
// of first appearance, and the lookups keep the original (offset) indexes.
#[test]
fn test_int_hasher() {
    let old = ["", "foo", "bar", "baz"];
    let new = ["", "foo", "blah", "baz"];
    let ih = IdentifyDistinct::<u8>::new(&old[..], 1..4, &new[..], 1..4);

    let old_ids: Vec<u8> = (1..4).map(|idx| ih.old_lookup()[idx]).collect();
    let new_ids: Vec<u8> = (1..4).map(|idx| ih.new_lookup()[idx]).collect();
    assert_eq!(old_ids, vec![0, 1, 2]);
    assert_eq!(new_ids, vec![0, 3, 2]);

    assert_eq!(ih.old_range(), 1..4);
    assert_eq!(ih.new_range(), 1..4);
}

// Each case: (old, old range, new, new range, expected prefix length).
#[test]
fn test_common_prefix_len() {
    let cases: [(&str, Range<usize>, &str, Range<usize>, usize); 4] = [
        ("", 0..0, "", 0..0, 0),
        ("foobarbaz", 0..9, "foobarblah", 0..10, 7),
        ("foobarbaz", 0..9, "blablabla", 0..9, 0),
        ("foobarbaz", 3..9, "foobarblah", 3..10, 4),
    ];
    for (old, old_range, new, new_range, expected) in cases.iter().cloned() {
        assert_eq!(
            common_prefix_len(old.as_bytes(), old_range, new.as_bytes(), new_range),
            expected
        );
    }
}

// Each case: (old, old range, new, new range, expected suffix length).
#[test]
fn test_common_suffix_len() {
    let cases: [(&str, Range<usize>, &str, Range<usize>, usize); 4] = [
        ("", 0..0, "", 0..0, 0),
        ("1234", 0..4, "X0001234", 0..8, 4),
        ("1234", 0..4, "Xxxx", 0..4, 0),
        ("1234", 2..4, "01234", 2..5, 2),
    ];
    for (old, old_range, new, new_range, expected) in cases.iter().cloned() {
        assert_eq!(
            common_suffix_len(old.as_bytes(), old_range, new.as_bytes(), new_range),
            expected
        );
    }
}