Add compaction support (#22)

This commit is contained in:
Armin Ronacher 2021-02-25 22:13:43 +01:00 committed by GitHub
parent ddb73d8351
commit 7e628d78d8
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
9 changed files with 712 additions and 19 deletions

View file

@ -1,8 +1,8 @@
use std::collections::hash_map::Entry;
use std::collections::HashMap;
use std::fmt::Debug;
use std::hash::Hash;
use std::ops::{Index, Range};
use std::hash::{Hash, Hasher};
use std::ops::{Add, Index, Range};
/// Utility function to check if a range is empty that works on older rust versions
#[inline(always)]
@ -142,6 +142,175 @@ where
.count()
}
struct OffsetLookup<Int> {
offset: usize,
vec: Vec<Int>,
}
impl<Int> Index<usize> for OffsetLookup<Int> {
type Output = Int;
#[inline(always)]
fn index(&self, index: usize) -> &Self::Output {
&self.vec[index - self.offset]
}
}
/// A utility struct to convert distinct items to unique integers.
///
/// This can be helpful on larger inputs to speed up the comparisons
/// performed by doing a first pass where the data set gets reduced
/// to (small) integers.
///
/// The idea is that instead of passing two sequences to a diffling algorithm
/// you first pass it via [`IdentifyDistinct`]:
///
/// ```rust
/// use similar::capture_diff;
/// use similar::algorithms::{Algorithm, IdentifyDistinct};
///
/// let old = &["foo", "bar", "baz"][..];
/// let new = &["foo", "blah", "baz"][..];
/// let h = IdentifyDistinct::<u32>::new(old, 0..old.len(), new, 0..new.len());
/// let ops = capture_diff(
/// Algorithm::Myers,
/// h.old_lookup(),
/// h.old_range(),
/// h.new_lookup(),
/// h.new_range(),
/// );
/// ```
///
/// The indexes are the same as with the passed source ranges.
pub struct IdentifyDistinct<Int> {
old: OffsetLookup<Int>,
new: OffsetLookup<Int>,
}
impl<Int> IdentifyDistinct<Int>
where
Int: Add<Output = Int> + From<u8> + Default + Copy,
{
/// Creates an int hasher for two sequences.
pub fn new<Old, New>(
old: &Old,
old_range: Range<usize>,
new: &New,
new_range: Range<usize>,
) -> Self
where
Old: Index<usize> + ?Sized,
Old::Output: Eq + Hash,
New: Index<usize> + ?Sized,
New::Output: Eq + Hash + PartialEq<Old::Output>,
{
enum Key<'old, 'new, Old: ?Sized, New: ?Sized> {
Old(&'old Old),
New(&'new New),
}
impl<'old, 'new, Old, New> Hash for Key<'old, 'new, Old, New>
where
Old: Hash + ?Sized,
New: Hash + ?Sized,
{
fn hash<H: Hasher>(&self, state: &mut H) {
match *self {
Key::Old(val) => val.hash(state),
Key::New(val) => val.hash(state),
}
}
}
impl<'old, 'new, Old, New> PartialEq for Key<'old, 'new, Old, New>
where
Old: Eq + ?Sized,
New: Eq + PartialEq<Old> + ?Sized,
{
#[inline(always)]
fn eq(&self, other: &Self) -> bool {
match (self, other) {
(Key::Old(a), Key::Old(b)) => a == b,
(Key::New(a), Key::New(b)) => a == b,
(Key::Old(a), Key::New(b)) | (Key::New(b), Key::Old(a)) => b == a,
}
}
}
impl<'old, 'new, Old, New> Eq for Key<'old, 'new, Old, New>
where
Old: Eq + ?Sized,
New: Eq + PartialEq<Old> + ?Sized,
{
}
let mut map = HashMap::new();
let mut old_seq = Vec::new();
let mut new_seq = Vec::new();
let mut next_id = Int::default();
let step = Int::from(1);
let old_start = old_range.start;
let new_start = new_range.start;
for idx in old_range {
let item = Key::Old(&old[idx]);
let id = match map.entry(item) {
Entry::Occupied(o) => *o.get(),
Entry::Vacant(v) => {
let id = next_id;
next_id = next_id + step;
*v.insert(id)
}
};
old_seq.push(id);
}
for idx in new_range {
let item = Key::New(&new[idx]);
let id = match map.entry(item) {
Entry::Occupied(o) => *o.get(),
Entry::Vacant(v) => {
let id = next_id;
next_id = next_id + step;
*v.insert(id)
}
};
new_seq.push(id);
}
IdentifyDistinct {
old: OffsetLookup {
offset: old_start,
vec: old_seq,
},
new: OffsetLookup {
offset: new_start,
vec: new_seq,
},
}
}
/// Returns a lookup for the old side.
pub fn old_lookup(&self) -> &impl Index<usize, Output = Int> {
&self.old
}
/// Returns a lookup for the new side.
pub fn new_lookup(&self) -> &impl Index<usize, Output = Int> {
&self.new
}
/// Convenience method to get back the old range.
pub fn old_range(&self) -> Range<usize> {
self.old.offset..self.old.offset + self.old.vec.len()
}
/// Convenience method to get back the new range.
pub fn new_range(&self) -> Range<usize> {
self.new.offset..self.new.offset + self.new.vec.len()
}
}
#[test]
fn test_unique() {
let u = unique(&vec!['a', 'b', 'c', 'd', 'd', 'b'], 0..6)
@ -151,6 +320,24 @@ fn test_unique() {
assert_eq!(u, vec![('a', 0), ('c', 2)]);
}
#[test]
fn test_int_hasher() {
let ih = IdentifyDistinct::<u8>::new(
&["", "foo", "bar", "baz"][..],
1..4,
&["", "foo", "blah", "baz"][..],
1..4,
);
assert_eq!(ih.old_lookup()[1], 0);
assert_eq!(ih.old_lookup()[2], 1);
assert_eq!(ih.old_lookup()[3], 2);
assert_eq!(ih.new_lookup()[1], 0);
assert_eq!(ih.new_lookup()[2], 3);
assert_eq!(ih.new_lookup()[3], 2);
assert_eq!(ih.old_range(), 1..4);
assert_eq!(ih.new_range(), 1..4);
}
#[test]
fn test_common_prefix_len() {
assert_eq!(