Add compaction support (#22)
This commit is contained in:
parent
ddb73d8351
commit
7e628d78d8
9 changed files with 712 additions and 19 deletions
|
|
@ -1,8 +1,8 @@
|
|||
use std::collections::hash_map::Entry;
|
||||
use std::collections::HashMap;
|
||||
use std::fmt::Debug;
|
||||
use std::hash::Hash;
|
||||
use std::ops::{Index, Range};
|
||||
use std::hash::{Hash, Hasher};
|
||||
use std::ops::{Add, Index, Range};
|
||||
|
||||
/// Utility function to check if a range is empty that works on older rust versions
|
||||
#[inline(always)]
|
||||
|
|
@ -142,6 +142,175 @@ where
|
|||
.count()
|
||||
}
|
||||
|
||||
struct OffsetLookup<Int> {
|
||||
offset: usize,
|
||||
vec: Vec<Int>,
|
||||
}
|
||||
|
||||
impl<Int> Index<usize> for OffsetLookup<Int> {
|
||||
type Output = Int;
|
||||
|
||||
#[inline(always)]
|
||||
fn index(&self, index: usize) -> &Self::Output {
|
||||
&self.vec[index - self.offset]
|
||||
}
|
||||
}
|
||||
|
||||
/// A utility struct to convert distinct items to unique integers.
|
||||
///
|
||||
/// This can be helpful on larger inputs to speed up the comparisons
|
||||
/// performed by doing a first pass where the data set gets reduced
|
||||
/// to (small) integers.
|
||||
///
|
||||
/// The idea is that instead of passing two sequences to a diffling algorithm
|
||||
/// you first pass it via [`IdentifyDistinct`]:
|
||||
///
|
||||
/// ```rust
|
||||
/// use similar::capture_diff;
|
||||
/// use similar::algorithms::{Algorithm, IdentifyDistinct};
|
||||
///
|
||||
/// let old = &["foo", "bar", "baz"][..];
|
||||
/// let new = &["foo", "blah", "baz"][..];
|
||||
/// let h = IdentifyDistinct::<u32>::new(old, 0..old.len(), new, 0..new.len());
|
||||
/// let ops = capture_diff(
|
||||
/// Algorithm::Myers,
|
||||
/// h.old_lookup(),
|
||||
/// h.old_range(),
|
||||
/// h.new_lookup(),
|
||||
/// h.new_range(),
|
||||
/// );
|
||||
/// ```
|
||||
///
|
||||
/// The indexes are the same as with the passed source ranges.
|
||||
pub struct IdentifyDistinct<Int> {
|
||||
old: OffsetLookup<Int>,
|
||||
new: OffsetLookup<Int>,
|
||||
}
|
||||
|
||||
impl<Int> IdentifyDistinct<Int>
|
||||
where
|
||||
Int: Add<Output = Int> + From<u8> + Default + Copy,
|
||||
{
|
||||
/// Creates an int hasher for two sequences.
|
||||
pub fn new<Old, New>(
|
||||
old: &Old,
|
||||
old_range: Range<usize>,
|
||||
new: &New,
|
||||
new_range: Range<usize>,
|
||||
) -> Self
|
||||
where
|
||||
Old: Index<usize> + ?Sized,
|
||||
Old::Output: Eq + Hash,
|
||||
New: Index<usize> + ?Sized,
|
||||
New::Output: Eq + Hash + PartialEq<Old::Output>,
|
||||
{
|
||||
enum Key<'old, 'new, Old: ?Sized, New: ?Sized> {
|
||||
Old(&'old Old),
|
||||
New(&'new New),
|
||||
}
|
||||
|
||||
impl<'old, 'new, Old, New> Hash for Key<'old, 'new, Old, New>
|
||||
where
|
||||
Old: Hash + ?Sized,
|
||||
New: Hash + ?Sized,
|
||||
{
|
||||
fn hash<H: Hasher>(&self, state: &mut H) {
|
||||
match *self {
|
||||
Key::Old(val) => val.hash(state),
|
||||
Key::New(val) => val.hash(state),
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'old, 'new, Old, New> PartialEq for Key<'old, 'new, Old, New>
|
||||
where
|
||||
Old: Eq + ?Sized,
|
||||
New: Eq + PartialEq<Old> + ?Sized,
|
||||
{
|
||||
#[inline(always)]
|
||||
fn eq(&self, other: &Self) -> bool {
|
||||
match (self, other) {
|
||||
(Key::Old(a), Key::Old(b)) => a == b,
|
||||
(Key::New(a), Key::New(b)) => a == b,
|
||||
(Key::Old(a), Key::New(b)) | (Key::New(b), Key::Old(a)) => b == a,
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
impl<'old, 'new, Old, New> Eq for Key<'old, 'new, Old, New>
|
||||
where
|
||||
Old: Eq + ?Sized,
|
||||
New: Eq + PartialEq<Old> + ?Sized,
|
||||
{
|
||||
}
|
||||
|
||||
let mut map = HashMap::new();
|
||||
let mut old_seq = Vec::new();
|
||||
let mut new_seq = Vec::new();
|
||||
let mut next_id = Int::default();
|
||||
let step = Int::from(1);
|
||||
let old_start = old_range.start;
|
||||
let new_start = new_range.start;
|
||||
|
||||
for idx in old_range {
|
||||
let item = Key::Old(&old[idx]);
|
||||
let id = match map.entry(item) {
|
||||
Entry::Occupied(o) => *o.get(),
|
||||
Entry::Vacant(v) => {
|
||||
let id = next_id;
|
||||
next_id = next_id + step;
|
||||
*v.insert(id)
|
||||
}
|
||||
};
|
||||
old_seq.push(id);
|
||||
}
|
||||
|
||||
for idx in new_range {
|
||||
let item = Key::New(&new[idx]);
|
||||
let id = match map.entry(item) {
|
||||
Entry::Occupied(o) => *o.get(),
|
||||
Entry::Vacant(v) => {
|
||||
let id = next_id;
|
||||
next_id = next_id + step;
|
||||
*v.insert(id)
|
||||
}
|
||||
};
|
||||
new_seq.push(id);
|
||||
}
|
||||
|
||||
IdentifyDistinct {
|
||||
old: OffsetLookup {
|
||||
offset: old_start,
|
||||
vec: old_seq,
|
||||
},
|
||||
new: OffsetLookup {
|
||||
offset: new_start,
|
||||
vec: new_seq,
|
||||
},
|
||||
}
|
||||
}
|
||||
|
||||
/// Returns a lookup for the old side.
|
||||
pub fn old_lookup(&self) -> &impl Index<usize, Output = Int> {
|
||||
&self.old
|
||||
}
|
||||
|
||||
/// Returns a lookup for the new side.
|
||||
pub fn new_lookup(&self) -> &impl Index<usize, Output = Int> {
|
||||
&self.new
|
||||
}
|
||||
|
||||
/// Convenience method to get back the old range.
|
||||
pub fn old_range(&self) -> Range<usize> {
|
||||
self.old.offset..self.old.offset + self.old.vec.len()
|
||||
}
|
||||
|
||||
/// Convenience method to get back the new range.
|
||||
pub fn new_range(&self) -> Range<usize> {
|
||||
self.new.offset..self.new.offset + self.new.vec.len()
|
||||
}
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_unique() {
|
||||
let u = unique(&vec!['a', 'b', 'c', 'd', 'd', 'b'], 0..6)
|
||||
|
|
@ -151,6 +320,24 @@ fn test_unique() {
|
|||
assert_eq!(u, vec![('a', 0), ('c', 2)]);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_int_hasher() {
|
||||
let ih = IdentifyDistinct::<u8>::new(
|
||||
&["", "foo", "bar", "baz"][..],
|
||||
1..4,
|
||||
&["", "foo", "blah", "baz"][..],
|
||||
1..4,
|
||||
);
|
||||
assert_eq!(ih.old_lookup()[1], 0);
|
||||
assert_eq!(ih.old_lookup()[2], 1);
|
||||
assert_eq!(ih.old_lookup()[3], 2);
|
||||
assert_eq!(ih.new_lookup()[1], 0);
|
||||
assert_eq!(ih.new_lookup()[2], 3);
|
||||
assert_eq!(ih.new_lookup()[3], 2);
|
||||
assert_eq!(ih.old_range(), 1..4);
|
||||
assert_eq!(ih.new_range(), 1..4);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn test_common_prefix_len() {
|
||||
assert_eq!(
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue