Improvements to get_close_matches
This commit is contained in:
parent
42ae311b1c
commit
8a8e1c4822
3 changed files with 94 additions and 4 deletions
|
|
@ -2,6 +2,11 @@
|
||||||
|
|
||||||
All notable changes to similar are documented here.
|
All notable changes to similar are documented here.
|
||||||
|
|
||||||
|
## 0.4.0
|
||||||
|
|
||||||
|
* Change `get_close_matches` to use Python's quick ratio optimization
|
||||||
|
and order lexicographically when tied.
|
||||||
|
|
||||||
## 0.3.0
|
## 0.3.0
|
||||||
|
|
||||||
* Added grapheme and character level diffing utilities.
|
* Added grapheme and character level diffing utilities.
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,7 @@
|
||||||
//! This crate implements diffing utilities. It attempts to provide an abstraction
|
//! This crate implements diffing utilities. It attempts to provide an abstraction
|
||||||
//! interface over different types of diffing algorithms. It's based on the
|
//! interface over different types of diffing algorithms. It's based on the
|
||||||
//! diff algorithm implementations of [pijul](https://pijul.org/).
|
//! diff algorithm implementations of [pijul](https://pijul.org/).
|
||||||
|
//!
|
||||||
//! ```rust
|
//! ```rust
|
||||||
//! # #[cfg(feature = "text")] {
|
//! # #[cfg(feature = "text")] {
|
||||||
//! use similar::text::TextDiff;
|
//! use similar::text::TextDiff;
|
||||||
|
|
|
||||||
92
src/text.rs
92
src/text.rs
|
|
@ -8,6 +8,9 @@
|
||||||
//! It can produce a unified diff and also let you iterate over the changeset
|
//! It can produce a unified diff and also let you iterate over the changeset
|
||||||
//! directly if you want.
|
//! directly if you want.
|
||||||
//!
|
//!
|
||||||
|
//! Text diffing is available by default but can be disabled by turning off the
|
||||||
|
//! default features. The feature to enable to get it back is `text`.
|
||||||
|
//!
|
||||||
//! ## Examples
|
//! ## Examples
|
||||||
//!
|
//!
|
||||||
//! A super simple example for how to generate a unified diff with three lines
|
//! A super simple example for how to generate a unified diff with three lines
|
||||||
|
|
@ -49,7 +52,8 @@
|
||||||
//! this even works for very long files if paired with this method.
|
//! this even works for very long files if paired with this method.
|
||||||
#![cfg(feature = "text")]
|
#![cfg(feature = "text")]
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
use std::collections::BinaryHeap;
|
use std::cmp::Reverse;
|
||||||
|
use std::collections::{BinaryHeap, HashMap};
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
|
|
@ -253,22 +257,30 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a diff of lines.
|
/// Creates a diff of lines.
|
||||||
|
///
|
||||||
|
/// Equivalent to `TextDiff::configure().diff_lines(old, new)`.
|
||||||
pub fn from_lines(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> {
|
pub fn from_lines(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> {
|
||||||
Self::configure().diff_lines(old, new)
|
Self::configure().diff_lines(old, new)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a diff of words.
|
/// Creates a diff of words.
|
||||||
|
///
|
||||||
|
/// Equivalent to `TextDiff::configure().diff_words(old, new)`.
|
||||||
pub fn from_words(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> {
|
pub fn from_words(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> {
|
||||||
Self::configure().diff_words(old, new)
|
Self::configure().diff_words(old, new)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a diff of chars.
|
/// Creates a diff of chars.
|
||||||
|
///
|
||||||
|
/// Equivalent to `TextDiff::configure().diff_chars(old, new)`.
|
||||||
pub fn from_chars(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> {
|
pub fn from_chars(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> {
|
||||||
Self::configure().diff_chars(old, new)
|
Self::configure().diff_chars(old, new)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a diff of graphemes.
|
/// Creates a diff of graphemes.
|
||||||
///
|
///
|
||||||
|
/// Equivalent to `TextDiff::configure().diff_graphemes(old, new)`.
|
||||||
|
///
|
||||||
/// This requires the `unicode` feature.
|
/// This requires the `unicode` feature.
|
||||||
#[cfg(feature = "unicode")]
|
#[cfg(feature = "unicode")]
|
||||||
pub fn from_graphemes(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> {
|
pub fn from_graphemes(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> {
|
||||||
|
|
@ -276,6 +288,8 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a diff of arbitrary slices.
|
/// Creates a diff of arbitrary slices.
|
||||||
|
///
|
||||||
|
/// Equivalent to `TextDiff::configure().diff_slices(old, new)`.
|
||||||
pub fn from_slices(
|
pub fn from_slices(
|
||||||
old: &'bufs [&'old str],
|
old: &'bufs [&'old str],
|
||||||
new: &'bufs [&'new str],
|
new: &'bufs [&'new str],
|
||||||
|
|
@ -289,6 +303,9 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Returns `true` if items in the slice are newline terminated.
|
/// Returns `true` if items in the slice are newline terminated.
|
||||||
|
///
|
||||||
|
/// This flag is used by the unified diff writer to determine if extra
|
||||||
|
/// newlines have to be added.
|
||||||
pub fn newline_terminated(&self) -> bool {
|
pub fn newline_terminated(&self) -> bool {
|
||||||
self.newline_terminated
|
self.newline_terminated
|
||||||
}
|
}
|
||||||
|
|
@ -304,6 +321,15 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> {
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Return a measure of the sequences' similarity in the range `0..=1`.
|
/// Return a measure of the sequences' similarity in the range `0..=1`.
|
||||||
|
///
|
||||||
|
/// A ratio of `1.0` means the two sequences are a complete match, a
|
||||||
|
/// ratio of `0.0` would indicate completely distinct sequences.
|
||||||
|
///
|
||||||
|
/// ```rust
|
||||||
|
/// # use similar::text::TextDiff;
|
||||||
|
/// let diff = TextDiff::from_chars("abcd", "bcde");
|
||||||
|
/// assert_eq!(diff.ratio(), 0.75);
|
||||||
|
/// ```
|
||||||
pub fn ratio(&self) -> f32 {
|
pub fn ratio(&self) -> f32 {
|
||||||
let matches = self
|
let matches = self
|
||||||
.ops()
|
.ops()
|
||||||
|
|
@ -581,6 +607,47 @@ fn upper_seq_ratio<T: PartialEq>(seq1: &[T], seq2: &[T]) -> f32 {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Internal utility to calculate an upper bound for a ratio for
/// [`get_close_matches`].  This is based on Python's difflib approach
/// of considering the two sets to be multisets.
///
/// It counts the number of matches without regard to order, which is an
/// obvious upper bound.
///
/// The wrapped map stores, for every distinct item of the reference
/// sequence, how many times that item occurs in it.
struct QuickSeqRatio<'a>(HashMap<&'a str, i32>);

impl<'a> QuickSeqRatio<'a> {
    /// Builds the occurrence table for the reference sequence `seq`.
    pub fn new(seq: &[&'a str]) -> QuickSeqRatio<'a> {
        let mut counts = HashMap::new();
        for &word in seq {
            *counts.entry(word).or_insert(0) += 1;
        }
        QuickSeqRatio(counts)
    }

    /// Returns `2 * matches / (reference_len + seq.len())`, where `matches`
    /// is the size of the multiset intersection of the reference sequence
    /// and `seq`.
    ///
    /// Two empty sequences are considered a perfect match (`1.0`).
    pub fn calc(&self, seq: &[&str]) -> f32 {
        // The map only holds distinct items, so its `len()` would undercount
        // a reference sequence with repeated items.  Summing the stored
        // occurrence counts recovers the real length of that sequence.
        let reference_len: usize = self.0.values().map(|&count| count as usize).sum();
        let n = reference_len + seq.len();
        if n == 0 {
            return 1.0;
        }

        // For every item seen in `seq`, how many occurrences of it in the
        // reference sequence have not been paired up yet.  The value is
        // seeded lazily from the reference counts and may go negative once
        // `seq` contains more occurrences than the reference does.
        let mut available = HashMap::new();
        let mut matches = 0usize;
        for &word in seq {
            let remaining = available
                .entry(word)
                .or_insert_with(|| self.0.get(&word).copied().unwrap_or(0));
            if *remaining > 0 {
                matches += 1;
            }
            *remaining -= 1;
        }

        2.0 * matches as f32 / n as f32
    }
}
|
||||||
|
|
||||||
/// Quick way to get a unified diff as string.
|
/// Quick way to get a unified diff as string.
|
||||||
pub fn unified_diff<'old, 'new>(
|
pub fn unified_diff<'old, 'new>(
|
||||||
alg: Algorithm,
|
alg: Algorithm,
|
||||||
|
|
@ -618,25 +685,33 @@ pub fn get_close_matches<'a>(
|
||||||
) -> Vec<&'a str> {
|
) -> Vec<&'a str> {
|
||||||
let mut matches = BinaryHeap::new();
|
let mut matches = BinaryHeap::new();
|
||||||
let seq1 = split_chars(word).collect::<Vec<_>>();
|
let seq1 = split_chars(word).collect::<Vec<_>>();
|
||||||
|
let quick_ratio = QuickSeqRatio::new(&seq1);
|
||||||
|
|
||||||
for &possibility in possibilities {
|
for &possibility in possibilities {
|
||||||
let seq2 = split_chars(possibility).collect::<Vec<_>>();
|
let seq2 = split_chars(possibility).collect::<Vec<_>>();
|
||||||
if upper_seq_ratio(&seq1, &seq2) < cutoff {
|
|
||||||
|
if upper_seq_ratio(&seq1, &seq2) < cutoff || quick_ratio.calc(&seq2) < cutoff {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
let diff = TextDiff::from_slices(&seq1, &seq2);
|
let diff = TextDiff::from_slices(&seq1, &seq2);
|
||||||
let ratio = diff.ratio();
|
let ratio = diff.ratio();
|
||||||
if ratio >= cutoff {
|
if ratio >= cutoff {
|
||||||
matches.push(((ratio * u32::MAX as f32) as u32, possibility));
|
// we're putting the word itself in reverse so that matches with
|
||||||
|
// the same ratio are ordered lexicographically.
|
||||||
|
matches.push(((ratio * u32::MAX as f32) as u32, Reverse(possibility)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
let mut rv = vec![];
|
let mut rv = vec![];
|
||||||
for _ in 0..n {
|
for _ in 0..n {
|
||||||
if let Some((_, elt)) = matches.pop() {
|
if let Some((_, elt)) = matches.pop() {
|
||||||
rv.push(elt);
|
rv.push(elt.0);
|
||||||
} else {
|
} else {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
rv
|
rv
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -728,4 +803,13 @@ fn test_ratio() {
|
||||||
fn test_get_close_matches() {
|
fn test_get_close_matches() {
|
||||||
let matches = get_close_matches("appel", &["ape", "apple", "peach", "puppy"][..], 3, 0.6);
|
let matches = get_close_matches("appel", &["ape", "apple", "peach", "puppy"][..], 3, 0.6);
|
||||||
assert_eq!(matches, vec!["apple", "ape"]);
|
assert_eq!(matches, vec!["apple", "ape"]);
|
||||||
|
let matches = get_close_matches(
|
||||||
|
"hulo",
|
||||||
|
&[
|
||||||
|
"hi", "hulu", "hali", "hoho", "amaz", "zulo", "blah", "hopp", "uulo", "aulo",
|
||||||
|
][..],
|
||||||
|
5,
|
||||||
|
0.7,
|
||||||
|
);
|
||||||
|
assert_eq!(matches, vec!["aulo", "hulu", "uulo", "zulo"]);
|
||||||
}
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue