Added get_close_matches
This commit is contained in:
parent
7603aeb708
commit
892851d060
2 changed files with 64 additions and 2 deletions
|
|
@ -7,6 +7,7 @@ All notable changes to similar are documented here.
|
||||||
* Added grapheme and character level diffing utilities.
|
* Added grapheme and character level diffing utilities.
|
||||||
* `DiffOp::as_tag_tuple` is now taking the argument by reference.
|
* `DiffOp::as_tag_tuple` is now taking the argument by reference.
|
||||||
* Added `TextDiff::ratio`.
|
* Added `TextDiff::ratio`.
|
||||||
|
* Added `get_close_matches`.
|
||||||
|
|
||||||
## 0.2.0
|
## 0.2.0
|
||||||
|
|
||||||
|
|
|
||||||
65
src/text.rs
65
src/text.rs
|
|
@ -49,6 +49,7 @@
|
||||||
//! this even works for very long files if paired with this method.
|
//! this even works for very long files if paired with this method.
|
||||||
#![cfg(feature = "text")]
|
#![cfg(feature = "text")]
|
||||||
use std::borrow::Cow;
|
use std::borrow::Cow;
|
||||||
|
use std::collections::BinaryHeap;
|
||||||
use std::fmt;
|
use std::fmt;
|
||||||
use std::io;
|
use std::io;
|
||||||
use std::ops::Range;
|
use std::ops::Range;
|
||||||
|
|
@ -270,13 +271,12 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> {
|
||||||
///
|
///
|
||||||
/// This requires the `unicode` feature.
|
/// This requires the `unicode` feature.
|
||||||
#[cfg(feature = "unicode")]
|
#[cfg(feature = "unicode")]
|
||||||
pub fn from_graphemes(&self, old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> {
|
pub fn from_graphemes(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> {
|
||||||
Self::configure().diff_graphemes(old, new)
|
Self::configure().diff_graphemes(old, new)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Creates a diff of arbitrary slices.
|
/// Creates a diff of arbitrary slices.
|
||||||
pub fn from_slices(
|
pub fn from_slices(
|
||||||
&self,
|
|
||||||
old: &'bufs [&'old str],
|
old: &'bufs [&'old str],
|
||||||
new: &'bufs [&'new str],
|
new: &'bufs [&'new str],
|
||||||
) -> TextDiff<'old, 'new, 'bufs> {
|
) -> TextDiff<'old, 'new, 'bufs> {
|
||||||
|
|
@ -571,6 +571,16 @@ fn split_graphemes(s: &str) -> impl Iterator<Item = &str> {
|
||||||
unicode_segmentation::UnicodeSegmentation::graphemes(s, true)
|
unicode_segmentation::UnicodeSegmentation::graphemes(s, true)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// quick and dirty way to get an upper sequence ratio.
|
||||||
|
fn upper_seq_ratio<T: PartialEq>(seq1: &[T], seq2: &[T]) -> f32 {
|
||||||
|
let n = seq1.len() + seq2.len();
|
||||||
|
if n == 0 {
|
||||||
|
1.0
|
||||||
|
} else {
|
||||||
|
2.0 * seq1.len().min(seq2.len()) as f32 / n as f32
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/// Quick way to get a unified diff as string.
|
/// Quick way to get a unified diff as string.
|
||||||
pub fn unified_diff<'old, 'new>(
|
pub fn unified_diff<'old, 'new>(
|
||||||
alg: Algorithm,
|
alg: Algorithm,
|
||||||
|
|
@ -585,6 +595,51 @@ pub fn unified_diff<'old, 'new>(
|
||||||
.unified_diff(n, header)
|
.unified_diff(n, header)
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/// Use the text differ to find `n` close matches.
|
||||||
|
///
|
||||||
|
/// `cutoff` defines the threshold which needs to be reached for a word
|
||||||
|
/// to be considered similar. See [`TextDiff::ratio`] for more information.
|
||||||
|
///
|
||||||
|
/// ```
|
||||||
|
/// # use similar::text::get_close_matches;
|
||||||
|
/// let matches = get_close_matches(
|
||||||
|
/// "appel",
|
||||||
|
/// &["ape", "apple", "peach", "puppy"][..],
|
||||||
|
/// 3,
|
||||||
|
/// 0.6
|
||||||
|
/// );
|
||||||
|
/// assert_eq!(matches, vec!["apple", "ape"]);
|
||||||
|
/// ```
|
||||||
|
pub fn get_close_matches<'a>(
|
||||||
|
word: &str,
|
||||||
|
possibilities: &[&'a str],
|
||||||
|
n: usize,
|
||||||
|
cutoff: f32,
|
||||||
|
) -> Vec<&'a str> {
|
||||||
|
let mut matches = BinaryHeap::new();
|
||||||
|
let seq1 = split_chars(word).collect::<Vec<_>>();
|
||||||
|
for &possibility in possibilities {
|
||||||
|
let seq2 = split_chars(possibility).collect::<Vec<_>>();
|
||||||
|
if upper_seq_ratio(&seq1, &seq2) < cutoff {
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
let diff = TextDiff::from_slices(&seq1, &seq2);
|
||||||
|
let ratio = diff.ratio();
|
||||||
|
if ratio >= cutoff {
|
||||||
|
matches.push(((ratio * u32::MAX as f32) as u32, possibility));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
let mut rv = vec![];
|
||||||
|
for _ in 0..n {
|
||||||
|
if let Some((_, elt)) = matches.pop() {
|
||||||
|
rv.push(elt);
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
rv
|
||||||
|
}
|
||||||
|
|
||||||
#[test]
|
#[test]
|
||||||
fn test_split_lines() {
|
fn test_split_lines() {
|
||||||
assert_eq!(
|
assert_eq!(
|
||||||
|
|
@ -668,3 +723,9 @@ fn test_ratio() {
|
||||||
let diff = TextDiff::from_chars("", "");
|
let diff = TextDiff::from_chars("", "");
|
||||||
assert_eq!(diff.ratio(), 1.0);
|
assert_eq!(diff.ratio(), 1.0);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#[test]
|
||||||
|
fn test_get_close_matches() {
|
||||||
|
let matches = get_close_matches("appel", &["ape", "apple", "peach", "puppy"][..], 3, 0.6);
|
||||||
|
assert_eq!(matches, vec!["apple", "ape"]);
|
||||||
|
}
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue