From 892851d0605088b42d2311b780d5e008f77cb7b1 Mon Sep 17 00:00:00 2001 From: Armin Ronacher Date: Sun, 24 Jan 2021 14:09:24 +0100 Subject: [PATCH] Added get_close_matches --- CHANGELOG.md | 1 + src/text.rs | 65 ++++++++++++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 64 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3a0e254..86a19cd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,7 @@ All notable changes to similar are documented here. * Added grapheme and character level diffing utilities. * `DiffOp::as_tag_tuple` is now taking the argument by reference. * Added `TextDiff::ratio`. +* Added `get_close_matches`. ## 0.2.0 diff --git a/src/text.rs b/src/text.rs index 627dadc..f0e6f8c 100644 --- a/src/text.rs +++ b/src/text.rs @@ -49,6 +49,7 @@ //! this even works for very long files if paired with this method. #![cfg(feature = "text")] use std::borrow::Cow; +use std::collections::BinaryHeap; use std::fmt; use std::io; use std::ops::Range; @@ -270,13 +271,12 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { /// /// This requires the `unicode` feature. #[cfg(feature = "unicode")] - pub fn from_graphemes(&self, old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> { + pub fn from_graphemes(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> { Self::configure().diff_graphemes(old, new) } /// Creates a diff of arbitrary slices. pub fn from_slices( - &self, old: &'bufs [&'old str], new: &'bufs [&'new str], ) -> TextDiff<'old, 'new, 'bufs> { @@ -571,6 +571,16 @@ fn split_graphemes(s: &str) -> impl Iterator<Item = &str> { unicode_segmentation::UnicodeSegmentation::graphemes(s, true) } +// quick and dirty way to get an upper sequence ratio. +fn upper_seq_ratio<T: PartialEq>(seq1: &[T], seq2: &[T]) -> f32 { + let n = seq1.len() + seq2.len(); + if n == 0 { + 1.0 + } else { + 2.0 * seq1.len().min(seq2.len()) as f32 / n as f32 + } +} + /// Quick way to get a unified diff as string. 
pub fn unified_diff<'old, 'new>( alg: Algorithm, @@ -585,6 +595,51 @@ pub fn unified_diff<'old, 'new>( .unified_diff(n, header) } +/// Use the text differ to find `n` close matches. +/// +/// `cutoff` defines the threshold which needs to be reached for a word +/// to be considered similar. See [`TextDiff::ratio`] for more information. +/// +/// ``` +/// # use similar::text::get_close_matches; +/// let matches = get_close_matches( +/// "appel", +/// &["ape", "apple", "peach", "puppy"][..], +/// 3, +/// 0.6 +/// ); +/// assert_eq!(matches, vec!["apple", "ape"]); +/// ``` +pub fn get_close_matches<'a>( + word: &str, + possibilities: &[&'a str], + n: usize, + cutoff: f32, +) -> Vec<&'a str> { + let mut matches = BinaryHeap::new(); + let seq1 = split_chars(word).collect::<Vec<_>>(); + for &possibility in possibilities { + let seq2 = split_chars(possibility).collect::<Vec<_>>(); + if upper_seq_ratio(&seq1, &seq2) < cutoff { + continue; + } + let diff = TextDiff::from_slices(&seq1, &seq2); + let ratio = diff.ratio(); + if ratio >= cutoff { + matches.push(((ratio * u32::MAX as f32) as u32, possibility)); + } + } + let mut rv = vec![]; + for _ in 0..n { + if let Some((_, elt)) = matches.pop() { + rv.push(elt); + } else { + break; + } + } + rv +} + #[test] fn test_split_lines() { assert_eq!( @@ -668,3 +723,9 @@ fn test_ratio() { let diff = TextDiff::from_chars("", ""); assert_eq!(diff.ratio(), 1.0); } + +#[test] +fn test_get_close_matches() { + let matches = get_close_matches("appel", &["ape", "apple", "peach", "puppy"][..], 3, 0.6); + assert_eq!(matches, vec!["apple", "ape"]); +}