From 8a8e1c4822b65e50af35ea0eebf9eb69b2af573d Mon Sep 17 00:00:00 2001 From: Armin Ronacher Date: Sun, 24 Jan 2021 22:14:37 +0100 Subject: [PATCH] Improvements to get_close_matches --- CHANGELOG.md | 5 +++ src/lib.rs | 1 + src/text.rs | 92 +++++++++++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 94 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 86a19cd..a27ef1f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -2,6 +2,11 @@ All notable changes to similar are documented here. +## 0.4.0 + +* Change `get_close_matches` to use Python's quick ratio optimization + and order lexicographically when tied. + ## 0.3.0 * Added grapheme and character level diffing utilities. diff --git a/src/lib.rs b/src/lib.rs index 70a1866..1c33e6c 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -1,6 +1,7 @@ //! This crate implements diffing utilities. It attempts to provide an abstraction //! interface over different types of diffing algorithms. It's based on the //! the diff algorithm implementations of [pijul](https://pijul.org/). +//! //! ```rust //! # #[cfg(feature = "text")] { //! use similar::text::TextDiff; diff --git a/src/text.rs b/src/text.rs index f0e6f8c..ba75b8a 100644 --- a/src/text.rs +++ b/src/text.rs @@ -8,6 +8,9 @@ //! It can produce a unified diff and also let you iterate over the changeset //! directly if you want. //! +//! Text diffing is available by default but can be disabled by turning off the +//! default features. The feature to enable to get it back is `text`. +//! //! ## Examples //! //! A super simple example for how to generate a unified diff with three lines @@ -49,7 +52,8 @@ //! this even works for very long files if paired with this method. 
#![cfg(feature = "text")] use std::borrow::Cow; -use std::collections::BinaryHeap; +use std::cmp::Reverse; +use std::collections::{BinaryHeap, HashMap}; use std::fmt; use std::io; use std::ops::Range; @@ -253,22 +257,30 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { } /// Creates a diff of lines. + /// + /// Equivalent to `TextDiff::configure().diff_lines(old, new)`. pub fn from_lines(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> { Self::configure().diff_lines(old, new) } /// Creates a diff of words. + /// + /// Equivalent to `TextDiff::configure().diff_words(old, new)`. pub fn from_words(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> { Self::configure().diff_words(old, new) } /// Creates a diff of chars. + /// + /// Equivalent to `TextDiff::configure().diff_chars(old, new)`. pub fn from_chars(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> { Self::configure().diff_chars(old, new) } /// Creates a diff of graphemes. /// + /// Equivalent to `TextDiff::configure().diff_graphemes(old, new)`. + /// /// This requires the `unicode` feature. #[cfg(feature = "unicode")] pub fn from_graphemes(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> { @@ -276,6 +288,8 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { } /// Creates a diff of arbitrary slices. + /// + /// Equivalent to `TextDiff::configure().diff_slices(old, new)`. pub fn from_slices( old: &'bufs [&'old str], new: &'bufs [&'new str], @@ -289,6 +303,9 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { } /// Returns `true` if items in the slice are newline terminated. + /// + /// This flag is used by the unified diff writer to determine if extra + /// newlines have to be added. pub fn newline_terminated(&self) -> bool { self.newline_terminated } @@ -304,6 +321,15 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { } /// Return a measure of the sequences' similarity in the range `0..=1`. 
+ /// + /// A ratio of `1.0` means the two sequences are a complete match, a + /// ratio of `0.0` would indicate completely distinct sequences. + /// + /// ```rust + /// # use similar::text::TextDiff; + /// let diff = TextDiff::from_chars("abcd", "bcde"); + /// assert_eq!(diff.ratio(), 0.75); + /// ``` pub fn ratio(&self) -> f32 { let matches = self .ops() @@ -581,6 +607,47 @@ fn upper_seq_ratio(seq1: &[T], seq2: &[T]) -> f32 { } } +/// Internal utility to calculate an upper bound for a ratio for +/// [`get_close_matches`]. This is based on Python's difflib approach +/// of considering the two sets to be multisets. +/// +/// It counts the number of matches without regard to order, which is an +/// obvious upper bound. +struct QuickSeqRatio<'a>(HashMap<&'a str, i32>); + +impl<'a> QuickSeqRatio<'a> { + pub fn new(seq: &[&'a str]) -> QuickSeqRatio<'a> { + let mut counts = HashMap::new(); + for &word in seq { + *counts.entry(word).or_insert(0) += 1; + } + QuickSeqRatio(counts) + } + + pub fn calc(&self, seq: &[&str]) -> f32 { + let n = self.0.len() + seq.len(); + if n == 0 { + return 1.0; + } + + let mut available = HashMap::new(); + let mut matches = 0; + for &word in seq { + let x = if let Some(count) = available.get(&word) { + *count + } else { + self.0.get(&word).copied().unwrap_or(0) + }; + available.insert(word, x - 1); + if x > 0 { + matches += 1; + } + } + + 2.0 * matches as f32 / n as f32 + } +} + /// Quick way to get a unified diff as string. 
pub fn unified_diff<'old, 'new>( alg: Algorithm, @@ -618,25 +685,33 @@ pub fn get_close_matches<'a>( ) -> Vec<&'a str> { let mut matches = BinaryHeap::new(); let seq1 = split_chars(word).collect::<Vec<_>>(); + let quick_ratio = QuickSeqRatio::new(&seq1); + for &possibility in possibilities { let seq2 = split_chars(possibility).collect::<Vec<_>>(); - if upper_seq_ratio(&seq1, &seq2) < cutoff { + + if upper_seq_ratio(&seq1, &seq2) < cutoff || quick_ratio.calc(&seq2) < cutoff { continue; } + let diff = TextDiff::from_slices(&seq1, &seq2); let ratio = diff.ratio(); if ratio >= cutoff { - matches.push(((ratio * u32::MAX as f32) as u32, possibility)); + // we're wrapping the word itself in `Reverse` so that matches with + // the same ratio are ordered lexicographically. + matches.push(((ratio * u32::MAX as f32) as u32, Reverse(possibility))); } } + let mut rv = vec![]; for _ in 0..n { if let Some((_, elt)) = matches.pop() { - rv.push(elt); + rv.push(elt.0); } else { break; } } + rv } @@ -728,4 +803,13 @@ fn test_ratio() { fn test_get_close_matches() { let matches = get_close_matches("appel", &["ape", "apple", "peach", "puppy"][..], 3, 0.6); assert_eq!(matches, vec!["apple", "ape"]); + let matches = get_close_matches( + "hulo", + &[ + "hi", "hulu", "hali", "hoho", "amaz", "zulo", "blah", "hopp", "uulo", "aulo", + ][..], + 5, + 0.7, + ); + assert_eq!(matches, vec!["aulo", "hulu", "uulo", "zulo"]); }