Some internal refactorings

2021-02-02 20:15:31 +01:00 · 2021-02-02 20:15:31 +01:00 · 34e5b3d571
commit 34e5b3d571
parent 4b85e70f91
8 changed files with 174 additions and 157 deletions
--- a/src/text/utils.rs
+++ b/src/text/utils.rs
@ -0,0 +1,55 @@
+use std::collections::HashMap;
+use std::hash::Hash;
+
+use super::DiffableStrRef;
+
+// quick and dirty way to get an upper sequence ratio.
+pub fn upper_seq_ratio<T: PartialEq>(seq1: &[T], seq2: &[T]) -> f32 {
+    let n = seq1.len() + seq2.len();
+    if n == 0 {
+        1.0
+    } else {
+        2.0 * seq1.len().min(seq2.len()) as f32 / n as f32
+    }
+}
+
+/// Internal utility to calculate an upper bound for a ratio for
+/// [`get_close_matches`].  This is based on Python's difflib approach
+/// of considering the two sets to be multisets.
+///
+/// It counts the number of matches without regard to order, which is an
+/// obvious upper bound.
+pub struct QuickSeqRatio<'a, T: DiffableStrRef + ?Sized>(HashMap<&'a T, i32>);
+
+impl<'a, T: DiffableStrRef + Hash + Eq + ?Sized> QuickSeqRatio<'a, T> {
+    pub fn new(seq: &[&'a T]) -> QuickSeqRatio<'a, T> {
+        let mut counts = HashMap::new();
+        for &word in seq {
+            *counts.entry(word).or_insert(0) += 1;
+        }
+        QuickSeqRatio(counts)
+    }
+
+    pub fn calc(&self, seq: &[&T]) -> f32 {
+        let n = self.0.len() + seq.len();
+        if n == 0 {
+            return 1.0;
+        }
+
+        let mut available = HashMap::new();
+        let mut matches = 0;
+        for &word in seq {
+            let x = if let Some(count) = available.get(&word) {
+                *count
+            } else {
+                self.0.get(&word).copied().unwrap_or(0)
+            };
+            available.insert(word, x - 1);
+            if x > 0 {
+                matches += 1;
+            }
+        }
+
+        2.0 * matches as f32 / n as f32
+    }
+}