Added support for unicode word splitting and changed inline to use it

2021-01-31 22:13:56 +01:00 · 2021-01-31 22:13:56 +01:00 · 0a4dd224f5
commit 0a4dd224f5
parent d0dd42e4af
6 changed files with 66 additions and 22 deletions
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -12,6 +12,7 @@ All notable changes to similar are documented here.
  does not show any changes.
 * Added inline diff highlighting support.
 * Changed word splitting to split into words and whitespace.
+* Added support for unicode-based word splitting (`TextDiff::from_unicode_words`).

 ## 0.4.0

--- a/Cargo.toml
+++ b/Cargo.toml
@ -15,6 +15,7 @@ all-features = true
 [features]
 default = ["text"]
 text = []
+inline = ["unicode"]
 unicode = ["text", "unicode-segmentation"]

 [dev-dependencies]
@ -30,7 +31,7 @@ required-features = ["text"]

 [[example]]
 name = "terminal-inline"
-required-features = ["text"]
+required-features = ["text", "inline"]

 [[example]]
 name = "udiff"
--- a/src/lib.rs
+++ b/src/lib.rs
@ -38,11 +38,14 @@
 //! cases it's useful to pull in extra functionality.  Likewise you can turn
 //! off some functionality.
 //!
+//! * `text`: this feature is enabled by default and enables the [`text`] module.
+//!   If the crate is used without default features it's removed.
 //! * `unicode`: when this feature is enabled the text diffing functionality
 //!   gains the ability to diff on a grapheme instead of character level.  This
 //!   is particularly useful when working with text containing emojis.
-//! * `text`: this feature is enabled by default and enables the [`text`] module.
-//!   If the crate is used without default features it's removed.
+//! * `inline`: this feature gives access to additional functionality of the
+//!   `text` module to provide inline information about which values changed
+//!   in a line diff.  This currently also enables the `unicode` feature.
 #![warn(missing_docs)]
 pub mod algorithms;
 pub mod text;
--- a/src/text/inline.rs
+++ b/src/text/inline.rs
@ -1,9 +1,10 @@
+#![cfg(feature = "inline")]
 use std::{fmt, iter};

 use crate::algorithms::{Algorithm, DiffOp, DiffTag};
 use crate::text::{Change, ChangeTag, TextDiff};

-use super::split_words;
+use super::split_unicode_words;

 use std::ops::Range;

@ -118,8 +119,8 @@ pub(crate) fn iter_inline_changes<'diff>(
                (ChangeTag::Delete, Some(ChangeTag::Insert)) => {
                    let old_value = change.value();
                    let new_value = next_change.unwrap().value();
-                    let old_chars = split_words(&old_value).collect::<Vec<_>>();
-                    let new_chars = split_words(&new_value).collect::<Vec<_>>();
+                    let old_chars = split_unicode_words(&old_value).collect::<Vec<_>>();
+                    let new_chars = split_unicode_words(&new_value).collect::<Vec<_>>();
                    let old_mindex = MultiIndex::new(&old_chars, old_value);
                    let new_mindex = MultiIndex::new(&new_chars, new_value);
                    let inline_diff = TextDiff::configure()
@ -185,3 +186,18 @@ pub(crate) fn iter_inline_changes<'diff>(
    })
    .flatten()
 }
+
+#[test]
+fn test_line_ops_inline() {
+    let diff = TextDiff::from_lines(
+        "Hello World\nsome stuff here\nsome more stuff here\n\nAha stuff here\nand more stuff",
+        "Stuff\nHello World\nsome amazing stuff here\nsome more stuff here\n",
+    );
+    assert_eq!(diff.newline_terminated(), true);
+    let changes = diff
+        .ops()
+        .iter()
+        .flat_map(|op| diff.iter_inline_changes(op))
+        .collect::<Vec<_>>();
+    insta::assert_debug_snapshot!(&changes);
+}
--- a/src/text/mod.rs
+++ b/src/text/mod.rs
@ -74,9 +74,11 @@ use std::cmp::Reverse;
 use std::collections::{BinaryHeap, HashMap};
 use std::fmt;

+#[cfg(feature = "inline")]
 mod inline;
 mod udiff;

+#[cfg(feature = "inline")]
 pub use self::inline::*;
 pub use self::udiff::*;

@ -150,6 +152,25 @@ impl TextDiffConfig {
        )
    }

+    /// Creates a diff of unicode words.
+    ///
+    /// This splits the text into words according to unicode rules.  This is
+    /// generally recommended over [`TextDiffConfig::diff_words`] but requires a dependency.
+    ///
+    /// This requires the `unicode` feature.
+    #[cfg(feature = "unicode")]
+    pub fn diff_unicode_words<'old, 'new, 'bufs>(
+        &self,
+        old: &'old str,
+        new: &'new str,
+    ) -> TextDiff<'old, 'new, 'bufs> {
+        self.diff(
+            Cow::Owned(split_unicode_words(old).collect()),
+            Cow::Owned(split_unicode_words(new).collect()),
+            false,
+        )
+    }
+
    /// Creates a diff of characters.
    pub fn diff_chars<'old, 'new, 'bufs>(
        &self,
@ -301,6 +322,16 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> {
        Self::configure().diff_words(old, new)
    }

+    /// Creates a diff of unicode words.
+    ///
+    /// Equivalent to `TextDiff::configure().diff_unicode_words(old, new)`.
+    ///
+    /// This requires the `unicode` feature.
+    #[cfg(feature = "unicode")]
+    pub fn from_unicode_words(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> {
+        Self::configure().diff_unicode_words(old, new)
+    }
+
    /// Creates a diff of chars.
    ///
    /// Equivalent to `TextDiff::configure().diff_chars(old, new)`.
@ -486,6 +517,7 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> {
    /// level diff on adjacent line replacements.  The exact behavior of
    /// this function with regards to how it detects those inline changes
    /// is currently not defined and will likely change over time.
+    #[cfg(feature = "inline")]
    pub fn iter_inline_changes(&self, op: &DiffOp) -> impl Iterator<Item = InlineChange> {
        iter_inline_changes(self, op)
    }
@ -567,6 +599,12 @@ fn split_words(s: &str) -> impl Iterator<Item = &str> {
    })
 }

+/// Splits text into words according to unicode rules.
+#[cfg(feature = "unicode")]
+fn split_unicode_words(s: &str) -> impl Iterator<Item = &str> {
+    unicode_segmentation::UnicodeSegmentation::split_word_bounds(s)
+}
+
 /// Splits text into characters.
 fn split_chars(s: &str) -> impl Iterator<Item = &str> {
    s.char_indices().map(move |(i, c)| &s[i..i + c.len_utf8()])
@ -782,21 +820,6 @@ fn test_virtual_newlines() {
    insta::assert_debug_snapshot!(&changes);
 }

-#[test]
-fn test_line_ops_inline() {
-    let diff = TextDiff::from_lines(
-        "Hello World\nsome stuff here\nsome more stuff here\n\nAha stuff here\nand more stuff",
-        "Stuff\nHello World\nsome amazing stuff here\nsome more stuff here\n",
-    );
-    assert_eq!(diff.newline_terminated(), true);
-    let changes = diff
-        .ops()
-        .iter()
-        .flat_map(|op| diff.iter_inline_changes(op))
-        .collect::<Vec<_>>();
-    insta::assert_debug_snapshot!(&changes);
-}
-
 #[test]
 fn test_char_diff() {
    let diff = TextDiff::from_chars("Hello World", "Hallo Welt");
--- a/src/text/snapshots/similartextinline__line_ops_inline.snap
+++ b/src/text/snapshots/similartextinline__line_ops_inline.snap
@ -1,5 +1,5 @@
 ---
-source: src/text/mod.rs
+source: src/text/inline.rs
 expression: "&changes"
 ---
 [