diff --git a/CHANGELOG.md b/CHANGELOG.md index 49dd024..adc9c6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,8 @@ All notable changes to similar are documented here. ## 0.6.0 * Add `get_diff_ratio`. +* Add support for byte diffing and change the text interface to abstract + over `DiffableStr`. ## 0.5.0 diff --git a/Cargo.toml b/Cargo.toml index 600f678..c6ac967 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -19,7 +19,11 @@ all-features = true default = ["text"] text = [] inline = ["unicode"] -unicode = ["text", "unicode-segmentation"] + +# this annoyingly currently also turns on bstr and not just bstr/unicode +# unclear if this is fixable +unicode = ["text", "unicode-segmentation", "bstr/unicode"] +bytes = ["bstr", "text"] [dev-dependencies] insta = "1.5.2" @@ -27,6 +31,7 @@ console = "0.14.0" [dependencies] unicode-segmentation = { version = "1.7.1", optional = true } +bstr = { version = "0.2.14", optional = true, default-features = false } [[example]] name = "terminal" @@ -34,11 +39,11 @@ required-features = ["text"] [[example]] name = "terminal-inline" -required-features = ["text", "inline"] +required-features = ["text", "inline", "bytes"] [[example]] name = "udiff" -required-features = ["text"] +required-features = ["text", "bytes"] [[example]] name = "close-matches" diff --git a/Makefile b/Makefile index 1d2a11d..af6e831 100644 --- a/Makefile +++ b/Makefile @@ -10,6 +10,7 @@ test: @cargo test @cargo test --all-features @cargo test --no-default-features + @cargo test --no-default-features --features bytes format: @rustup component add rustfmt 2> /dev/null diff --git a/examples/terminal-inline.rs b/examples/terminal-inline.rs index 5bfbd27..ba6ab50 100644 --- a/examples/terminal-inline.rs +++ b/examples/terminal-inline.rs @@ -1,5 +1,5 @@ use std::fmt; -use std::fs::read_to_string; +use std::fs::read; use std::process::exit; use console::{style, Style}; @@ -23,8 +23,8 @@ fn main() { exit(1); } - let old = read_to_string(&args[1]).unwrap(); - let new = 
read_to_string(&args[2]).unwrap(); + let old = read(&args[1]).unwrap(); + let new = read(&args[2]).unwrap(); let diff = TextDiff::from_lines(&old, &new); for (idx, group) in diff.grouped_ops(3).iter().enumerate() { @@ -44,7 +44,7 @@ fn main() { style(Line(change.new_index())).dim(), s.apply_to(sign).bold(), ); - for &(emphasized, value) in change.values() { + for (emphasized, value) in change.iter_strings() { if emphasized { print!("{}", s.apply_to(value).underlined().on_black()); } else { diff --git a/examples/terminal.rs b/examples/terminal.rs index a0a4761..26cc49a 100644 --- a/examples/terminal.rs +++ b/examples/terminal.rs @@ -14,7 +14,7 @@ fn main() { ChangeTag::Insert => ("+", Style::new().green()), ChangeTag::Equal => (" ", Style::new()), }; - print!("{}{}", style.apply_to(sign).bold(), style.apply_to(change),); + print!("{}{}", style.apply_to(sign).bold(), style.apply_to(change)); } } } diff --git a/examples/udiff.rs b/examples/udiff.rs index 8d62071..d46f46c 100644 --- a/examples/udiff.rs +++ b/examples/udiff.rs @@ -1,4 +1,5 @@ -use std::fs::read_to_string; +use std::fs::read; +use std::io; use std::process::exit; use similar::text::TextDiff; @@ -10,13 +11,14 @@ fn main() { exit(1); } - let old = read_to_string(&args[1]).unwrap(); - let new = read_to_string(&args[2]).unwrap(); - print!( - "{}", - TextDiff::from_lines(&old, &new).unified_diff().header( + let old = read(&args[1]).unwrap(); + let new = read(&args[2]).unwrap(); + TextDiff::from_lines(&old, &new) + .unified_diff() + .header( &args[1].as_os_str().to_string_lossy(), - &args[2].as_os_str().to_string_lossy() + &args[2].as_os_str().to_string_lossy(), ) - ); + .to_writer(io::stdout()) + .unwrap(); } diff --git a/src/lib.rs b/src/lib.rs index fb0bd1f..ae2d1a4 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -43,6 +43,8 @@ //! * `unicode`: when this feature is enabled the text diffing functionality //! gains the ability to diff on a grapheme instead of character level. This //! 
is particularly useful when working with text containing emojis. +//! * `bytes`: when this feature is enabled the text module gains support for +//! working with byte slices. //! * `inline`: this feature gives access to additional functionality of the //! `text` module to provide inline information about which values changed //! in a line diff. This currently also enables the `unicode` feature. diff --git a/src/text/abstraction.rs b/src/text/abstraction.rs new file mode 100644 index 0000000..3a5d058 --- /dev/null +++ b/src/text/abstraction.rs @@ -0,0 +1,425 @@ +#[cfg(feature = "bytes")] +use bstr::ByteSlice; + +use std::borrow::Cow; +use std::hash::Hash; +use std::ops::Range; + +/// Reference to a [`DiffableStr`]. +pub trait DiffableStrRef { + /// The type of the resolved [`DiffableStr`]. + type Output: DiffableStr + ?Sized; + + /// Resolves the reference. + fn as_diffable_str(&self) -> &Self::Output; +} + +impl DiffableStrRef for T { + type Output = T; + + fn as_diffable_str(&self) -> &T { + self + } +} + +impl DiffableStrRef for String { + type Output = str; + + fn as_diffable_str(&self) -> &str { + self.as_str() + } +} + +impl<'a, T: DiffableStr + ?Sized> DiffableStrRef for Cow<'a, T> { + type Output = T; + + fn as_diffable_str(&self) -> &T { + &self + } +} + +#[cfg(feature = "bytes")] +impl DiffableStrRef for Vec { + type Output = [u8]; + + fn as_diffable_str(&self) -> &[u8] { + self.as_slice() + } +} + +/// All supported diffable strings. +/// +/// The text module can work with different types of strings depending +/// on how the crate is compiled. Out of the box `&str` is always supported +/// but with the `bytes` feature one can also work with `[u8]` slices for +/// as long as they are ASCII compatible. +pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned { + /// Splits the value into newlines with newlines attached. + fn split_lines(&self) -> Vec<&Self>; + + /// Splits the value into newlines with newlines separated. 
+ fn split_lines_and_newlines(&self) -> Vec<&Self>; + + /// Tokenizes into words. + fn split_words(&self) -> Vec<&Self>; + + /// Splits the input into characters. + fn split_chars(&self) -> Vec<&Self>; + + /// Splits into unicode words. + #[cfg(feature = "unicode")] + fn split_unicode_words(&self) -> Vec<&Self>; + + /// Splits into unicode graphemes. + #[cfg(feature = "unicode")] + fn split_graphemes(&self) -> Vec<&Self>; + + /// Returns the string as `&str` if it is valid utf-8. + fn as_str(&self) -> Option<&str>; + + /// Decodes the string (potentially) lossy. + fn as_str_lossy(&self) -> Cow<'_, str>; + + /// Checks if the string ends in a newline. + fn ends_with_newline(&self) -> bool; + + /// The length of the string. + fn len(&self) -> usize; + + /// Slices the string. + fn slice(&self, rng: Range) -> &Self; + + /// Returns the string as a slice of raw bytes. + fn as_bytes(&self) -> &[u8]; +} + +impl DiffableStr for str { + fn split_lines(&self) -> Vec<&Self> { + let mut iter = self.char_indices().peekable(); + let mut last_pos = 0; + let mut lines = vec![]; + + while let Some((idx, c)) = iter.next() { + if c == '\r' { + if iter.peek().map_or(false, |x| x.1 == '\n') { + lines.push(&self[last_pos..=idx + 1]); + iter.next(); + last_pos = idx + 2; + } else { + lines.push(&self[last_pos..=idx]); + last_pos = idx + 1; + } + } else if c == '\n' { + lines.push(&self[last_pos..=idx]); + last_pos = idx + 1; + } + } + + if last_pos < self.len() { + lines.push(&self[last_pos..]); + } + + lines + } + + fn split_lines_and_newlines(&self) -> Vec<&Self> { + let mut rv = vec![]; + let mut iter = self.char_indices().peekable(); + + while let Some((idx, c)) = iter.next() { + let is_newline = c == '\r' || c == '\n'; + let start = idx; + let mut end = idx + c.len_utf8(); + while let Some(&(_, next_char)) = iter.peek() { + if (next_char == '\r' || next_char == '\n') != is_newline { + break; + } + iter.next(); + end += next_char.len_utf8(); + } + rv.push(&self[start..end]); + } + + rv + } + + 
fn split_words(&self) -> Vec<&Self> { + let mut iter = self.char_indices().peekable(); + let mut rv = vec![]; + + while let Some((idx, c)) = iter.next() { + let is_whitespace = c.is_whitespace(); + let start = idx; + let mut end = idx + c.len_utf8(); + while let Some(&(_, next_char)) = iter.peek() { + if next_char.is_whitespace() != is_whitespace { + break; + } + iter.next(); + end += next_char.len_utf8(); + } + rv.push(&self[start..end]); + } + + rv + } + + fn split_chars(&self) -> Vec<&Self> { + self.char_indices() + .map(move |(i, c)| &self[i..i + c.len_utf8()]) + .collect() + } + + #[cfg(feature = "unicode")] + fn split_unicode_words(&self) -> Vec<&Self> { + unicode_segmentation::UnicodeSegmentation::split_word_bounds(self).collect() + } + + #[cfg(feature = "unicode")] + fn split_graphemes(&self) -> Vec<&Self> { + unicode_segmentation::UnicodeSegmentation::graphemes(self, true).collect() + } + + fn as_str(&self) -> Option<&str> { + Some(self) + } + + fn as_str_lossy(&self) -> Cow<'_, str> { + Cow::Borrowed(self) + } + + fn ends_with_newline(&self) -> bool { + self.ends_with(&['\r', '\n'][..]) + } + + fn len(&self) -> usize { + str::len(self) + } + + fn slice(&self, rng: Range) -> &Self { + &self[rng] + } + + fn as_bytes(&self) -> &[u8] { + str::as_bytes(self) + } +} + +#[cfg(feature = "bytes")] +impl DiffableStr for [u8] { + fn split_lines(&self) -> Vec<&Self> { + let mut iter = self.char_indices().peekable(); + let mut last_pos = 0; + let mut lines = vec![]; + + while let Some((_, end, c)) = iter.next() { + if c == '\r' { + if iter.peek().map_or(false, |x| x.2 == '\n') { + lines.push(&self[last_pos..end + 1]); + iter.next(); + last_pos = end + 1; + } else { + lines.push(&self[last_pos..end]); + last_pos = end; + } + } else if c == '\n' { + lines.push(&self[last_pos..end]); + last_pos = end; + } + } + + if last_pos < self.len() { + lines.push(&self[last_pos..]); + } + + lines + } + + fn split_lines_and_newlines(&self) -> Vec<&Self> { + let mut rv = vec![]; + 
let mut iter = self.char_indices().peekable(); + + while let Some((start, mut end, c)) = iter.next() { + let is_newline = c == '\r' || c == '\n'; + while let Some(&(_, new_end, next_char)) = iter.peek() { + if (next_char == '\r' || next_char == '\n') != is_newline { + break; + } + iter.next(); + end = new_end; + } + rv.push(&self[start..end]); + } + + rv + } + + fn split_words(&self) -> Vec<&Self> { + let mut iter = self.char_indices().peekable(); + let mut rv = vec![]; + + while let Some((start, mut end, c)) = iter.next() { + let is_whitespace = c.is_whitespace(); + while let Some(&(_, new_end, next_char)) = iter.peek() { + if next_char.is_whitespace() != is_whitespace { + break; + } + iter.next(); + end = new_end; + } + rv.push(&self[start..end]); + } + + rv + } + + #[cfg(feature = "unicode")] + fn split_unicode_words(&self) -> Vec<&Self> { + self.words_with_breaks().map(|x| x.as_bytes()).collect() + } + + #[cfg(feature = "unicode")] + fn split_graphemes(&self) -> Vec<&Self> { + self.graphemes().map(|x| x.as_bytes()).collect() + } + + fn split_chars(&self) -> Vec<&Self> { + self.char_indices() + .map(move |(start, end, _)| &self[start..end]) + .collect() + } + + fn as_str(&self) -> Option<&str> { + std::str::from_utf8(self).ok() + } + + fn as_str_lossy(&self) -> Cow<'_, str> { + String::from_utf8_lossy(self) + } + + fn ends_with_newline(&self) -> bool { + matches!(self.last_byte(), Some(b'\r') | Some(b'\n')) + } + + fn len(&self) -> usize { + <[u8]>::len(self) + } + + fn slice(&self, rng: Range) -> &Self { + &self[rng] + } + + fn as_bytes(&self) -> &[u8] { + self + } +} + +#[test] +fn test_split_lines() { + assert_eq!( + DiffableStr::split_lines("first\nsecond\rthird\r\nfourth\nlast"), + vec!["first\n", "second\r", "third\r\n", "fourth\n", "last"] + ); + assert_eq!(DiffableStr::split_lines("\n\n"), vec!["\n", "\n"]); + assert_eq!(DiffableStr::split_lines("\n"), vec!["\n"]); + assert!(DiffableStr::split_lines("").is_empty()); +} + +#[test] +fn test_split_words() { 
+ assert_eq!( + DiffableStr::split_words("foo bar baz\n\n aha"), + ["foo", " ", "bar", " ", "baz", "\n\n ", "aha"] + ); +} + +#[test] +fn test_split_chars() { + assert_eq!( + DiffableStr::split_chars("abcfö❄️"), + vec!["a", "b", "c", "f", "ö", "❄", "\u{fe0f}"] + ); +} + +#[test] +#[cfg(feature = "unicode")] +fn test_split_graphemes() { + assert_eq!( + DiffableStr::split_graphemes("abcfö❄️"), + vec!["a", "b", "c", "f", "ö", "❄️"] + ); +} + +#[test] +#[cfg(feature = "bytes")] +fn test_split_lines_bytes() { + assert_eq!( + DiffableStr::split_lines("first\nsecond\rthird\r\nfourth\nlast".as_bytes()), + vec![ + "first\n".as_bytes(), + "second\r".as_bytes(), + "third\r\n".as_bytes(), + "fourth\n".as_bytes(), + "last".as_bytes() + ] + ); + assert_eq!( + DiffableStr::split_lines("\n\n".as_bytes()), + vec!["\n".as_bytes(), "\n".as_bytes()] + ); + assert_eq!( + DiffableStr::split_lines("\n".as_bytes()), + vec!["\n".as_bytes()] + ); + assert!(DiffableStr::split_lines("".as_bytes()).is_empty()); +} + +#[test] +#[cfg(feature = "bytes")] +fn test_split_words_bytes() { + assert_eq!( + DiffableStr::split_words("foo bar baz\n\n aha".as_bytes()), + [ + &b"foo"[..], + &b" "[..], + &b"bar"[..], + &b" "[..], + &b"baz"[..], + &b"\n\n "[..], + &b"aha"[..] 
+ ] + ); +} + +#[test] +#[cfg(feature = "bytes")] +fn test_split_chars_bytes() { + assert_eq!( + DiffableStr::split_chars("abcfö❄️".as_bytes()), + vec![ + &b"a"[..], + &b"b"[..], + &b"c"[..], + &b"f"[..], + "ö".as_bytes(), + "❄".as_bytes(), + "\u{fe0f}".as_bytes() + ] + ); +} + +#[test] +#[cfg(all(feature = "bytes", feature = "unicode"))] +fn test_split_graphemes_bytes() { + assert_eq!( + DiffableStr::split_graphemes("abcfö❄️".as_bytes()), + vec![ + &b"a"[..], + &b"b"[..], + &b"c"[..], + &b"f"[..], + "ö".as_bytes(), + "❄️".as_bytes() + ] + ); +} diff --git a/src/text/inline.rs b/src/text/inline.rs index 8e0feba..3a55af5 100644 --- a/src/text/inline.rs +++ b/src/text/inline.rs @@ -1,24 +1,23 @@ #![cfg(feature = "inline")] +use std::borrow::Cow; use std::fmt; use crate::algorithms::{capture_diff, get_diff_ratio, Algorithm, DiffOp, DiffTag}; -use crate::text::{Change, ChangeTag, TextDiff}; - -use super::split_unicode_words; +use crate::text::{Change, ChangeTag, DiffableStr, TextDiff}; use std::ops::Index; -struct MultiLookup<'bufs, 's> { - strings: &'bufs [&'s str], - seqs: Vec<(&'s str, usize, usize)>, +struct MultiLookup<'bufs, 's, T: DiffableStr + ?Sized> { + strings: &'bufs [&'s T], + seqs: Vec<(&'s T, usize, usize)>, } -impl<'bufs, 's> MultiLookup<'bufs, 's> { - fn new(strings: &'bufs [&'s str]) -> MultiLookup<'bufs, 's> { +impl<'bufs, 's, T: DiffableStr + ?Sized> MultiLookup<'bufs, 's, T> { + fn new(strings: &'bufs [&'s T]) -> MultiLookup<'bufs, 's, T> { let mut seqs = Vec::new(); for (string_idx, string) in strings.iter().enumerate() { let mut offset = 0; - for word in split_unicode_words(string) { + for word in string.split_unicode_words() { seqs.push((word, string_idx, offset)); offset += word.len(); } @@ -30,7 +29,7 @@ impl<'bufs, 's> MultiLookup<'bufs, 's> { self.seqs.len() } - fn get_original_slices(&self, idx: usize, len: usize) -> Vec<(usize, &'s str)> { + fn get_original_slices(&self, idx: usize, len: usize) -> Vec<(usize, &'s T)> { let mut last = None; 
let mut rv = Vec::new(); @@ -44,7 +43,8 @@ impl<'bufs, 's> MultiLookup<'bufs, 's> { } else { rv.push(( last_str_idx, - &self.strings[last_str_idx][start_char_idx..start_char_idx + last_len], + self.strings[last_str_idx] + .slice(start_char_idx..start_char_idx + last_len), )); Some((str_idx, char_idx, s.len())) } @@ -55,7 +55,7 @@ impl<'bufs, 's> MultiLookup<'bufs, 's> { if let Some((str_idx, start_char_idx, len)) = last { rv.push(( str_idx, - &self.strings[str_idx][start_char_idx..start_char_idx + len], + self.strings[str_idx].slice(start_char_idx..start_char_idx + len), )); } @@ -63,43 +63,26 @@ impl<'bufs, 's> MultiLookup<'bufs, 's> { } } -impl<'bufs, 's> Index for MultiLookup<'bufs, 's> { - type Output = str; +impl<'bufs, 's, T: DiffableStr + ?Sized> Index for MultiLookup<'bufs, 's, T> { + type Output = T; fn index(&self, index: usize) -> &Self::Output { &self.seqs[index].0 } } -fn partition_newlines(s: &str) -> impl Iterator { - let mut iter = s.char_indices().peekable(); - - std::iter::from_fn(move || { - if let Some((idx, c)) = iter.next() { - let is_newline = c == '\r' || c == '\n'; - let start = idx; - let mut end = idx + c.len_utf8(); - while let Some(&(_, next_char)) = iter.peek() { - if (next_char == '\r' || next_char == '\n') != is_newline { - break; - } - iter.next(); - end += next_char.len_utf8(); - } - Some((&s[start..end], is_newline)) - } else { - None - } - }) -} - -fn push_values<'s>(v: &mut Vec>, idx: usize, emphasized: bool, s: &'s str) { +fn push_values<'s, T: DiffableStr + ?Sized>( + v: &mut Vec>, + idx: usize, + emphasized: bool, + s: &'s T, +) { v.resize_with(v.len().max(idx + 1), Vec::new); // newlines cause all kinds of wacky stuff if they end up highlighted. // because of this we want to unemphasize all newlines we encounter. 
if emphasized { - for (seg, is_nl) in partition_newlines(s) { - v[idx].push((!is_nl, seg)); + for seg in s.split_lines_and_newlines() { + v[idx].push((!seg.ends_with_newline(), seg)); } } else { v[idx].push((false, s)); @@ -110,15 +93,15 @@ fn push_values<'s>(v: &mut Vec>, idx: usize, emphasized: bo /// /// This is like [`Change`] but with inline highlight info. #[derive(Debug, PartialEq, Eq, Hash, Clone, Ord, PartialOrd)] -pub struct InlineChange<'s> { +pub struct InlineChange<'s, T: DiffableStr + ?Sized> { tag: ChangeTag, old_index: Option, new_index: Option, - values: Vec<(bool, &'s str)>, + values: Vec<(bool, &'s T)>, missing_newline: bool, } -impl<'s> InlineChange<'s> { +impl<'s, T: DiffableStr + ?Sized> InlineChange<'s, T> { /// Returns the change tag. pub fn tag(&self) -> ChangeTag { self.tag @@ -135,10 +118,23 @@ impl<'s> InlineChange<'s> { } /// Returns the changed values. - pub fn values(&self) -> &[(bool, &'s str)] { + /// + /// Each item is a tuple in the form `(emphasized, value)` where `emphasized` + /// is true if it should be highlighted as an inline diff. + pub fn values(&self) -> &[(bool, &'s T)] { &self.values } + /// Iterates over all (potentially lossy) utf-8 decoded values. + /// + /// Each item is a tuple in the form `(emphasized, value)` where `emphasized` + /// is true if it should be highlighted as an inline diff. + pub fn iter_strings(&self) -> impl Iterator)> { + self.values() + .iter() + .map(|(emphasized, raw_value)| (*emphasized, raw_value.as_str_lossy())) + } + /// Returns `true` if this change needs to be followed up by a /// missing newline. 
pub fn missing_newline(&self) -> bool { @@ -146,8 +142,8 @@ impl<'s> InlineChange<'s> { } } -impl<'s> From> for InlineChange<'s> { - fn from(change: Change<'s>) -> InlineChange<'s> { +impl<'s, T: DiffableStr + ?Sized> From> for InlineChange<'s, T> { + fn from(change: Change<'s, T>) -> InlineChange<'s, T> { InlineChange { tag: change.tag(), old_index: change.old_index(), @@ -158,9 +154,9 @@ impl<'s> From> for InlineChange<'s> { } } -impl<'s> fmt::Display for InlineChange<'s> { +impl<'s, T: DiffableStr + ?Sized> fmt::Display for InlineChange<'s, T> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for &(emphasized, value) in &self.values { + for (emphasized, value) in self.iter_strings() { let marker = match (emphasized, self.tag) { (false, _) | (true, ChangeTag::Equal) => "", (true, ChangeTag::Delete) => "-", @@ -175,10 +171,13 @@ impl<'s> fmt::Display for InlineChange<'s> { } } -pub(crate) fn iter_inline_changes<'diff>( - diff: &'diff TextDiff, +pub(crate) fn iter_inline_changes<'diff, T>( + diff: &'diff TextDiff<'_, '_, '_, T>, op: &DiffOp, -) -> impl Iterator> { +) -> impl Iterator> +where + T: DiffableStr + ?Sized, +{ let newline_terminated = diff.newline_terminated; let (tag, old_range, new_range) = op.as_tag_tuple(); @@ -267,7 +266,7 @@ pub(crate) fn iter_inline_changes<'diff>( if newline_terminated && !old_slices.is_empty() - && !old_slices[old_slices.len() - 1].ends_with(&['\r', '\n'][..]) + && !old_slices[old_slices.len() - 1].ends_with_newline() { if let Some(last) = rv.last_mut() { last.missing_newline = true; @@ -287,7 +286,7 @@ pub(crate) fn iter_inline_changes<'diff>( if newline_terminated && !new_slices.is_empty() - && !new_slices[new_slices.len() - 1].ends_with(&['\r', '\n'][..]) + && !new_slices[new_slices.len() - 1].ends_with_newline() { if let Some(last) = rv.last_mut() { last.missing_newline = true; diff --git a/src/text/mod.rs b/src/text/mod.rs index b95589d..e588151 100644 --- a/src/text/mod.rs +++ b/src/text/mod.rs @@ -11,7 +11,7 
@@ //! Text diffing is available by default but can be disabled by turning off the //! default features. The feature to enable to get it back is `text`. //! -//! ## Examples +//! # Examples //! //! A super simple example for how to generate a unified diff with three lines //! off context around the changes: @@ -38,7 +38,7 @@ //! } //! ``` //! -//! ## Ops vs Changes +//! # Ops vs Changes //! //! Because very commonly two compared sequences will largely match this module //! splits it's functionality into two layers. The first is inherited from the @@ -51,7 +51,7 @@ //! Because the [`TextDiff::grouped_ops`] method can isolate clusters of changes //! this even works for very long files if paired with this method. //! -//! ## Trailing Newlines +//! # Trailing Newlines //! //! When working with line diffs (and unified diffs in general) there are two //! "philosophies" to look at lines. One is to diff lines without their newline @@ -68,11 +68,30 @@ //! either rendering a virtual newline at that position or to indicate it in //! different ways. For instance the unified diff code will render the special //! `\ No newline at end of file` marker. +//! +//! # Bytes vs Unicode +//! +//! This module concerns itself with a looser definition of "text" than you would +//! normally see in Rust. While by default it can only operate on [`str`] types +//! by enabling the `bytes` feature it gains support for byte slices with some +//! caveats. +//! +//! A lot of text diff functionality assumes that what is being diffed constitutes +//! text, but in the real world it can often be challenging to ensure that this is +//! all valid utf-8. Because of this the crate is built so that most functionality +//! also still works with bytes for as long as they are roughly ASCII compatible. +//! +//! This means you will be successful in creating a unified diff from latin1 +//! encoded bytes but if you try to do the same with EBCDIC encoded bytes you +//! will only get garbage. 
#![cfg(feature = "text")] use std::borrow::Cow; use std::cmp::Reverse; use std::collections::{BinaryHeap, HashMap}; use std::fmt; +use std::hash::Hash; + +mod abstraction; #[cfg(feature = "inline")] mod inline; @@ -82,6 +101,8 @@ mod udiff; pub use self::inline::*; pub use self::udiff::*; +pub use crate::text::abstraction::*; + use crate::algorithms::{ capture_diff_slices, get_diff_ratio, group_diff_ops, Algorithm, DiffOp, DiffTag, }; @@ -127,14 +148,14 @@ impl TextDiffConfig { /// /// This splits the text `old` and `new` into lines preserving newlines /// in the input. - pub fn diff_lines<'old, 'new, 'bufs>( + pub fn diff_lines<'old, 'new, 'bufs, T: DiffableStrRef + ?Sized>( &self, - old: &'old str, - new: &'new str, - ) -> TextDiff<'old, 'new, 'bufs> { + old: &'old T, + new: &'new T, + ) -> TextDiff<'old, 'new, 'bufs, T::Output> { self.diff( - Cow::Owned(split_lines(old).collect()), - Cow::Owned(split_lines(new).collect()), + Cow::Owned(old.as_diffable_str().split_lines()), + Cow::Owned(new.as_diffable_str().split_lines()), true, ) } @@ -142,14 +163,27 @@ impl TextDiffConfig { /// Creates a diff of words. /// /// This splits the text into words and whitespace. - pub fn diff_words<'old, 'new, 'bufs>( + pub fn diff_words<'old, 'new, 'bufs, T: DiffableStrRef + ?Sized>( &self, - old: &'old str, - new: &'new str, - ) -> TextDiff<'old, 'new, 'bufs> { + old: &'old T, + new: &'new T, + ) -> TextDiff<'old, 'new, 'bufs, T::Output> { self.diff( - Cow::Owned(split_words(old).collect()), - Cow::Owned(split_words(new).collect()), + Cow::Owned(old.as_diffable_str().split_words()), + Cow::Owned(new.as_diffable_str().split_words()), + false, + ) + } + + /// Creates a diff of characters. 
+ pub fn diff_chars<'old, 'new, 'bufs, T: DiffableStrRef + ?Sized>( + &self, + old: &'old T, + new: &'new T, + ) -> TextDiff<'old, 'new, 'bufs, T::Output> { + self.diff( + Cow::Owned(old.as_diffable_str().split_chars()), + Cow::Owned(new.as_diffable_str().split_chars()), false, ) } @@ -162,27 +196,14 @@ impl TextDiffConfig { /// /// This requires the `unicode` feature. #[cfg(feature = "unicode")] - pub fn diff_unicode_words<'old, 'new, 'bufs>( + pub fn diff_unicode_words<'old, 'new, 'bufs, T: DiffableStrRef + ?Sized>( &self, - old: &'old str, - new: &'new str, - ) -> TextDiff<'old, 'new, 'bufs> { + old: &'old T, + new: &'new T, + ) -> TextDiff<'old, 'new, 'bufs, T::Output> { self.diff( - Cow::Owned(split_unicode_words(old).collect()), - Cow::Owned(split_unicode_words(new).collect()), - false, - ) - } - - /// Creates a diff of characters. - pub fn diff_chars<'old, 'new, 'bufs>( - &self, - old: &'old str, - new: &'new str, - ) -> TextDiff<'old, 'new, 'bufs> { - self.diff( - Cow::Owned(split_chars(old).collect()), - Cow::Owned(split_chars(new).collect()), + Cow::Owned(old.as_diffable_str().split_unicode_words()), + Cow::Owned(new.as_diffable_str().split_unicode_words()), false, ) } @@ -191,33 +212,33 @@ impl TextDiffConfig { /// /// This requires the `unicode` feature. #[cfg(feature = "unicode")] - pub fn diff_graphemes<'old, 'new, 'bufs>( + pub fn diff_graphemes<'old, 'new, 'bufs, T: DiffableStrRef + ?Sized>( &self, - old: &'old str, - new: &'new str, - ) -> TextDiff<'old, 'new, 'bufs> { + old: &'old T, + new: &'new T, + ) -> TextDiff<'old, 'new, 'bufs, T::Output> { self.diff( - Cow::Owned(split_graphemes(old).collect()), - Cow::Owned(split_graphemes(new).collect()), + Cow::Owned(old.as_diffable_str().split_graphemes()), + Cow::Owned(new.as_diffable_str().split_graphemes()), false, ) } /// Creates a diff of arbitrary slices. 
- pub fn diff_slices<'old, 'new, 'bufs>( + pub fn diff_slices<'old, 'new, 'bufs, T: DiffableStr + ?Sized>( &self, - old: &'bufs [&'old str], - new: &'bufs [&'new str], - ) -> TextDiff<'old, 'new, 'bufs> { + old: &'bufs [&'old T], + new: &'bufs [&'new T], + ) -> TextDiff<'old, 'new, 'bufs, T> { self.diff(Cow::Borrowed(old), Cow::Borrowed(new), false) } - fn diff<'old, 'new, 'bufs>( + fn diff<'old, 'new, 'bufs, T: DiffableStr + ?Sized>( &self, - old: Cow<'bufs, [&'old str]>, - new: Cow<'bufs, [&'new str]>, + old: Cow<'bufs, [&'old T]>, + new: Cow<'bufs, [&'new T]>, newline_terminated: bool, - ) -> TextDiff<'old, 'new, 'bufs> { + ) -> TextDiff<'old, 'new, 'bufs, T> { let ops = capture_diff_slices(self.algorithm, &old, &new); TextDiff { old, @@ -230,9 +251,9 @@ impl TextDiffConfig { } /// Captures diff op codes for textual diffs -pub struct TextDiff<'old, 'new, 'bufs> { - old: Cow<'bufs, [&'old str]>, - new: Cow<'bufs, [&'new str]>, +pub struct TextDiff<'old, 'new, 'bufs, T: DiffableStr + ?Sized> { + old: Cow<'bufs, [&'old T]>, + new: Cow<'bufs, [&'new T]>, ops: Vec, newline_terminated: bool, algorithm: Algorithm, @@ -255,26 +276,26 @@ pub enum ChangeTag { /// exists so that it's more convenient to work with textual differences as /// the underlying [`DiffOp`] does not know anything about strings. #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Ord, PartialOrd)] -pub struct Change<'s> { +pub struct Change<'s, T: DiffableStr + ?Sized> { tag: ChangeTag, old_index: Option, new_index: Option, - value: &'s str, + value: &'s T, missing_newline: bool, } -impl<'s> fmt::Display for Change<'s> { +impl<'s, T: DiffableStr + ?Sized> fmt::Display for Change<'s, T> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { write!( f, "{}{}", - self.value(), + self.as_str_lossy(), if self.missing_newline { "\n" } else { "" } ) } } -impl<'s> Change<'s> { +impl<'s, T: DiffableStr + ?Sized> Change<'s, T> { /// Returns the change tag. 
pub fn tag(&self) -> ChangeTag { self.tag @@ -290,11 +311,21 @@ impl<'s> Change<'s> { self.new_index } - /// Returns the changed value. - pub fn value(&self) -> &'s str { + /// Returns the underlying changed value. + pub fn value(&self) -> &'s T { self.value } + /// Returns the value as string if it is utf-8. + pub fn as_str(&self) -> Option<&'s str> { + T::as_str(self.value) + } + + /// Returns the value (lossy) decoded as utf-8 string. + pub fn as_str_lossy(&self) -> Cow<'s, str> { + T::as_str_lossy(self.value) + } + /// Returns `true` if this change needs to be followed up by a /// missing newline. /// @@ -305,7 +336,7 @@ impl<'s> Change<'s> { } } -impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { +impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs, str> { /// Configures a text differ before diffing. pub fn configure() -> TextDiffConfig { TextDiffConfig::default() @@ -314,15 +345,31 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { /// Creates a diff of lines. /// /// Equivalent to `TextDiff::configure().diff_lines(old, new)`. - pub fn from_lines(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> { - Self::configure().diff_lines(old, new) + pub fn from_lines( + old: &'old T, + new: &'new T, + ) -> TextDiff<'old, 'new, 'bufs, T::Output> { + TextDiff::configure().diff_lines(old, new) } /// Creates a diff of words. /// /// Equivalent to `TextDiff::configure().diff_words(old, new)`. - pub fn from_words(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> { - Self::configure().diff_words(old, new) + pub fn from_words( + old: &'old T, + new: &'new T, + ) -> TextDiff<'old, 'new, 'bufs, T::Output> { + TextDiff::configure().diff_words(old, new) + } + + /// Creates a diff of chars. + /// + /// Equivalent to `TextDiff::configure().diff_chars(old, new)`. + pub fn from_chars( + old: &'old T, + new: &'new T, + ) -> TextDiff<'old, 'new, 'bufs, T::Output> { + TextDiff::configure().diff_chars(old, new) } /// Creates a diff of unicode words. 
@@ -331,15 +378,11 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { /// /// This requires the `unicode` feature. #[cfg(feature = "unicode")] - pub fn from_unicode_words(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> { - Self::configure().diff_unicode_words(old, new) - } - - /// Creates a diff of chars. - /// - /// Equivalent to `TextDiff::configure().diff_chars(old, new)`. - pub fn from_chars(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> { - Self::configure().diff_chars(old, new) + pub fn from_unicode_words( + old: &'old T, + new: &'new T, + ) -> TextDiff<'old, 'new, 'bufs, T::Output> { + TextDiff::configure().diff_unicode_words(old, new) } /// Creates a diff of graphemes. @@ -348,18 +391,23 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { /// /// This requires the `unicode` feature. #[cfg(feature = "unicode")] - pub fn from_graphemes(old: &'old str, new: &'new str) -> TextDiff<'old, 'new, 'bufs> { - Self::configure().diff_graphemes(old, new) + pub fn from_graphemes( + old: &'old T, + new: &'new T, + ) -> TextDiff<'old, 'new, 'bufs, T::Output> { + TextDiff::configure().diff_graphemes(old, new) } +} +impl<'old, 'new, 'bufs, T: DiffableStr + ?Sized + 'old + 'new> TextDiff<'old, 'new, 'bufs, T> { /// Creates a diff of arbitrary slices. /// /// Equivalent to `TextDiff::configure().diff_slices(old, new)`. pub fn from_slices( - old: &'bufs [&'old str], - new: &'bufs [&'new str], - ) -> TextDiff<'old, 'new, 'bufs> { - Self::configure().diff_slices(old, new) + old: &'bufs [&'old T], + new: &'bufs [&'new T], + ) -> TextDiff<'old, 'new, 'bufs, T> { + TextDiff::configure().diff_slices(old, new) } /// The name of the algorithm that created the diff. @@ -376,12 +424,12 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { } /// Returns all old slices. - pub fn old_slices(&self) -> &[&'old str] { + pub fn old_slices(&self) -> &[&'old T] { &self.old } /// Returns all new slices. 
- pub fn new_slices(&self) -> &[&'new str] { + pub fn new_slices(&self) -> &[&'new T] { &self.new } @@ -405,7 +453,7 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { /// ways in which a change could be encoded (insert/delete vs replace), look /// up the value from the appropriate slice and also handle correct index /// handling. - pub fn iter_changes(&self, op: &DiffOp) -> impl Iterator { + pub fn iter_changes(&self, op: &DiffOp) -> impl Iterator> { let newline_terminated = self.newline_terminated; let (tag, old_range, new_range) = op.as_tag_tuple(); let mut old_index = old_range.start; @@ -426,7 +474,7 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { value: first, missing_newline: newline_terminated && rest.is_empty() - && !first.ends_with(&['\r', '\n'][..]), + && !first.ends_with_newline(), }) } else { None @@ -443,7 +491,7 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { value: first, missing_newline: newline_terminated && rest.is_empty() - && !first.ends_with(&['\r', '\n'][..]), + && !first.ends_with_newline(), }) } else { None @@ -460,7 +508,7 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { value: first, missing_newline: newline_terminated && rest.is_empty() - && !first.ends_with(&['\r', '\n'][..]), + && !first.ends_with_newline(), }) } else { None @@ -477,7 +525,7 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { value: first, missing_newline: newline_terminated && rest.is_empty() - && !first.ends_with(&['\r', '\n'][..]), + && !first.ends_with_newline(), }) } else if let Some((&first, rest)) = new_slices.split_first() { new_slices = rest; @@ -489,7 +537,7 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { value: first, missing_newline: newline_terminated && rest.is_empty() - && !first.ends_with(&['\r', '\n'][..]), + && !first.ends_with_newline(), }) } else { None @@ -498,17 +546,6 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { }) } - /// Iterates over the changes the op expands to with inline emphasis. 
- /// - /// This is very similar to [`TextDiff::iter_changes`] but it performs a second - /// level diff on adjacent line replacements. The exact behavior of - /// this function with regards to how it detects those inline changes - /// is currently not defined and will likely change over time. - #[cfg(feature = "inline")] - pub fn iter_inline_changes(&self, op: &DiffOp) -> impl Iterator { - iter_inline_changes(self, op) - } - /// Returns the captured diff ops. pub fn ops(&self) -> &[DiffOp] { &self.ops @@ -522,85 +559,20 @@ impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs> { } /// Utility to return a unified diff formatter. - pub fn unified_diff<'diff>(&'diff self) -> UnifiedDiff<'diff, 'old, 'new, 'bufs> { + pub fn unified_diff<'diff>(&'diff self) -> UnifiedDiff<'diff, 'old, 'new, 'bufs, T> { UnifiedDiff::from_text_diff(self) } -} -/// Given a string splits it into lines. -/// -/// This operation will preserve the newline separation character at the end. -/// It supports all common newline sequences (`\r\n`, `\n` as well as `\r`). -fn split_lines(s: &str) -> impl Iterator { - let mut iter = s.char_indices().peekable(); - let mut last_pos = 0; - - std::iter::from_fn(move || { - if let Some((idx, c)) = iter.next() { - let mut rv = None; - if c == '\r' { - if iter.peek().map_or(false, |x| x.1 == '\n') { - rv = Some(&s[last_pos..=idx + 1]); - iter.next(); - last_pos = idx + 2; - } else { - rv = Some(&s[last_pos..=idx]); - last_pos = idx + 1; - } - } else if c == '\n' { - rv = Some(&s[last_pos..=idx]); - last_pos = idx + 1; - } - Some(rv) - } else if last_pos < s.len() { - let tmp = &s[last_pos..]; - last_pos = s.len(); - Some(Some(tmp)) - } else { - None - } - }) - .flatten() -} - -/// Partitions at whitespace. 
-fn split_words(s: &str) -> impl Iterator { - let mut iter = s.char_indices().peekable(); - - std::iter::from_fn(move || { - if let Some((idx, c)) = iter.next() { - let is_whitespace = c.is_whitespace(); - let start = idx; - let mut end = idx + c.len_utf8(); - while let Some(&(_, next_char)) = iter.peek() { - if next_char.is_whitespace() != is_whitespace { - break; - } - iter.next(); - end += next_char.len_utf8(); - } - Some(&s[start..end]) - } else { - None - } - }) -} - -/// Splits words according to unicode rules. -#[cfg(feature = "unicode")] -fn split_unicode_words(s: &str) -> impl Iterator { - unicode_segmentation::UnicodeSegmentation::split_word_bounds(s) -} - -/// Splits text into characters. -fn split_chars(s: &str) -> impl Iterator { - s.char_indices().map(move |(i, c)| &s[i..i + c.len_utf8()]) -} - -/// Splits text into graphemes. -#[cfg(feature = "unicode")] -fn split_graphemes(s: &str) -> impl Iterator { - unicode_segmentation::UnicodeSegmentation::graphemes(s, true) + /// Iterates over the changes the op expands to with inline emphasis. + /// + /// This is very similar to [`TextDiff::iter_changes`] but it performs a second + /// level diff on adjacent line replacements. The exact behavior of + /// this function with regards to how it detects those inline changes + /// is currently not defined and will likely change over time. + #[cfg(feature = "inline")] + pub fn iter_inline_changes(&self, op: &DiffOp) -> impl Iterator> { + iter_inline_changes(self, op) + } } // quick and dirty way to get an upper sequence ratio. @@ -619,10 +591,10 @@ fn upper_seq_ratio(seq1: &[T], seq2: &[T]) -> f32 { /// /// It counts the number of matches without regard to order, which is an /// obvious upper bound. 
-struct QuickSeqRatio<'a>(HashMap<&'a str, i32>); +struct QuickSeqRatio<'a, T: DiffableStrRef + ?Sized>(HashMap<&'a T, i32>); -impl<'a> QuickSeqRatio<'a> { - pub fn new(seq: &[&'a str]) -> QuickSeqRatio<'a> { +impl<'a, T: DiffableStrRef + Hash + Eq + ?Sized> QuickSeqRatio<'a, T> { + pub fn new(seq: &[&'a T]) -> QuickSeqRatio<'a, T> { let mut counts = HashMap::new(); for &word in seq { *counts.entry(word).or_insert(0) += 1; @@ -630,7 +602,7 @@ impl<'a> QuickSeqRatio<'a> { QuickSeqRatio(counts) } - pub fn calc(&self, seq: &[&str]) -> f32 { + pub fn calc(&self, seq: &[&T]) -> f32 { let n = self.0.len() + seq.len(); if n == 0 { return 1.0; @@ -669,18 +641,18 @@ impl<'a> QuickSeqRatio<'a> { /// ); /// assert_eq!(matches, vec!["apple", "ape"]); /// ``` -pub fn get_close_matches<'a>( - word: &str, - possibilities: &[&'a str], +pub fn get_close_matches<'a, T: DiffableStr + ?Sized>( + word: &T, + possibilities: &[&'a T], n: usize, cutoff: f32, -) -> Vec<&'a str> { +) -> Vec<&'a T> { let mut matches = BinaryHeap::new(); - let seq1 = split_chars(word).collect::>(); + let seq1 = word.split_chars(); let quick_ratio = QuickSeqRatio::new(&seq1); for &possibility in possibilities { - let seq2 = split_chars(possibility).collect::>(); + let seq2 = possibility.split_chars(); if upper_seq_ratio(&seq1, &seq2) < cutoff || quick_ratio.calc(&seq2) < cutoff { continue; @@ -707,42 +679,6 @@ pub fn get_close_matches<'a>( rv } -#[test] -fn test_split_lines() { - assert_eq!( - split_lines("first\nsecond\rthird\r\nfourth\nlast").collect::>(), - vec!["first\n", "second\r", "third\r\n", "fourth\n", "last"] - ); - assert_eq!(split_lines("\n\n").collect::>(), vec!["\n", "\n"]); - assert_eq!(split_lines("\n").collect::>(), vec!["\n"]); - assert!(split_lines("").collect::>().is_empty()); -} - -#[test] -fn test_split_words() { - assert_eq!( - split_words("foo bar baz\n\n aha").collect::>(), - ["foo", " ", "bar", " ", "baz", "\n\n ", "aha"] - ); -} - -#[test] -fn test_split_chars() { - assert_eq!( - 
split_chars("abcfö❄️").collect::>(), - vec!["a", "b", "c", "f", "ö", "❄", "\u{fe0f}"] - ); -} - -#[test] -#[cfg(feature = "unicode")] -fn test_split_graphemes() { - assert_eq!( - split_graphemes("abcfö❄️").collect::>(), - vec!["a", "b", "c", "f", "ö", "❄️"] - ); -} - #[test] fn test_captured_ops() { let diff = TextDiff::from_lines( @@ -782,10 +718,9 @@ fn test_unified_diff() { #[test] fn test_line_ops() { - let diff = TextDiff::from_lines( - "Hello World\nsome stuff here\nsome more stuff here\n", - "Hello World\nsome amazing stuff here\nsome more stuff here\n", - ); + let a = "Hello World\nsome stuff here\nsome more stuff here\n"; + let b = "Hello World\nsome amazing stuff here\nsome more stuff here\n"; + let diff = TextDiff::from_lines(a, b); assert_eq!(diff.newline_terminated(), true); let changes = diff .ops() @@ -793,6 +728,19 @@ fn test_line_ops() { .flat_map(|op| diff.iter_changes(op)) .collect::>(); insta::assert_debug_snapshot!(&changes); + + #[cfg(feature = "bytes")] + { + let byte_diff = TextDiff::from_lines(a.as_bytes(), b.as_bytes()); + let byte_changes = byte_diff + .ops() + .iter() + .flat_map(|op| byte_diff.iter_changes(op)) + .collect::>(); + for (change, byte_change) in changes.iter().zip(byte_changes.iter()) { + assert_eq!(change.as_str_lossy(), byte_change.as_str_lossy()); + } + } } #[test] @@ -811,6 +759,12 @@ fn test_virtual_newlines() { fn test_char_diff() { let diff = TextDiff::from_chars("Hello World", "Hallo Welt"); insta::assert_debug_snapshot!(diff.ops()); + + #[cfg(feature = "bytes")] + { + let byte_diff = TextDiff::from_chars("Hello World".as_bytes(), "Hallo Welt".as_bytes()); + assert_eq!(diff.ops(), byte_diff.ops()); + } } #[test] diff --git a/src/text/udiff.rs b/src/text/udiff.rs index 303d62e..1854448 100644 --- a/src/text/udiff.rs +++ b/src/text/udiff.rs @@ -13,13 +13,23 @@ //! .context_radius(10) //! .header("old_file", "new_file")); //! ``` +//! +//! # Unicode vs Bytes +//! +//! 
The [`UnifiedDiff`] type supports both unicode and byte diffs for all +//! types compatible with [`DiffableStr`]. You can pick between the two +//! versions by using [`UnifiedDiff.to_string`] or [`UnifiedDiff.to_writer`]. +//! The former uses [`DiffableStr::as_str_lossy`], the latter uses +//! [`DiffableStr::as_bytes`] for each line. -use std::fmt; use std::ops::Range; +use std::{fmt, io}; use crate::algorithms::{Algorithm, DiffOp}; use crate::text::{Change, ChangeTag, TextDiff}; +use super::DiffableStr; + #[derive(Copy, Clone, Debug)] struct UnifiedDiffHunkRange(usize, usize); @@ -77,17 +87,34 @@ impl fmt::Display for UnifiedHunkHeader { /// Unified diff formatter. /// -/// The `Display` implementation renders a unified diff. -pub struct UnifiedDiff<'diff, 'old, 'new, 'bufs> { - diff: &'diff TextDiff<'old, 'new, 'bufs>, +/// ```rust +/// use similar::text::TextDiff; +/// # let old_text = ""; +/// # let new_text = ""; +/// let text_diff = TextDiff::from_lines(old_text, new_text); +/// print!("{}", text_diff +/// .unified_diff() +/// .context_radius(10) +/// .header("old_file", "new_file")); +/// ``` +/// +/// ## Unicode vs Bytes +/// +/// The [`UnifiedDiff`] type supports both unicode and byte diffs for all +/// types compatible with [`DiffableStr`]. You can pick between the two +/// versions by using [`UnifiedDiff.to_string`] or [`UnifiedDiff.to_writer`]. +/// The former uses [`DiffableStr::as_str_lossy`], the latter uses +/// [`DiffableStr::as_bytes`] for each line. +pub struct UnifiedDiff<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> { + diff: &'diff TextDiff<'old, 'new, 'bufs, T>, context_radius: usize, missing_newline_hint: bool, header: Option<(String, String)>, } -impl<'diff, 'old, 'new, 'bufs> UnifiedDiff<'diff, 'old, 'new, 'bufs> { +impl<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> UnifiedDiff<'diff, 'old, 'new, 'bufs, T> { /// Creates a formatter from a text diff object. 
- pub fn from_text_diff(diff: &'diff TextDiff<'old, 'new, 'bufs>) -> Self { + pub fn from_text_diff(diff: &'diff TextDiff<'old, 'new, 'bufs, T>) -> Self { UnifiedDiff { diff, context_radius: 3, @@ -127,7 +154,7 @@ impl<'diff, 'old, 'new, 'bufs> UnifiedDiff<'diff, 'old, 'new, 'bufs> { } /// Iterates over all hunks as configured. - pub fn iter_hunks(&self) -> impl Iterator> { + pub fn iter_hunks(&self) -> impl Iterator> { let diff = self.diff; let missing_newline_hint = self.missing_newline_hint; self.diff @@ -137,6 +164,19 @@ impl<'diff, 'old, 'new, 'bufs> UnifiedDiff<'diff, 'old, 'new, 'bufs> { .map(move |ops| UnifiedDiffHunk::new(ops, diff, missing_newline_hint)) } + /// Write the unified diff as bytes to the output stream. + pub fn to_writer(&self, mut w: W) -> Result<(), io::Error> { + let mut header = self.header.as_ref(); + for hunk in self.iter_hunks() { + if let Some((old_file, new_file)) = header.take() { + writeln!(w, "--- {}", old_file)?; + writeln!(w, "+++ {}", new_file)?; + } + write!(w, "{}", hunk)?; + } + Ok(()) + } + fn header_opt(&mut self, header: Option<(&str, &str)>) -> &mut Self { if let Some((a, b)) = header { self.header(a, b); @@ -148,19 +188,21 @@ impl<'diff, 'old, 'new, 'bufs> UnifiedDiff<'diff, 'old, 'new, 'bufs> { /// Unified diff hunk formatter. /// /// The `Display` this renders out a single unified diff's hunk. -pub struct UnifiedDiffHunk<'diff, 'old, 'new, 'bufs> { - diff: &'diff TextDiff<'old, 'new, 'bufs>, +pub struct UnifiedDiffHunk<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> { + diff: &'diff TextDiff<'old, 'new, 'bufs, T>, ops: Vec, missing_newline_hint: bool, } -impl<'diff, 'old, 'new, 'bufs> UnifiedDiffHunk<'diff, 'old, 'new, 'bufs> { +impl<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> + UnifiedDiffHunk<'diff, 'old, 'new, 'bufs, T> +{ /// Creates a new hunk for some operations. 
pub fn new( ops: Vec, - diff: &'diff TextDiff<'old, 'new, 'bufs>, + diff: &'diff TextDiff<'old, 'new, 'bufs, T>, missing_newline_hint: bool, - ) -> UnifiedDiffHunk<'diff, 'old, 'new, 'bufs> { + ) -> UnifiedDiffHunk<'diff, 'old, 'new, 'bufs, T> { UnifiedDiffHunk { diff, ops, @@ -184,7 +226,7 @@ impl<'diff, 'old, 'new, 'bufs> UnifiedDiffHunk<'diff, 'old, 'new, 'bufs> { } /// Iterates over all changes in a hunk. - pub fn iter_changes(&self) -> impl Iterator> + '_ { + pub fn iter_changes(&self) -> impl Iterator> + '_ { // unclear why this needs Box::new here. It seems to infer some really // odd lifetimes I can't figure out how to work with. (Box::new( @@ -193,9 +235,43 @@ impl<'diff, 'old, 'new, 'bufs> UnifiedDiffHunk<'diff, 'old, 'new, 'bufs> { .flat_map(move |op| self.diff.iter_changes(op)), )) as Box> } + + /// Write the hunk as bytes to the output stream. + pub fn to_writer(&self, mut w: W) -> Result<(), io::Error> { + let mut wrote_header = false; + for change in self.iter_changes() { + if !wrote_header { + writeln!(w, "{}", self.header())?; + wrote_header = true; + } + write!( + w, + "{}", + match change.tag() { + ChangeTag::Equal => ' ', + ChangeTag::Delete => '-', + ChangeTag::Insert => '+', + }, + )?; + w.write_all(change.value().as_bytes())?; + if self.diff.newline_terminated() { + write!(w, "\n")?; + } + if change.missing_newline() { + if self.missing_newline_hint { + writeln!(w, "\n\\ No newline at end of file")?; + } else { + writeln!(w)?; + } + } + } + Ok(()) + } } -impl<'diff, 'old, 'new, 'bufs> fmt::Display for UnifiedDiffHunk<'diff, 'old, 'new, 'bufs> { +impl<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> fmt::Display + for UnifiedDiffHunk<'diff, 'old, 'new, 'bufs, T> +{ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let nl = if self.diff.newline_terminated() { "" @@ -216,7 +292,7 @@ impl<'diff, 'old, 'new, 'bufs> fmt::Display for UnifiedDiffHunk<'diff, 'old, 'ne ChangeTag::Delete => '-', ChangeTag::Insert => '+', }, - change.value(), + 
change.as_str_lossy(), nl )?; if change.missing_newline() { @@ -231,7 +307,9 @@ impl<'diff, 'old, 'new, 'bufs> fmt::Display for UnifiedDiffHunk<'diff, 'old, 'ne } } -impl<'diff, 'old, 'new, 'bufs> fmt::Display for UnifiedDiff<'diff, 'old, 'new, 'bufs> { +impl<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> fmt::Display + for UnifiedDiff<'diff, 'old, 'new, 'bufs, T> +{ fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { let mut header = self.header.as_ref(); for hunk in self.iter_hunks() {