diff --git a/examples/terminal-inline.rs b/examples/terminal-inline.rs index ba6ab50..900fdd5 100644 --- a/examples/terminal-inline.rs +++ b/examples/terminal-inline.rs @@ -44,7 +44,7 @@ fn main() { style(Line(change.new_index())).dim(), s.apply_to(sign).bold(), ); - for (emphasized, value) in change.iter_strings() { + for (emphasized, value) in change.iter_strings_lossy() { if emphasized { print!("{}", s.apply_to(value).underlined().on_black()); } else { diff --git a/src/algorithms/capture.rs b/src/algorithms/capture.rs index 264a566..e19e192 100644 --- a/src/algorithms/capture.rs +++ b/src/algorithms/capture.rs @@ -155,34 +155,6 @@ impl DiffOp { } } -/// A [`DiffHook`] that captures all diff operations. -#[derive(Default, Clone)] -pub struct Capture(Vec); - -impl Capture { - /// Creates a new capture hook. - pub fn new() -> Capture { - Capture::default() - } - - /// Converts the capture hook into a vector of ops. - pub fn into_ops(self) -> Vec { - self.0 - } - - /// Isolate change clusters by eliminating ranges with no changes. - /// - /// This is equivalent to calling [`group_diff_ops`] on [`Capture::into_ops`]. - pub fn into_grouped_ops(self, n: usize) -> Vec> { - group_diff_ops(self.into_ops(), n) - } - - /// Accesses the captured operations. - pub fn ops(&self) -> &[DiffOp] { - &self.0 - } -} - /// Isolate change clusters by eliminating ranges with no changes. /// /// This will leave holes behind in long periods of equal ranges so that @@ -272,6 +244,34 @@ pub fn get_diff_ratio(ops: &[DiffOp], old_len: usize, new_len: usize) -> f32 { } } +/// A [`DiffHook`] that captures all diff operations. +#[derive(Default, Clone)] +pub struct Capture(Vec); + +impl Capture { + /// Creates a new capture hook. + pub fn new() -> Capture { + Capture::default() + } + + /// Converts the capture hook into a vector of ops. + pub fn into_ops(self) -> Vec { + self.0 + } + + /// Isolate change clusters by eliminating ranges with no changes. + /// + /// This is equivalent to calling [`group_diff_ops`] on [`Capture::into_ops`]. + pub fn into_grouped_ops(self, n: usize) -> Vec> { + group_diff_ops(self.into_ops(), n) + } + + /// Accesses the captured operations. + pub fn ops(&self) -> &[DiffOp] { + &self.0 + } +} + impl DiffHook for Capture { type Error = Infallible; diff --git a/src/algorithms/mod.rs b/src/algorithms/mod.rs index 04c8ec2..4cd739f 100644 --- a/src/algorithms/mod.rs +++ b/src/algorithms/mod.rs @@ -23,9 +23,9 @@ mod replace; use std::hash::Hash; use std::ops::{Index, Range}; -pub use capture::*; -pub use hook::*; -pub use replace::*; +pub use capture::{get_diff_ratio, group_diff_ops, Capture, DiffOp, DiffTag}; +pub use hook::DiffHook; +pub use replace::Replace; // actual diffing algorithms pub mod myers; diff --git a/src/text/abstraction.rs b/src/text/abstraction.rs index 3a5d058..ad36aa0 100644 --- a/src/text/abstraction.rs +++ b/src/text/abstraction.rs @@ -6,6 +6,13 @@ use std::hash::Hash; use std::ops::Range; /// Reference to a [`DiffableStr`]. +/// +/// This type exists because while the library only really provides ways to +/// work with `&str` and `&[u8]` there are types that deref into those string +/// slices such as `String` and `Vec`. +/// +/// This trait is used in the library whenever it's nice to be able to pass +/// strings of different types in. pub trait DiffableStrRef { /// The type of the resolved [`DiffableStr`]. type Output: DiffableStr + ?Sized; @@ -78,7 +85,7 @@ pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned { fn as_str(&self) -> Option<&str>; /// Decodes the string (potentially) lossy. - fn as_str_lossy(&self) -> Cow<'_, str>; + fn to_string_lossy(&self) -> Cow<'_, str>; /// Checks if the string ends in a newline. fn ends_with_newline(&self) -> bool; @@ -91,6 +98,11 @@ pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned { /// Returns the strings as slice of raw bytes. fn as_bytes(&self) -> &[u8]; + + /// Checks if the string is empty. + fn is_empty(&self) -> bool { + self.len() == 0 + } } impl DiffableStr for str { @@ -184,7 +196,7 @@ impl DiffableStr for str { Some(self) } - fn as_str_lossy(&self) -> Cow<'_, str> { + fn to_string_lossy(&self) -> Cow<'_, str> { Cow::Borrowed(self) } @@ -293,7 +305,7 @@ impl DiffableStr for [u8] { std::str::from_utf8(self).ok() } - fn as_str_lossy(&self) -> Cow<'_, str> { + fn to_string_lossy(&self) -> Cow<'_, str> { String::from_utf8_lossy(self) } diff --git a/src/text/inline.rs b/src/text/inline.rs index 3a55af5..01f1b71 100644 --- a/src/text/inline.rs +++ b/src/text/inline.rs @@ -121,6 +121,10 @@ impl<'s, T: DiffableStr + ?Sized> InlineChange<'s, T> { /// /// Each item is a tuple in the form `(emphasized, value)` where `emphasized` /// is true if it should be highlighted as an inline diff. + /// + /// Depending on the type of the underlying [`DiffableStr`] this value is + /// more or less useful. If you always want to have a utf-8 string it's + /// better to use the [`InlineChange::iter_strings_lossy`] method. pub fn values(&self) -> &[(bool, &'s T)] { &self.values } @@ -129,10 +133,10 @@ impl<'s, T: DiffableStr + ?Sized> InlineChange<'s, T> { /// /// Each item is a tuple in the form `(emphasized, value)` where `emphasized` /// is true if it should be highlighted as an inline diff. - pub fn iter_strings(&self) -> impl Iterator)> { + pub fn iter_strings_lossy(&self) -> impl Iterator)> { self.values() .iter() - .map(|(emphasized, raw_value)| (*emphasized, raw_value.as_str_lossy())) + .map(|(emphasized, raw_value)| (*emphasized, raw_value.to_string_lossy())) } /// Returns `true` if this change needs to be followed up by a @@ -156,7 +160,7 @@ impl<'s, T: DiffableStr + ?Sized> From> for InlineChange<'s, T> { impl<'s, T: DiffableStr + ?Sized> fmt::Display for InlineChange<'s, T> { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { - for (emphasized, value) in self.iter_strings() { + for (emphasized, value) in self.iter_strings_lossy() { let marker = match (emphasized, self.tag) { (false, _) | (true, ChangeTag::Equal) => "", (true, ChangeTag::Delete) => "-", diff --git a/src/text/mod.rs b/src/text/mod.rs index e588151..67ab673 100644 --- a/src/text/mod.rs +++ b/src/text/mod.rs @@ -87,22 +87,22 @@ #![cfg(feature = "text")] use std::borrow::Cow; use std::cmp::Reverse; -use std::collections::{BinaryHeap, HashMap}; +use std::collections::BinaryHeap; use std::fmt; use std::hash::Hash; mod abstraction; - #[cfg(feature = "inline")] mod inline; mod udiff; +mod utils; +pub use self::abstraction::{DiffableStr, DiffableStrRef}; #[cfg(feature = "inline")] -pub use self::inline::*; -pub use self::udiff::*; - -pub use crate::text::abstraction::*; +pub use self::inline::InlineChange; +pub use self::udiff::{unified_diff, UnifiedDiff, UnifiedHunkHeader}; +use self::utils::{upper_seq_ratio, QuickSeqRatio}; use crate::algorithms::{ capture_diff_slices, get_diff_ratio, group_diff_ops, Algorithm, DiffOp, DiffTag, }; @@ -250,15 +250,6 @@ impl TextDiffConfig { } } -/// Captures diff op codes for textual diffs -pub struct TextDiff<'old, 'new, 'bufs, T: DiffableStr + ?Sized> { - old: Cow<'bufs, [&'old T]>, - new: Cow<'bufs, [&'new T]>, - ops: Vec, - newline_terminated: bool, - algorithm: Algorithm, -} - /// The tag of a change. #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Ord, PartialOrd)] pub enum ChangeTag { @@ -270,6 +261,20 @@ pub enum ChangeTag { Insert, } +impl fmt::Display for ChangeTag { + fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { + write!( + f, + "{}", + match &self { + ChangeTag::Equal => ' ', + ChangeTag::Delete => '-', + ChangeTag::Insert => '+', + } + ) + } +} + /// Represents the expanded textual change. /// /// This type is returned from the [`TextDiff::iter_changes`] method. It @@ -289,7 +294,7 @@ impl<'s, T: DiffableStr + ?Sized> fmt::Display for Change<'s, T> { write!( f, "{}{}", - self.as_str_lossy(), + self.to_string_lossy(), if self.missing_newline { "\n" } else { "" } ) } @@ -312,6 +317,10 @@ impl<'s, T: DiffableStr + ?Sized> Change<'s, T> { } /// Returns the underlying changed value. + /// + /// Depending on the type of the underlying [`DiffableStr`] this value is + /// more or less useful. If you always want to have a utf-8 string it's + /// best to use the [`Change::as_str`] and [`Change::to_string_lossy`] methods. pub fn value(&self) -> &'s T { self.value } @@ -322,8 +331,8 @@ impl<'s, T: DiffableStr + ?Sized> Change<'s, T> { } /// Returns the value (lossy) decoded as utf-8 string. - pub fn as_str_lossy(&self) -> Cow<'s, str> { - T::as_str_lossy(self.value) + pub fn to_string_lossy(&self) -> Cow<'s, str> { + T::to_string_lossy(self.value) } /// Returns `true` if this change needs to be followed up by a @@ -336,6 +345,15 @@ impl<'s, T: DiffableStr + ?Sized> Change<'s, T> { } } +/// Captures diff op codes for textual diffs +pub struct TextDiff<'old, 'new, 'bufs, T: DiffableStr + ?Sized> { + old: Cow<'bufs, [&'old T]>, + new: Cow<'bufs, [&'new T]>, + ops: Vec, + newline_terminated: bool, + algorithm: Algorithm, +} + impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs, str> { /// Configures a text differ before diffing. pub fn configure() -> TextDiffConfig { @@ -571,58 +589,7 @@ impl<'old, 'new, 'bufs, T: DiffableStr + ?Sized + 'old + 'new> TextDiff<'old, 'n /// is currently not defined and will likely change over time. #[cfg(feature = "inline")] pub fn iter_inline_changes(&self, op: &DiffOp) -> impl Iterator> { - iter_inline_changes(self, op) - } -} - -// quick and dirty way to get an upper sequence ratio. -fn upper_seq_ratio(seq1: &[T], seq2: &[T]) -> f32 { - let n = seq1.len() + seq2.len(); - if n == 0 { - 1.0 - } else { - 2.0 * seq1.len().min(seq2.len()) as f32 / n as f32 - } -} - -/// Internal utility to calculate an upper bound for a ratio for -/// [`get_close_matches`]. This is based on Python's difflib approach -/// of considering the two sets to be multisets. -/// -/// It counts the number of matches without regard to order, which is an -/// obvious upper bound. -struct QuickSeqRatio<'a, T: DiffableStrRef + ?Sized>(HashMap<&'a T, i32>); - -impl<'a, T: DiffableStrRef + Hash + Eq + ?Sized> QuickSeqRatio<'a, T> { - pub fn new(seq: &[&'a T]) -> QuickSeqRatio<'a, T> { - let mut counts = HashMap::new(); - for &word in seq { - *counts.entry(word).or_insert(0) += 1; - } - QuickSeqRatio(counts) - } - - pub fn calc(&self, seq: &[&T]) -> f32 { - let n = self.0.len() + seq.len(); - if n == 0 { - return 1.0; - } - - let mut available = HashMap::new(); - let mut matches = 0; - for &word in seq { - let x = if let Some(count) = available.get(&word) { - *count - } else { - self.0.get(&word).copied().unwrap_or(0) - }; - available.insert(word, x - 1); - if x > 0 { - matches += 1; - } - } - - 2.0 * matches as f32 / n as f32 + inline::iter_inline_changes(self, op) } } @@ -738,7 +705,7 @@ fn test_line_ops() { .flat_map(|op| byte_diff.iter_changes(op)) .collect::>(); for (change, byte_change) in changes.iter().zip(byte_changes.iter()) { - assert_eq!(change.as_str_lossy(), byte_change.as_str_lossy()); + assert_eq!(change.to_string_lossy(), byte_change.to_string_lossy()); } } } diff --git a/src/text/udiff.rs b/src/text/udiff.rs index 1854448..8c2f3b5 100644 --- a/src/text/udiff.rs +++ b/src/text/udiff.rs @@ -19,17 +19,28 @@ //! The [`UnifiedDiff`] type supports both unicode and byte diffs for all //! types compatible with [`DiffableStr`]. You can pick between the two //! versions by using [`UnifiedDiff.to_string`] or [`UnifiedDiff.to_writer`]. -//! The former uses [`DiffableStr::as_str_lossy`], the latter uses +//! The former uses [`DiffableStr::to_string_lossy`], the latter uses //! [`DiffableStr::as_bytes`] for each line. use std::ops::Range; use std::{fmt, io}; use crate::algorithms::{Algorithm, DiffOp}; -use crate::text::{Change, ChangeTag, TextDiff}; +use crate::text::{Change, TextDiff}; use super::DiffableStr; +struct MissingNewlineHint(bool); + +impl fmt::Display for MissingNewlineHint { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + if self.0 { + write!(f, "\n\\ No newline at end of file")?; + } + Ok(()) + } +} + #[derive(Copy, Clone, Debug)] struct UnifiedDiffHunkRange(usize, usize); @@ -103,7 +114,7 @@ impl fmt::Display for UnifiedHunkHeader { /// The [`UnifiedDiff`] type supports both unicode and byte diffs for all /// types compatible with [`DiffableStr`]. You can pick between the two /// versions by using [`UnifiedDiff.to_string`] or [`UnifiedDiff.to_writer`]. -/// The former uses [`DiffableStr::as_str_lossy`], the latter uses +/// The former uses [`DiffableStr::to_string_lossy`], the latter uses /// [`DiffableStr::as_bytes`] for each line. pub struct UnifiedDiff<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> { diff: &'diff TextDiff<'old, 'new, 'bufs, T>, @@ -238,31 +249,17 @@ impl<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> /// Write the hunk as bytes to the output stream. pub fn to_writer(&self, mut w: W) -> Result<(), io::Error> { - let mut wrote_header = false; - for change in self.iter_changes() { - if !wrote_header { + for (idx, change) in self.iter_changes().enumerate() { + if idx == 0 { writeln!(w, "{}", self.header())?; - wrote_header = true; } - write!( - w, - "{}", - match change.tag() { - ChangeTag::Equal => ' ', - ChangeTag::Delete => '-', - ChangeTag::Insert => '+', - }, - )?; + write!(w, "{}", change.tag())?; w.write_all(change.value().as_bytes())?; - if self.diff.newline_terminated() { - write!(w, "\n")?; + if !self.diff.newline_terminated() { + writeln!(w)?; } if change.missing_newline() { - if self.missing_newline_hint { - writeln!(w, "\n\\ No newline at end of file")?; - } else { - writeln!(w)?; - } + writeln!(w, "{}", MissingNewlineHint(self.missing_newline_hint))?; } } Ok(()) @@ -273,34 +270,16 @@ impl<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> fmt::Display for UnifiedDiffHunk<'diff, 'old, 'new, 'bufs, T> { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { - let nl = if self.diff.newline_terminated() { - "" - } else { - "\n" - }; - let mut wrote_header = false; - for change in self.iter_changes() { - if !wrote_header { + for (idx, change) in self.iter_changes().enumerate() { + if idx == 0 { writeln!(f, "{}", self.header())?; - wrote_header = true; } - write!( - f, - "{}{}{}", - match change.tag() { - ChangeTag::Equal => ' ', - ChangeTag::Delete => '-', - ChangeTag::Insert => '+', - }, - change.as_str_lossy(), - nl - )?; + write!(f, "{}{}", change.tag(), change.to_string_lossy())?; + if !self.diff.newline_terminated() { + writeln!(f)?; + } if change.missing_newline() { - if self.missing_newline_hint { - writeln!(f, "\n\\ No newline at end of file")?; - } else { - writeln!(f)?; - } + writeln!(f, "{}", MissingNewlineHint(self.missing_newline_hint))?; } } Ok(()) diff --git a/src/text/utils.rs b/src/text/utils.rs new file mode 100644 index 0000000..d4a440f --- /dev/null +++ b/src/text/utils.rs @@ -0,0 +1,55 @@ +use std::collections::HashMap; +use std::hash::Hash; + +use super::DiffableStrRef; + +// quick and dirty way to get an upper sequence ratio. +pub fn upper_seq_ratio(seq1: &[T], seq2: &[T]) -> f32 { + let n = seq1.len() + seq2.len(); + if n == 0 { + 1.0 + } else { + 2.0 * seq1.len().min(seq2.len()) as f32 / n as f32 + } +} + +/// Internal utility to calculate an upper bound for a ratio for +/// [`get_close_matches`]. This is based on Python's difflib approach +/// of considering the two sets to be multisets. +/// +/// It counts the number of matches without regard to order, which is an +/// obvious upper bound. +pub struct QuickSeqRatio<'a, T: DiffableStrRef + ?Sized>(HashMap<&'a T, i32>); + +impl<'a, T: DiffableStrRef + Hash + Eq + ?Sized> QuickSeqRatio<'a, T> { + pub fn new(seq: &[&'a T]) -> QuickSeqRatio<'a, T> { + let mut counts = HashMap::new(); + for &word in seq { + *counts.entry(word).or_insert(0) += 1; + } + QuickSeqRatio(counts) + } + + pub fn calc(&self, seq: &[&T]) -> f32 { + let n = self.0.len() + seq.len(); + if n == 0 { + return 1.0; + } + + let mut available = HashMap::new(); + let mut matches = 0; + for &word in seq { + let x = if let Some(count) = available.get(&word) { + *count + } else { + self.0.get(&word).copied().unwrap_or(0) + }; + available.insert(word, x - 1); + if x > 0 { + matches += 1; + } + } + + 2.0 * matches as f32 / n as f32 + } +}