Some internal refactorings

This commit is contained in:
Armin Ronacher 2021-02-02 20:15:31 +01:00
parent 4b85e70f91
commit 34e5b3d571
8 changed files with 174 additions and 157 deletions

View file

@ -44,7 +44,7 @@ fn main() {
style(Line(change.new_index())).dim(), style(Line(change.new_index())).dim(),
s.apply_to(sign).bold(), s.apply_to(sign).bold(),
); );
for (emphasized, value) in change.iter_strings() { for (emphasized, value) in change.iter_strings_lossy() {
if emphasized { if emphasized {
print!("{}", s.apply_to(value).underlined().on_black()); print!("{}", s.apply_to(value).underlined().on_black());
} else { } else {

View file

@ -155,34 +155,6 @@ impl DiffOp {
} }
} }
/// A [`DiffHook`] that captures all diff operations.
#[derive(Default, Clone)]
pub struct Capture(Vec<DiffOp>);
impl Capture {
/// Creates a new capture hook.
pub fn new() -> Capture {
Capture::default()
}
/// Converts the capture hook into a vector of ops.
pub fn into_ops(self) -> Vec<DiffOp> {
self.0
}
/// Isolate change clusters by eliminating ranges with no changes.
///
/// This is equivalent to calling [`group_diff_ops`] on [`Capture::into_ops`].
pub fn into_grouped_ops(self, n: usize) -> Vec<Vec<DiffOp>> {
group_diff_ops(self.into_ops(), n)
}
/// Accesses the captured operations.
pub fn ops(&self) -> &[DiffOp] {
&self.0
}
}
/// Isolate change clusters by eliminating ranges with no changes. /// Isolate change clusters by eliminating ranges with no changes.
/// ///
/// This will leave holes behind in long periods of equal ranges so that /// This will leave holes behind in long periods of equal ranges so that
@ -272,6 +244,34 @@ pub fn get_diff_ratio(ops: &[DiffOp], old_len: usize, new_len: usize) -> f32 {
} }
} }
/// A [`DiffHook`] that captures all diff operations.
#[derive(Default, Clone)]
pub struct Capture(Vec<DiffOp>);
impl Capture {
/// Creates a new capture hook.
pub fn new() -> Capture {
Capture::default()
}
/// Converts the capture hook into a vector of ops.
pub fn into_ops(self) -> Vec<DiffOp> {
self.0
}
/// Isolate change clusters by eliminating ranges with no changes.
///
/// This is equivalent to calling [`group_diff_ops`] on [`Capture::into_ops`].
pub fn into_grouped_ops(self, n: usize) -> Vec<Vec<DiffOp>> {
group_diff_ops(self.into_ops(), n)
}
/// Accesses the captured operations.
pub fn ops(&self) -> &[DiffOp] {
&self.0
}
}
impl DiffHook for Capture { impl DiffHook for Capture {
type Error = Infallible; type Error = Infallible;

View file

@ -23,9 +23,9 @@ mod replace;
use std::hash::Hash; use std::hash::Hash;
use std::ops::{Index, Range}; use std::ops::{Index, Range};
pub use capture::*; pub use capture::{get_diff_ratio, group_diff_ops, Capture, DiffOp, DiffTag};
pub use hook::*; pub use hook::DiffHook;
pub use replace::*; pub use replace::Replace;
// actual diffing algorithms // actual diffing algorithms
pub mod myers; pub mod myers;

View file

@ -6,6 +6,13 @@ use std::hash::Hash;
use std::ops::Range; use std::ops::Range;
/// Reference to a [`DiffableStr`]. /// Reference to a [`DiffableStr`].
///
/// This type exists because while the library only really provides ways to
/// work with `&str` and `&[u8]` there are types that deref into those string
/// slices such as `String` and `Vec<u8>`.
///
/// This trait is used in the library whenever it's nice to be able to pass
/// strings of different types in.
pub trait DiffableStrRef { pub trait DiffableStrRef {
/// The type of the resolved [`DiffableStr`]. /// The type of the resolved [`DiffableStr`].
type Output: DiffableStr + ?Sized; type Output: DiffableStr + ?Sized;
@ -78,7 +85,7 @@ pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
fn as_str(&self) -> Option<&str>; fn as_str(&self) -> Option<&str>;
/// Decodes the string (potentially) lossy. /// Decodes the string (potentially) lossy.
fn as_str_lossy(&self) -> Cow<'_, str>; fn to_string_lossy(&self) -> Cow<'_, str>;
/// Checks if the string ends in a newline. /// Checks if the string ends in a newline.
fn ends_with_newline(&self) -> bool; fn ends_with_newline(&self) -> bool;
@ -91,6 +98,11 @@ pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
/// Returns the strings as slice of raw bytes. /// Returns the strings as slice of raw bytes.
fn as_bytes(&self) -> &[u8]; fn as_bytes(&self) -> &[u8];
/// Checks if the string is empty.
fn is_empty(&self) -> bool {
self.len() == 0
}
} }
impl DiffableStr for str { impl DiffableStr for str {
@ -184,7 +196,7 @@ impl DiffableStr for str {
Some(self) Some(self)
} }
fn as_str_lossy(&self) -> Cow<'_, str> { fn to_string_lossy(&self) -> Cow<'_, str> {
Cow::Borrowed(self) Cow::Borrowed(self)
} }
@ -293,7 +305,7 @@ impl DiffableStr for [u8] {
std::str::from_utf8(self).ok() std::str::from_utf8(self).ok()
} }
fn as_str_lossy(&self) -> Cow<'_, str> { fn to_string_lossy(&self) -> Cow<'_, str> {
String::from_utf8_lossy(self) String::from_utf8_lossy(self)
} }

View file

@ -121,6 +121,10 @@ impl<'s, T: DiffableStr + ?Sized> InlineChange<'s, T> {
/// ///
/// Each item is a tuple in the form `(emphasized, value)` where `emphasized` /// Each item is a tuple in the form `(emphasized, value)` where `emphasized`
/// is true if it should be highlighted as an inline diff. /// is true if it should be highlighted as an inline diff.
///
/// Depending on the type of the underlying [`DiffableStr`] this value is
/// more or less useful. If you always want to have a utf-8 string it's
/// better to use the [`InlineChange::iter_strings_lossy`] method.
pub fn values(&self) -> &[(bool, &'s T)] { pub fn values(&self) -> &[(bool, &'s T)] {
&self.values &self.values
} }
@ -129,10 +133,10 @@ impl<'s, T: DiffableStr + ?Sized> InlineChange<'s, T> {
/// ///
/// Each item is a tuple in the form `(emphasized, value)` where `emphasized` /// Each item is a tuple in the form `(emphasized, value)` where `emphasized`
/// is true if it should be highlighted as an inline diff. /// is true if it should be highlighted as an inline diff.
pub fn iter_strings(&self) -> impl Iterator<Item = (bool, Cow<'_, str>)> { pub fn iter_strings_lossy(&self) -> impl Iterator<Item = (bool, Cow<'_, str>)> {
self.values() self.values()
.iter() .iter()
.map(|(emphasized, raw_value)| (*emphasized, raw_value.as_str_lossy())) .map(|(emphasized, raw_value)| (*emphasized, raw_value.to_string_lossy()))
} }
/// Returns `true` if this change needs to be followed up by a /// Returns `true` if this change needs to be followed up by a
@ -156,7 +160,7 @@ impl<'s, T: DiffableStr + ?Sized> From<Change<'s, T>> for InlineChange<'s, T> {
impl<'s, T: DiffableStr + ?Sized> fmt::Display for InlineChange<'s, T> { impl<'s, T: DiffableStr + ?Sized> fmt::Display for InlineChange<'s, T> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
for (emphasized, value) in self.iter_strings() { for (emphasized, value) in self.iter_strings_lossy() {
let marker = match (emphasized, self.tag) { let marker = match (emphasized, self.tag) {
(false, _) | (true, ChangeTag::Equal) => "", (false, _) | (true, ChangeTag::Equal) => "",
(true, ChangeTag::Delete) => "-", (true, ChangeTag::Delete) => "-",

View file

@ -87,22 +87,22 @@
#![cfg(feature = "text")] #![cfg(feature = "text")]
use std::borrow::Cow; use std::borrow::Cow;
use std::cmp::Reverse; use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashMap}; use std::collections::BinaryHeap;
use std::fmt; use std::fmt;
use std::hash::Hash; use std::hash::Hash;
mod abstraction; mod abstraction;
#[cfg(feature = "inline")] #[cfg(feature = "inline")]
mod inline; mod inline;
mod udiff; mod udiff;
mod utils;
pub use self::abstraction::{DiffableStr, DiffableStrRef};
#[cfg(feature = "inline")] #[cfg(feature = "inline")]
pub use self::inline::*; pub use self::inline::InlineChange;
pub use self::udiff::*; pub use self::udiff::{unified_diff, UnifiedDiff, UnifiedHunkHeader};
pub use crate::text::abstraction::*;
use self::utils::{upper_seq_ratio, QuickSeqRatio};
use crate::algorithms::{ use crate::algorithms::{
capture_diff_slices, get_diff_ratio, group_diff_ops, Algorithm, DiffOp, DiffTag, capture_diff_slices, get_diff_ratio, group_diff_ops, Algorithm, DiffOp, DiffTag,
}; };
@ -250,15 +250,6 @@ impl TextDiffConfig {
} }
} }
/// Captures diff op codes for textual diffs
pub struct TextDiff<'old, 'new, 'bufs, T: DiffableStr + ?Sized> {
old: Cow<'bufs, [&'old T]>,
new: Cow<'bufs, [&'new T]>,
ops: Vec<DiffOp>,
newline_terminated: bool,
algorithm: Algorithm,
}
/// The tag of a change. /// The tag of a change.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Ord, PartialOrd)] #[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Ord, PartialOrd)]
pub enum ChangeTag { pub enum ChangeTag {
@ -270,6 +261,20 @@ pub enum ChangeTag {
Insert, Insert,
} }
impl fmt::Display for ChangeTag {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
write!(
f,
"{}",
match &self {
ChangeTag::Equal => ' ',
ChangeTag::Delete => '-',
ChangeTag::Insert => '+',
}
)
}
}
/// Represents the expanded textual change. /// Represents the expanded textual change.
/// ///
/// This type is returned from the [`TextDiff::iter_changes`] method. It /// This type is returned from the [`TextDiff::iter_changes`] method. It
@ -289,7 +294,7 @@ impl<'s, T: DiffableStr + ?Sized> fmt::Display for Change<'s, T> {
write!( write!(
f, f,
"{}{}", "{}{}",
self.as_str_lossy(), self.to_string_lossy(),
if self.missing_newline { "\n" } else { "" } if self.missing_newline { "\n" } else { "" }
) )
} }
@ -312,6 +317,10 @@ impl<'s, T: DiffableStr + ?Sized> Change<'s, T> {
} }
/// Returns the underlying changed value. /// Returns the underlying changed value.
///
/// Depending on the type of the underlying [`DiffableStr`] this value is
/// more or less useful. If you always want to have a utf-8 string it's
/// best to use the [`Change::as_str`] and [`Change::to_string_lossy`] methods.
pub fn value(&self) -> &'s T { pub fn value(&self) -> &'s T {
self.value self.value
} }
@ -322,8 +331,8 @@ impl<'s, T: DiffableStr + ?Sized> Change<'s, T> {
} }
/// Returns the value (lossy) decoded as utf-8 string. /// Returns the value (lossy) decoded as utf-8 string.
pub fn as_str_lossy(&self) -> Cow<'s, str> { pub fn to_string_lossy(&self) -> Cow<'s, str> {
T::as_str_lossy(self.value) T::to_string_lossy(self.value)
} }
/// Returns `true` if this change needs to be followed up by a /// Returns `true` if this change needs to be followed up by a
@ -336,6 +345,15 @@ impl<'s, T: DiffableStr + ?Sized> Change<'s, T> {
} }
} }
/// Captures diff op codes for textual diffs
pub struct TextDiff<'old, 'new, 'bufs, T: DiffableStr + ?Sized> {
old: Cow<'bufs, [&'old T]>,
new: Cow<'bufs, [&'new T]>,
ops: Vec<DiffOp>,
newline_terminated: bool,
algorithm: Algorithm,
}
impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs, str> { impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs, str> {
/// Configures a text differ before diffing. /// Configures a text differ before diffing.
pub fn configure() -> TextDiffConfig { pub fn configure() -> TextDiffConfig {
@ -571,58 +589,7 @@ impl<'old, 'new, 'bufs, T: DiffableStr + ?Sized + 'old + 'new> TextDiff<'old, 'n
/// is currently not defined and will likely change over time. /// is currently not defined and will likely change over time.
#[cfg(feature = "inline")] #[cfg(feature = "inline")]
pub fn iter_inline_changes(&self, op: &DiffOp) -> impl Iterator<Item = InlineChange<'_, T>> { pub fn iter_inline_changes(&self, op: &DiffOp) -> impl Iterator<Item = InlineChange<'_, T>> {
iter_inline_changes(self, op) inline::iter_inline_changes(self, op)
}
}
// quick and dirty way to get an upper sequence ratio.
fn upper_seq_ratio<T: PartialEq>(seq1: &[T], seq2: &[T]) -> f32 {
let n = seq1.len() + seq2.len();
if n == 0 {
1.0
} else {
2.0 * seq1.len().min(seq2.len()) as f32 / n as f32
}
}
/// Internal utility to calculate an upper bound for a ratio for
/// [`get_close_matches`]. This is based on Python's difflib approach
/// of considering the two sets to be multisets.
///
/// It counts the number of matches without regard to order, which is an
/// obvious upper bound.
struct QuickSeqRatio<'a, T: DiffableStrRef + ?Sized>(HashMap<&'a T, i32>);
impl<'a, T: DiffableStrRef + Hash + Eq + ?Sized> QuickSeqRatio<'a, T> {
pub fn new(seq: &[&'a T]) -> QuickSeqRatio<'a, T> {
let mut counts = HashMap::new();
for &word in seq {
*counts.entry(word).or_insert(0) += 1;
}
QuickSeqRatio(counts)
}
pub fn calc(&self, seq: &[&T]) -> f32 {
let n = self.0.len() + seq.len();
if n == 0 {
return 1.0;
}
let mut available = HashMap::new();
let mut matches = 0;
for &word in seq {
let x = if let Some(count) = available.get(&word) {
*count
} else {
self.0.get(&word).copied().unwrap_or(0)
};
available.insert(word, x - 1);
if x > 0 {
matches += 1;
}
}
2.0 * matches as f32 / n as f32
} }
} }
@ -738,7 +705,7 @@ fn test_line_ops() {
.flat_map(|op| byte_diff.iter_changes(op)) .flat_map(|op| byte_diff.iter_changes(op))
.collect::<Vec<_>>(); .collect::<Vec<_>>();
for (change, byte_change) in changes.iter().zip(byte_changes.iter()) { for (change, byte_change) in changes.iter().zip(byte_changes.iter()) {
assert_eq!(change.as_str_lossy(), byte_change.as_str_lossy()); assert_eq!(change.to_string_lossy(), byte_change.to_string_lossy());
} }
} }
} }

View file

@ -19,17 +19,28 @@
//! The [`UnifiedDiff`] type supports both unicode and byte diffs for all //! The [`UnifiedDiff`] type supports both unicode and byte diffs for all
//! types compatible with [`DiffableStr`]. You can pick between the two //! types compatible with [`DiffableStr`]. You can pick between the two
//! versions by using [`UnifiedDiff.to_string`] or [`UnifiedDiff.to_writer`]. //! versions by using [`UnifiedDiff.to_string`] or [`UnifiedDiff.to_writer`].
//! The former uses [`DiffableStr::as_str_lossy`], the latter uses //! The former uses [`DiffableStr::to_string_lossy`], the latter uses
//! [`DiffableStr::as_bytes`] for each line. //! [`DiffableStr::as_bytes`] for each line.
use std::ops::Range; use std::ops::Range;
use std::{fmt, io}; use std::{fmt, io};
use crate::algorithms::{Algorithm, DiffOp}; use crate::algorithms::{Algorithm, DiffOp};
use crate::text::{Change, ChangeTag, TextDiff}; use crate::text::{Change, TextDiff};
use super::DiffableStr; use super::DiffableStr;
struct MissingNewlineHint(bool);
impl fmt::Display for MissingNewlineHint {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
if self.0 {
write!(f, "\n\\ No newline at end of file")?;
}
Ok(())
}
}
#[derive(Copy, Clone, Debug)] #[derive(Copy, Clone, Debug)]
struct UnifiedDiffHunkRange(usize, usize); struct UnifiedDiffHunkRange(usize, usize);
@ -103,7 +114,7 @@ impl fmt::Display for UnifiedHunkHeader {
/// The [`UnifiedDiff`] type supports both unicode and byte diffs for all /// The [`UnifiedDiff`] type supports both unicode and byte diffs for all
/// types compatible with [`DiffableStr`]. You can pick between the two /// types compatible with [`DiffableStr`]. You can pick between the two
/// versions by using [`UnifiedDiff.to_string`] or [`UnifiedDiff.to_writer`]. /// versions by using [`UnifiedDiff.to_string`] or [`UnifiedDiff.to_writer`].
/// The former uses [`DiffableStr::as_str_lossy`], the latter uses /// The former uses [`DiffableStr::to_string_lossy`], the latter uses
/// [`DiffableStr::as_bytes`] for each line. /// [`DiffableStr::as_bytes`] for each line.
pub struct UnifiedDiff<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> { pub struct UnifiedDiff<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> {
diff: &'diff TextDiff<'old, 'new, 'bufs, T>, diff: &'diff TextDiff<'old, 'new, 'bufs, T>,
@ -238,31 +249,17 @@ impl<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized>
/// Write the hunk as bytes to the output stream. /// Write the hunk as bytes to the output stream.
pub fn to_writer<W: io::Write>(&self, mut w: W) -> Result<(), io::Error> { pub fn to_writer<W: io::Write>(&self, mut w: W) -> Result<(), io::Error> {
let mut wrote_header = false; for (idx, change) in self.iter_changes().enumerate() {
for change in self.iter_changes() { if idx == 0 {
if !wrote_header {
writeln!(w, "{}", self.header())?; writeln!(w, "{}", self.header())?;
wrote_header = true;
} }
write!( write!(w, "{}", change.tag())?;
w,
"{}",
match change.tag() {
ChangeTag::Equal => ' ',
ChangeTag::Delete => '-',
ChangeTag::Insert => '+',
},
)?;
w.write_all(change.value().as_bytes())?; w.write_all(change.value().as_bytes())?;
if self.diff.newline_terminated() { if !self.diff.newline_terminated() {
write!(w, "\n")?; writeln!(w)?;
} }
if change.missing_newline() { if change.missing_newline() {
if self.missing_newline_hint { writeln!(w, "{}", MissingNewlineHint(self.missing_newline_hint))?;
writeln!(w, "\n\\ No newline at end of file")?;
} else {
writeln!(w)?;
}
} }
} }
Ok(()) Ok(())
@ -273,34 +270,16 @@ impl<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> fmt::Display
for UnifiedDiffHunk<'diff, 'old, 'new, 'bufs, T> for UnifiedDiffHunk<'diff, 'old, 'new, 'bufs, T>
{ {
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result { fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let nl = if self.diff.newline_terminated() { for (idx, change) in self.iter_changes().enumerate() {
"" if idx == 0 {
} else {
"\n"
};
let mut wrote_header = false;
for change in self.iter_changes() {
if !wrote_header {
writeln!(f, "{}", self.header())?; writeln!(f, "{}", self.header())?;
wrote_header = true;
} }
write!( write!(f, "{}{}", change.tag(), change.to_string_lossy())?;
f, if !self.diff.newline_terminated() {
"{}{}{}", writeln!(f)?;
match change.tag() { }
ChangeTag::Equal => ' ',
ChangeTag::Delete => '-',
ChangeTag::Insert => '+',
},
change.as_str_lossy(),
nl
)?;
if change.missing_newline() { if change.missing_newline() {
if self.missing_newline_hint { writeln!(f, "{}", MissingNewlineHint(self.missing_newline_hint))?;
writeln!(f, "\n\\ No newline at end of file")?;
} else {
writeln!(f)?;
}
} }
} }
Ok(()) Ok(())

55
src/text/utils.rs Normal file
View file

@ -0,0 +1,55 @@
use std::collections::HashMap;
use std::hash::Hash;
use super::DiffableStrRef;
// quick and dirty way to get an upper sequence ratio.
pub fn upper_seq_ratio<T: PartialEq>(seq1: &[T], seq2: &[T]) -> f32 {
let n = seq1.len() + seq2.len();
if n == 0 {
1.0
} else {
2.0 * seq1.len().min(seq2.len()) as f32 / n as f32
}
}
/// Internal utility to calculate an upper bound for a ratio for
/// [`get_close_matches`]. This is based on Python's difflib approach
/// of considering the two sets to be multisets.
///
/// It counts the number of matches without regard to order, which is an
/// obvious upper bound.
pub struct QuickSeqRatio<'a, T: DiffableStrRef + ?Sized>(HashMap<&'a T, i32>);
impl<'a, T: DiffableStrRef + Hash + Eq + ?Sized> QuickSeqRatio<'a, T> {
pub fn new(seq: &[&'a T]) -> QuickSeqRatio<'a, T> {
let mut counts = HashMap::new();
for &word in seq {
*counts.entry(word).or_insert(0) += 1;
}
QuickSeqRatio(counts)
}
pub fn calc(&self, seq: &[&T]) -> f32 {
let n = self.0.len() + seq.len();
if n == 0 {
return 1.0;
}
let mut available = HashMap::new();
let mut matches = 0;
for &word in seq {
let x = if let Some(count) = available.get(&word) {
*count
} else {
self.0.get(&word).copied().unwrap_or(0)
};
available.insert(word, x - 1);
if x > 0 {
matches += 1;
}
}
2.0 * matches as f32 / n as f32
}
}