Some internal refactorings

This commit is contained in:
Armin Ronacher 2021-02-02 20:15:31 +01:00
parent 4b85e70f91
commit 34e5b3d571
8 changed files with 174 additions and 157 deletions

View file

@ -44,7 +44,7 @@ fn main() {
style(Line(change.new_index())).dim(),
s.apply_to(sign).bold(),
);
for (emphasized, value) in change.iter_strings() {
for (emphasized, value) in change.iter_strings_lossy() {
if emphasized {
print!("{}", s.apply_to(value).underlined().on_black());
} else {

View file

@ -155,34 +155,6 @@ impl DiffOp {
}
}
/// A [`DiffHook`] that captures all diff operations.
///
/// The captured operations are kept in the order they were stored in the
/// wrapped vector.
#[derive(Default, Clone)]
pub struct Capture(Vec<DiffOp>);

impl Capture {
    /// Creates a new, empty capture hook.
    pub fn new() -> Capture {
        Self::default()
    }

    /// Consumes the capture hook and hands out the vector of ops.
    pub fn into_ops(self) -> Vec<DiffOp> {
        let Capture(ops) = self;
        ops
    }

    /// Isolate change clusters by eliminating ranges with no changes.
    ///
    /// This is equivalent to calling [`group_diff_ops`] on [`Capture::into_ops`].
    pub fn into_grouped_ops(self, n: usize) -> Vec<Vec<DiffOp>> {
        let ops = self.into_ops();
        group_diff_ops(ops, n)
    }

    /// Borrows the captured operations as a slice.
    pub fn ops(&self) -> &[DiffOp] {
        self.0.as_slice()
    }
}
/// Isolate change clusters by eliminating ranges with no changes.
///
/// This will leave holes behind in long periods of equal ranges so that
@ -272,6 +244,34 @@ pub fn get_diff_ratio(ops: &[DiffOp], old_len: usize, new_len: usize) -> f32 {
}
}
/// A [`DiffHook`] that captures all diff operations.
///
/// The captured operations are kept in the order they were stored in the
/// wrapped vector.
#[derive(Default, Clone)]
pub struct Capture(Vec<DiffOp>);

impl Capture {
    /// Creates a new, empty capture hook.
    pub fn new() -> Capture {
        Self::default()
    }

    /// Consumes the capture hook and hands out the vector of ops.
    pub fn into_ops(self) -> Vec<DiffOp> {
        let Capture(ops) = self;
        ops
    }

    /// Isolate change clusters by eliminating ranges with no changes.
    ///
    /// This is equivalent to calling [`group_diff_ops`] on [`Capture::into_ops`].
    pub fn into_grouped_ops(self, n: usize) -> Vec<Vec<DiffOp>> {
        let ops = self.into_ops();
        group_diff_ops(ops, n)
    }

    /// Borrows the captured operations as a slice.
    pub fn ops(&self) -> &[DiffOp] {
        self.0.as_slice()
    }
}
impl DiffHook for Capture {
type Error = Infallible;

View file

@ -23,9 +23,9 @@ mod replace;
use std::hash::Hash;
use std::ops::{Index, Range};
pub use capture::*;
pub use hook::*;
pub use replace::*;
pub use capture::{get_diff_ratio, group_diff_ops, Capture, DiffOp, DiffTag};
pub use hook::DiffHook;
pub use replace::Replace;
// actual diffing algorithms
pub mod myers;

View file

@ -6,6 +6,13 @@ use std::hash::Hash;
use std::ops::Range;
/// Reference to a [`DiffableStr`].
///
/// This trait exists because, while the library only really provides ways to
/// work with `&str` and `&[u8]`, there are types that deref into those string
/// slices, such as `String` and `Vec<u8>`.
///
/// This trait is used in the library whenever it's nice to be able to pass
/// strings of different types in.
pub trait DiffableStrRef {
/// The type of the resolved [`DiffableStr`].
type Output: DiffableStr + ?Sized;
@ -78,7 +85,7 @@ pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
fn as_str(&self) -> Option<&str>;
/// Decodes the string (potentially) lossy.
fn as_str_lossy(&self) -> Cow<'_, str>;
fn to_string_lossy(&self) -> Cow<'_, str>;
/// Checks if the string ends in a newline.
fn ends_with_newline(&self) -> bool;
@ -91,6 +98,11 @@ pub trait DiffableStr: Hash + PartialEq + PartialOrd + Ord + Eq + ToOwned {
/// Returns the strings as slice of raw bytes.
fn as_bytes(&self) -> &[u8];
/// Checks if the string is empty.
///
/// The default implementation reports `true` exactly when `len()` is zero.
fn is_empty(&self) -> bool {
    matches!(self.len(), 0)
}
}
impl DiffableStr for str {
@ -184,7 +196,7 @@ impl DiffableStr for str {
Some(self)
}
fn as_str_lossy(&self) -> Cow<'_, str> {
fn to_string_lossy(&self) -> Cow<'_, str> {
Cow::Borrowed(self)
}
@ -293,7 +305,7 @@ impl DiffableStr for [u8] {
std::str::from_utf8(self).ok()
}
fn as_str_lossy(&self) -> Cow<'_, str> {
fn to_string_lossy(&self) -> Cow<'_, str> {
String::from_utf8_lossy(self)
}

View file

@ -121,6 +121,10 @@ impl<'s, T: DiffableStr + ?Sized> InlineChange<'s, T> {
///
/// Each item is a tuple in the form `(emphasized, value)` where `emphasized`
/// is true if it should be highlighted as an inline diff.
///
/// Depending on the type of the underlying [`DiffableStr`] this value is
/// more or less useful. If you always want to have a utf-8 string it's
/// better to use the [`InlineChange::iter_strings_lossy`] method.
pub fn values(&self) -> &[(bool, &'s T)] {
&self.values
}
@ -129,10 +133,10 @@ impl<'s, T: DiffableStr + ?Sized> InlineChange<'s, T> {
///
/// Each item is a tuple in the form `(emphasized, value)` where `emphasized`
/// is true if it should be highlighted as an inline diff.
pub fn iter_strings(&self) -> impl Iterator<Item = (bool, Cow<'_, str>)> {
pub fn iter_strings_lossy(&self) -> impl Iterator<Item = (bool, Cow<'_, str>)> {
self.values()
.iter()
.map(|(emphasized, raw_value)| (*emphasized, raw_value.as_str_lossy()))
.map(|(emphasized, raw_value)| (*emphasized, raw_value.to_string_lossy()))
}
/// Returns `true` if this change needs to be followed up by a
@ -156,7 +160,7 @@ impl<'s, T: DiffableStr + ?Sized> From<Change<'s, T>> for InlineChange<'s, T> {
impl<'s, T: DiffableStr + ?Sized> fmt::Display for InlineChange<'s, T> {
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
for (emphasized, value) in self.iter_strings() {
for (emphasized, value) in self.iter_strings_lossy() {
let marker = match (emphasized, self.tag) {
(false, _) | (true, ChangeTag::Equal) => "",
(true, ChangeTag::Delete) => "-",

View file

@ -87,22 +87,22 @@
#![cfg(feature = "text")]
use std::borrow::Cow;
use std::cmp::Reverse;
use std::collections::{BinaryHeap, HashMap};
use std::collections::BinaryHeap;
use std::fmt;
use std::hash::Hash;
mod abstraction;
#[cfg(feature = "inline")]
mod inline;
mod udiff;
mod utils;
pub use self::abstraction::{DiffableStr, DiffableStrRef};
#[cfg(feature = "inline")]
pub use self::inline::*;
pub use self::udiff::*;
pub use crate::text::abstraction::*;
pub use self::inline::InlineChange;
pub use self::udiff::{unified_diff, UnifiedDiff, UnifiedHunkHeader};
use self::utils::{upper_seq_ratio, QuickSeqRatio};
use crate::algorithms::{
capture_diff_slices, get_diff_ratio, group_diff_ops, Algorithm, DiffOp, DiffTag,
};
@ -250,15 +250,6 @@ impl TextDiffConfig {
}
}
/// Captures diff op codes for textual diffs
///
/// The lifetimes `'old` and `'new` bound the references to the individual
/// items of each side, while `'bufs` bounds the (possibly borrowed) slices
/// that hold those references.
pub struct TextDiff<'old, 'new, 'bufs, T: DiffableStr + ?Sized> {
// Items of the old side; the slice itself is borrowed or owned via `Cow`.
old: Cow<'bufs, [&'old T]>,
// Items of the new side; the slice itself is borrowed or owned via `Cow`.
new: Cow<'bufs, [&'new T]>,
// Diff operations captured for this diff.
ops: Vec<DiffOp>,
// NOTE(review): presumably records whether the items keep their trailing
// newline — confirm against the constructor.
newline_terminated: bool,
// NOTE(review): presumably the algorithm that produced `ops` — confirm.
algorithm: Algorithm,
}
/// The tag of a change.
#[derive(Debug, PartialEq, Eq, Hash, Clone, Copy, Ord, PartialOrd)]
pub enum ChangeTag {
@ -270,6 +261,20 @@ pub enum ChangeTag {
Insert,
}
/// Renders the tag as its one-character diff sign: a space for
/// [`ChangeTag::Equal`], `-` for [`ChangeTag::Delete`] and `+` for
/// [`ChangeTag::Insert`].
impl fmt::Display for ChangeTag {
    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
        let sign = match *self {
            ChangeTag::Equal => ' ',
            ChangeTag::Delete => '-',
            ChangeTag::Insert => '+',
        };
        write!(f, "{}", sign)
    }
}
/// Represents the expanded textual change.
///
/// This type is returned from the [`TextDiff::iter_changes`] method. It
@ -289,7 +294,7 @@ impl<'s, T: DiffableStr + ?Sized> fmt::Display for Change<'s, T> {
write!(
f,
"{}{}",
self.as_str_lossy(),
self.to_string_lossy(),
if self.missing_newline { "\n" } else { "" }
)
}
@ -312,6 +317,10 @@ impl<'s, T: DiffableStr + ?Sized> Change<'s, T> {
}
/// Returns the underlying changed value.
///
/// Depending on the type of the underlying [`DiffableStr`] this value is
/// more or less useful. If you always want to have a utf-8 string it's
/// best to use the [`Change::as_str`] and [`Change::to_string_lossy`] methods.
pub fn value(&self) -> &'s T {
self.value
}
@ -322,8 +331,8 @@ impl<'s, T: DiffableStr + ?Sized> Change<'s, T> {
}
/// Returns the value (lossy) decoded as utf-8 string.
pub fn as_str_lossy(&self) -> Cow<'s, str> {
T::as_str_lossy(self.value)
pub fn to_string_lossy(&self) -> Cow<'s, str> {
T::to_string_lossy(self.value)
}
/// Returns `true` if this change needs to be followed up by a
@ -336,6 +345,15 @@ impl<'s, T: DiffableStr + ?Sized> Change<'s, T> {
}
}
/// Captures diff op codes for textual diffs
///
/// The lifetimes `'old` and `'new` bound the references to the individual
/// items of each side, while `'bufs` bounds the (possibly borrowed) slices
/// that hold those references.
pub struct TextDiff<'old, 'new, 'bufs, T: DiffableStr + ?Sized> {
// Items of the old side; the slice itself is borrowed or owned via `Cow`.
old: Cow<'bufs, [&'old T]>,
// Items of the new side; the slice itself is borrowed or owned via `Cow`.
new: Cow<'bufs, [&'new T]>,
// Diff operations captured for this diff.
ops: Vec<DiffOp>,
// NOTE(review): presumably records whether the items keep their trailing
// newline — confirm against the constructor.
newline_terminated: bool,
// NOTE(review): presumably the algorithm that produced `ops` — confirm.
algorithm: Algorithm,
}
impl<'old, 'new, 'bufs> TextDiff<'old, 'new, 'bufs, str> {
/// Configures a text differ before diffing.
pub fn configure() -> TextDiffConfig {
@ -571,58 +589,7 @@ impl<'old, 'new, 'bufs, T: DiffableStr + ?Sized + 'old + 'new> TextDiff<'old, 'n
/// is currently not defined and will likely change over time.
#[cfg(feature = "inline")]
pub fn iter_inline_changes(&self, op: &DiffOp) -> impl Iterator<Item = InlineChange<'_, T>> {
iter_inline_changes(self, op)
}
}
/// Quick and dirty upper bound for the ratio of two sequences.
///
/// Only the lengths are inspected: the result is twice the shorter length
/// divided by the combined length, or `1.0` when both sequences are empty.
fn upper_seq_ratio<T: PartialEq>(seq1: &[T], seq2: &[T]) -> f32 {
    let (len1, len2) = (seq1.len(), seq2.len());
    match len1 + len2 {
        0 => 1.0,
        total => 2.0 * len1.min(len2) as f32 / total as f32,
    }
}
/// Internal utility to calculate an upper bound for a ratio for
/// [`get_close_matches`]. This is based on Python's difflib approach
/// of considering the two sets to be multisets.
///
/// It counts the number of matches without regard to order, which is an
/// obvious upper bound.
struct QuickSeqRatio<'a, T: DiffableStrRef + ?Sized>(HashMap<&'a T, i32>);
impl<'a, T: DiffableStrRef + Hash + Eq + ?Sized> QuickSeqRatio<'a, T> {
pub fn new(seq: &[&'a T]) -> QuickSeqRatio<'a, T> {
let mut counts = HashMap::new();
for &word in seq {
*counts.entry(word).or_insert(0) += 1;
}
QuickSeqRatio(counts)
}
pub fn calc(&self, seq: &[&T]) -> f32 {
let n = self.0.len() + seq.len();
if n == 0 {
return 1.0;
}
let mut available = HashMap::new();
let mut matches = 0;
for &word in seq {
let x = if let Some(count) = available.get(&word) {
*count
} else {
self.0.get(&word).copied().unwrap_or(0)
};
available.insert(word, x - 1);
if x > 0 {
matches += 1;
}
}
2.0 * matches as f32 / n as f32
inline::iter_inline_changes(self, op)
}
}
@ -738,7 +705,7 @@ fn test_line_ops() {
.flat_map(|op| byte_diff.iter_changes(op))
.collect::<Vec<_>>();
for (change, byte_change) in changes.iter().zip(byte_changes.iter()) {
assert_eq!(change.as_str_lossy(), byte_change.as_str_lossy());
assert_eq!(change.to_string_lossy(), byte_change.to_string_lossy());
}
}
}

View file

@ -19,17 +19,28 @@
//! The [`UnifiedDiff`] type supports both unicode and byte diffs for all
//! types compatible with [`DiffableStr`]. You can pick between the two
//! versions by using [`UnifiedDiff.to_string`] or [`UnifiedDiff.to_writer`].
//! The former uses [`DiffableStr::as_str_lossy`], the latter uses
//! The former uses [`DiffableStr::to_string_lossy`], the latter uses
//! [`DiffableStr::as_bytes`] for each line.
use std::ops::Range;
use std::{fmt, io};
use crate::algorithms::{Algorithm, DiffOp};
use crate::text::{Change, ChangeTag, TextDiff};
use crate::text::{Change, TextDiff};
use super::DiffableStr;
/// Display helper that renders the "missing newline" marker of a unified
/// diff when the wrapped flag is `true` and renders nothing otherwise.
struct MissingNewlineHint(bool);

impl fmt::Display for MissingNewlineHint {
    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
        match self.0 {
            // The leading newline terminates the line that lacked one.
            true => f.write_str("\n\\ No newline at end of file"),
            false => Ok(()),
        }
    }
}
/// Pair of `usize` values used for the range of a unified diff hunk.
// NOTE(review): the meaning of the two fields (presumably start and end) is
// not visible in this chunk — confirm against the type's accessors.
#[derive(Copy, Clone, Debug)]
struct UnifiedDiffHunkRange(usize, usize);
@ -103,7 +114,7 @@ impl fmt::Display for UnifiedHunkHeader {
/// The [`UnifiedDiff`] type supports both unicode and byte diffs for all
/// types compatible with [`DiffableStr`]. You can pick between the two
/// versions by using [`UnifiedDiff.to_string`] or [`UnifiedDiff.to_writer`].
/// The former uses [`DiffableStr::as_str_lossy`], the latter uses
/// The former uses [`DiffableStr::to_string_lossy`], the latter uses
/// [`DiffableStr::as_bytes`] for each line.
pub struct UnifiedDiff<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> {
diff: &'diff TextDiff<'old, 'new, 'bufs, T>,
@ -238,31 +249,17 @@ impl<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized>
/// Write the hunk as bytes to the output stream.
pub fn to_writer<W: io::Write>(&self, mut w: W) -> Result<(), io::Error> {
let mut wrote_header = false;
for change in self.iter_changes() {
if !wrote_header {
for (idx, change) in self.iter_changes().enumerate() {
if idx == 0 {
writeln!(w, "{}", self.header())?;
wrote_header = true;
}
write!(
w,
"{}",
match change.tag() {
ChangeTag::Equal => ' ',
ChangeTag::Delete => '-',
ChangeTag::Insert => '+',
},
)?;
write!(w, "{}", change.tag())?;
w.write_all(change.value().as_bytes())?;
if self.diff.newline_terminated() {
write!(w, "\n")?;
if !self.diff.newline_terminated() {
writeln!(w)?;
}
if change.missing_newline() {
if self.missing_newline_hint {
writeln!(w, "\n\\ No newline at end of file")?;
} else {
writeln!(w)?;
}
writeln!(w, "{}", MissingNewlineHint(self.missing_newline_hint))?;
}
}
Ok(())
@ -273,34 +270,16 @@ impl<'diff, 'old, 'new, 'bufs, T: DiffableStr + ?Sized> fmt::Display
for UnifiedDiffHunk<'diff, 'old, 'new, 'bufs, T>
{
fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
let nl = if self.diff.newline_terminated() {
""
} else {
"\n"
};
let mut wrote_header = false;
for change in self.iter_changes() {
if !wrote_header {
for (idx, change) in self.iter_changes().enumerate() {
if idx == 0 {
writeln!(f, "{}", self.header())?;
wrote_header = true;
}
write!(
f,
"{}{}{}",
match change.tag() {
ChangeTag::Equal => ' ',
ChangeTag::Delete => '-',
ChangeTag::Insert => '+',
},
change.as_str_lossy(),
nl
)?;
write!(f, "{}{}", change.tag(), change.to_string_lossy())?;
if !self.diff.newline_terminated() {
writeln!(f)?;
}
if change.missing_newline() {
if self.missing_newline_hint {
writeln!(f, "\n\\ No newline at end of file")?;
} else {
writeln!(f)?;
}
writeln!(f, "{}", MissingNewlineHint(self.missing_newline_hint))?;
}
}
Ok(())

55
src/text/utils.rs Normal file
View file

@ -0,0 +1,55 @@
use std::collections::HashMap;
use std::hash::Hash;
use super::DiffableStrRef;
/// Quick and dirty upper bound for the ratio of two sequences.
///
/// Only the lengths are inspected: the result is twice the shorter length
/// divided by the combined length, or `1.0` when both sequences are empty.
pub fn upper_seq_ratio<T: PartialEq>(seq1: &[T], seq2: &[T]) -> f32 {
    let (len1, len2) = (seq1.len(), seq2.len());
    match len1 + len2 {
        0 => 1.0,
        total => 2.0 * len1.min(len2) as f32 / total as f32,
    }
}
/// Internal utility to calculate an upper bound for a ratio for
/// [`get_close_matches`]. This is based on Python's difflib approach
/// of considering the two sets to be multisets.
///
/// It counts the number of matches without regard to order, which is an
/// obvious upper bound.
pub struct QuickSeqRatio<'a, T: DiffableStrRef + ?Sized> {
    /// Number of occurrences of each item in the reference sequence.
    counts: HashMap<&'a T, i32>,
    /// Total number of items in the reference sequence, duplicates included.
    len: usize,
}

impl<'a, T: DiffableStrRef + Hash + Eq + ?Sized> QuickSeqRatio<'a, T> {
    /// Builds the occurrence table for the reference sequence `seq`.
    pub fn new(seq: &[&'a T]) -> QuickSeqRatio<'a, T> {
        let mut counts = HashMap::new();
        for &word in seq {
            *counts.entry(word).or_insert(0) += 1;
        }
        QuickSeqRatio {
            counts,
            len: seq.len(),
        }
    }

    /// Returns an upper bound for the ratio between the reference sequence
    /// and `seq`.
    ///
    /// The result is `2 * matches / (reference length + seq.len())`, where
    /// `matches` counts the items of `seq` that can still be paired with an
    /// unused occurrence in the reference sequence. Returns `1.0` when both
    /// sequences are empty.
    pub fn calc(&self, seq: &[&T]) -> f32 {
        // The denominator must use the full length of the reference sequence.
        // Using the number of distinct items instead would inflate the bound
        // (even beyond 1.0) whenever the reference contains repeated items.
        let n = self.len + seq.len();
        if n == 0 {
            return 1.0;
        }

        // Remaining occurrences per item, filled lazily from `counts` the
        // first time an item is seen in `seq`.
        let mut available = HashMap::new();
        let mut matches = 0;
        for &word in seq {
            let remaining = available
                .entry(word)
                .or_insert_with(|| self.counts.get(&word).copied().unwrap_or(0));
            if *remaining > 0 {
                matches += 1;
            }
            *remaining -= 1;
        }

        2.0 * matches as f32 / n as f32
    }
}