~reesmichael1/nim-simplediff

f4818c49bf8f65c452b1780c319b2f3e2d8173cd — Michael Rees 3 years ago
Implement simplediff in Nim along with stringDiff wrapper
6 files changed, 196 insertions(+), 0 deletions(-)

A .gitignore
A README.md
A simplediff.nimble
A src/simplediff.nim
A tests/config.nims
A tests/testDiff.nim
A  => .gitignore +3 -0
@@ 1,3 @@
# Ignore everything by default...
*
# ...but keep descending into directories...
!/**/
# ...and keep any file whose name contains a dot. The net effect is that
# extensionless files are ignored.
!*.*

A  => README.md +34 -0
@@ 1,34 @@
# simplediff

A [Nim](https://nim-lang.org) implementation of a simple diff algorithm, based on [Paul Butler's `simplediff`](https://github.com/paulgb/simplediff).

## Usage

`simplediff` provides a `diff` proc which takes two `openArray`s and generates a `seq` of "instructions" to turn the first into the second. Each "instruction" is of the `Diff` type, whose `kind` is either an `Insertion`, a `Deletion`, or a `NoChange`. Each `Diff` also has a `tokens` field, which contains the subsequence of elements that the instruction inserts, deletes, or leaves unchanged.

For example:

```
import simplediff

echo diff([1, 2, 3], [1, 2])
# @[Diff(kind: NoChange, tokens: @[1, 2]), Diff(kind: Deletion, tokens: @[3])]
```

Any type that implements both the `==` operator and `hash` can be used (elements are stored as `Table` keys internally).

`simplediff` also provides a convenience wrapper for diffing two strings. By default, the strings are split into lines for diffing, but this can be changed with the `seps` parameter.

```
import simplediff

for diff in stringDiff("the word is blue", "the word is red", seps={' '}):
  echo diff
# Diff(kind: NoChange, tokens: @["the", "word", "is"])
# Diff(kind: Deletion, tokens: @["blue"])
# Diff(kind: Insertion, tokens: @["red"])
```

## Contributing

Contributions are welcome! Please send patches, questions, requests, etc. to my [public inbox](mailto:~reesmichael1/public-inbox@lists.sr.ht).

A  => simplediff.nimble +12 -0
@@ 1,12 @@
# Package
# Metadata describing the simplediff package.

version       = "0.1.0"
author        = "Michael Rees"
description   = "A library for straightforward calculation of string differences"
license       = "GPL-3.0"
# The library module lives under src/ (src/simplediff.nim).
srcDir        = "src"
# NOTE(review): presumably restricts installation to .nim files - confirm
# against the Nimble documentation.
installExt    = @["nim"]

# Dependencies
# Minimum compiler version this package declares support for.

requires "nim >= 1.0"

A  => src/simplediff.nim +73 -0
@@ 1,73 @@
import strutils
import tables


type
  ChangeType* = enum ## The kind of change a `Diff` instruction describes.
    Insertion, Deletion, NoChange

  Diff*[T] = object ## One diff instruction: a change kind plus its elements.
    kind*: ChangeType ## Whether `tokens` are inserted, deleted or unchanged.
    tokens*: seq[T] ## The run of elements this instruction covers.


proc diff*[T](itemsOld, itemsNew: openArray[T]): seq[Diff[T]] =
  ## Compute the instructions that turn `itemsOld` into `itemsNew`.
  ##
  ## The longest contiguous run of elements common to both inputs is kept as
  ## a `NoChange`, and the elements on either side of it are diffed
  ## recursively. If the inputs have no element in common, the result is a
  ## `Deletion` of `itemsOld` (when non-empty) followed by an `Insertion` of
  ## `itemsNew` (when non-empty); two empty inputs give an empty seq.
  ##
  ## This is a greedy heuristic: the instructions are always valid, but they
  ## are not guaranteed to be the shortest possible edit script.
  ##
  ## `T` is used as a `Table` key, so it must provide `hash` as well as `==`.

  # Map every distinct value of itemsOld to the indices where it occurs.
  var oldIndexMap: Table[T, seq[int]]
  for ix, item in itemsOld:
    oldIndexMap.mgetOrPut(item, newSeq[int]()).add(ix)

  # Start (in each input) and length of the longest common run found so far.
  var subStartOld = 0
  var subStartNew = 0
  var subLength = 0

  # overlap[ixOld] is the length of the common run that ends at
  # itemsOld[ixOld] and at the previously visited element of itemsNew.
  var overlap: Table[int, int]
  for ixNew, value in itemsNew:
    var overlapTemp: Table[int, int]
    for ixOld in oldIndexMap.getOrDefault(value):
      # Extend the run ending just before ixOld. A missing entry (including
      # the never-stored key -1 when ixOld == 0) reads as 0, so a fresh run
      # starts at length 1.
      let runLen = overlap.getOrDefault(ixOld - 1, 0) + 1
      overlapTemp[ixOld] = runLen
      if runLen > subLength:
        subLength = runLen
        subStartOld = ixOld - runLen + 1
        subStartNew = ixNew - runLen + 1
    # Hand the new row over without copying the table; the previous row is
    # dropped together with overlapTemp at the end of this iteration.
    swap(overlap, overlapTemp)

  if subLength == 0:
    # Nothing in common: delete everything old, insert everything new.
    if itemsOld.len > 0:
      result.add(Diff[T](kind: Deletion, tokens: @itemsOld))
    if itemsNew.len > 0:
      result.add(Diff[T](kind: Insertion, tokens: @itemsNew))
  else:
    # Keep the common run and diff what lies before and after it.
    let endOld = subStartOld + subLength
    let endNew = subStartNew + subLength
    result = diff(itemsOld[0..<subStartOld], itemsNew[0..<subStartNew])
    result.add(Diff[T](kind: NoChange, tokens: itemsNew[subStartNew..<endNew]))
    result.add(diff(itemsOld[endOld..<itemsOld.len],
      itemsNew[endNew..<itemsNew.len]))


proc stringDiff*(s1, s2: string, seps: set[char] = Newlines): seq[Diff[string]] =
  ## Diff two strings token by token.
  ##
  ## Both strings are cut into tokens and the two token sequences are passed
  ## to `diff`; the result describes how to turn the tokens of `s1` into the
  ## tokens of `s2`.
  ##
  ## With the default `seps` (`Newlines`) the strings are cut into lines with
  ## `splitLines`, so a "\r\n" pair counts as a single line break instead of
  ## producing an empty token between the two characters. For any other
  ## `seps`, every character in the set is an individual separator.
  if seps == Newlines:
    result = diff(splitLines(s1), splitLines(s2))
  else:
    result = diff(split(s1, seps = seps), split(s2, seps = seps))

A  => tests/config.nims +1 -0
@@ 1,1 @@
# Add the sibling src/ directory to the module search path so that the tests
# can `import simplediff`.
switch("path", "$projectDir/../src")
\ No newline at end of file

A  => tests/testDiff.nim +73 -0
@@ 1,73 @@
import strutils
import unittest

import simplediff


suite "test bare diff":
  # The cases below mirror the doctests of simplediff's Python implementation.
  test "bare diff on ints with equal start":
    let expected = [
      Diff[int](kind: NoChange, tokens: @[1]),
      Diff[int](kind: Deletion, tokens: @[2]),
      Diff[int](kind: NoChange, tokens: @[3, 4])
    ]
    check diff(@[1, 2, 3, 4], @[1, 3, 4]) == expected

  test "bare diff on ints with deletion at start":
    let expected = [
      Diff[int](kind: Deletion, tokens: @[1]),
      Diff[int](kind: NoChange, tokens: @[2, 3, 4]),
      Diff[int](kind: Insertion, tokens: @[1])
    ]
    check diff(@[1, 2, 3, 4], @[2, 3, 4, 1]) == expected

  test "bare diff on strings with words for tokens":
    # Both sentences are tokenised on whitespace before diffing.
    let wordsOld = split("The quick brown fox jumps over the lazy dog")
    let wordsNew = split("The slow blue cheese drips over the lazy carrot")
    let expected = [
      Diff[string](kind: NoChange, tokens: @["The"]),
      Diff[string](kind: Deletion,
        tokens: @["quick", "brown", "fox", "jumps"]),
      Diff[string](kind: Insertion,
        tokens: @["slow", "blue", "cheese", "drips"]),
      Diff[string](kind: NoChange, tokens: @["over", "the", "lazy"]),
      Diff[string](kind: Deletion, tokens: @["dog"]),
      Diff[string](kind: Insertion, tokens: @["carrot"]),
    ]
    check diff(wordsOld, wordsNew) == expected


suite "test stringDiff":
  # Default separators: the inputs are compared line by line.
  test "correct diff for identical one-line strings":
    let expected = [Diff[string](kind: NoChange, tokens: @["abc"])]
    check stringDiff("abc", "abc") == expected

  test "correct diff for identical multi-line strings":
    let text = "abc def\n123 456"
    let expected = [
      Diff[string](kind: NoChange, tokens: @["abc def", "123 456"])
    ]
    check stringDiff(text, text) == expected

  test "correct diff for different one-line strings":
    let expected = [
      Diff[string](kind: Deletion, tokens: @["abc"]),
      Diff[string](kind: Insertion, tokens: @["def"])
    ]
    check stringDiff("abc", "def") == expected

  test "correct diff for different multi-line strings":
    let expected = [
      Diff[string](kind: Deletion, tokens: @["abc"]),
      Diff[string](kind: Insertion, tokens: @["abc 123"]),
      Diff[string](kind: NoChange, tokens: @["def"])
    ]
    check stringDiff("abc\ndef", "abc 123\ndef") == expected

  # Custom separators: the inputs are split on the given characters instead.
  test "correct diff when splitting on a different character":
    let expected = [
      Diff[string](kind: Deletion, tokens: @["abc"]),
      Diff[string](kind: Insertion, tokens: @["abc 123"]),
      Diff[string](kind: NoChange, tokens: @["def"])
    ]
    check stringDiff("abc;def", "abc 123;def", seps = {';'}) == expected

  test "correct diff when splitting on multiple characters":
    let expected = [
      Diff[string](kind: NoChange, tokens: @["abc"]),
      Diff[string](kind: Deletion, tokens: @["def", "123"]),
      Diff[string](kind: Insertion, tokens: @["fed", "abc"])
    ]
    check stringDiff("abc;def,123", "abc;fed;abc",
      seps = {';', ','}) == expected