f4818c49bf8f65c452b1780c319b2f3e2d8173cd
—
Michael Rees
3 years ago

Implement simplediff in Nim along with stringDiff wrapper

6 files changed, 196 insertions(+), 0 deletions(-) A .gitignore A README.md A simplediff.nimble A src/simplediff.nim A tests/config.nims A tests/testDiff.nim

A => .gitignore +3 -0

@@ 1,3 @@* !/**/ !*.*

A => README.md +34 -0

@@ 1,34 @@# simplediff A [Nim](https://nim-lang.org) implementation of a simple diff algorithm, based on [Paul Butler's `simplediff`](https://github.com/paulgb/simplediff). ## Usage `simplediff` provides a `diff` proc which takes two `openArray`s and generates a `seq` of "instructions" to turn the first into the second. Each "instruction" is a `Diff` whose `kind` is `Insertion`, `Deletion`, or `NoChange`. Each `Diff` also has a `tokens` field, which contains the run of consecutive elements that should be inserted, deleted, or left unchanged. For example: ``` import simplediff echo diff([1, 2, 3], [1, 2]) # @[Diff(kind: NoChange, tokens: @[1, 2]), Diff(kind: Deletion, tokens: @[3])] ``` Any type that implements the `==` operator and a `hash` proc can be used (the elements are used as `Table` keys internally). `simplediff` also provides a convenience wrapper for diffing two strings. By default, the strings are split into lines for diffing, but this can be changed with the `seps` parameter. ``` import simplediff for diff in stringDiff("the word is blue", "the word is red", seps={' '}): echo diff # Diff(kind: NoChange, tokens: @["the", "word", "is"]) # Diff(kind: Deletion, tokens: @["blue"]) # Diff(kind: Insertion, tokens: @["red"]) ``` ## Contributing Contributions are welcome! Please send patches, questions, requests, etc. to my [public inbox](mailto:~reesmichael1/public-inbox@lists.sr.ht).

A => simplediff.nimble +12 -0

@@ 1,12 @@# Package version = "0.1.0" author = "Michael Rees" description = "A library for straightforward calculation of string differences" license = "GPL-3.0" srcDir = "src" installExt = @["nim"] # Dependencies requires "nim >= 1.0"

A => src/simplediff.nim +73 -0

import strutils
import tables

type
  ChangeType* = enum
    ## The kind of edit a `Diff` instruction represents.
    Insertion, Deletion, NoChange

  Diff*[T] = object
    ## One edit instruction: `kind` says what to do with the run of
    ## consecutive elements stored in `tokens`.
    kind*: ChangeType
    tokens*: seq[T]

proc diff*[T](itemsOld, itemsNew: openArray[T]): seq[Diff[T]] =
  ## Find the differences between two sequences of items.
  ##
  ## Returns a seq of instructions that, applied in order, turn `itemsOld`
  ## into `itemsNew`. The algorithm locates the longest run of consecutive
  ## elements common to both inputs, keeps it as a `NoChange`, and recurses
  ## on the parts before and after it. When no common run exists, the old
  ## items are reported as one `Deletion` and the new items as one
  ## `Insertion`.
  ##
  ## `T` must provide `==` and `hash`, because the elements are used as
  ## `Table` keys.

  # With an empty side there can be no common run, so skip building the
  # index table. The recursion below reaches this case frequently.
  if itemsOld.len == 0 or itemsNew.len == 0:
    if itemsOld.len > 0:
      result.add(Diff[T](kind: Deletion, tokens: @itemsOld))
    if itemsNew.len > 0:
      result.add(Diff[T](kind: Insertion, tokens: @itemsNew))
    return

  # Map every distinct old item to the (ascending) indices where it occurs.
  var oldIndexMap: Table[T, seq[int]]
  for ix, item in itemsOld:
    oldIndexMap.mgetOrPut(item, newSeq[int]()).add(ix)

  var overlap: Table[int, int]
  var subStartOld = 0
  var subStartNew = 0
  var subLength = 0

  # Iterate over each value in the new list. At each iteration,
  # overlap[ix] is the length of the largest suffix of itemsOld[:ix]
  # equal to a suffix of itemsNew[:ixNew].
  #
  # subLength, subStartOld, and subStartNew keep track
  # of the largest substring of the overlapping strings.
  for ixNew, value in itemsNew:
    var overlapTemp: Table[int, int]
    for ixOld in oldIndexMap.getOrDefault(value):
      # A missing predecessor (including ixOld == 0, whose key -1 is never
      # stored) defaults to 0, so a fresh match starts at length 1.
      let suffixLen = overlap.getOrDefault(ixOld - 1, 0) + 1
      overlapTemp[ixOld] = suffixLen
      if suffixLen > subLength:
        subLength = suffixLen
        subStartOld = ixOld - subLength + 1
        subStartNew = ixNew - subLength + 1
    overlap = overlapTemp

  if subLength == 0:
    # If there is no common substring, return a deletion and an insertion.
    # Both inputs are known to be non-empty here.
    result.add(Diff[T](kind: Deletion, tokens: @itemsOld))
    result.add(Diff[T](kind: Insertion, tokens: @itemsNew))
  else:
    # Otherwise, the common substring is left alone and we can find the diff
    # of the elements before and after it.
    let diffBefore = diff(itemsOld[0..<subStartOld], itemsNew[0..<subStartNew])
    let same = itemsNew[subStartNew..<subStartNew+subLength]
    let unchanged = Diff[T](kind: NoChange, tokens: same)
    let diffAfter = diff(itemsOld[subStartOld+subLength..<itemsOld.len],
                         itemsNew[subStartNew+subLength..<itemsNew.len])
    return diffBefore & unchanged & diffAfter

proc stringDiff*(s1, s2: string, seps: set[char] = Newlines): seq[Diff[string]] =
  ## Return the difference between two strings, token by token.
  ##
  ## Both strings are split into tokens and the token lists are passed to
  ## `diff`; the result is a seq of instructions that turn `s1` into `s2`.
  ## By default the tokens are lines. Pass `seps` to split on a different
  ## set of characters, in which case every single separator character
  ## ends a token.
  if seps == Newlines:
    # Splitting on the char set {'\r', '\n'} would treat "\r\n" as two
    # separators and yield a phantom empty token for every CRLF line
    # break; splitLines handles CR, LF and CR-LF as one line ending each.
    return diff(splitLines(s1), splitLines(s2))
  return diff(split(s1, seps = seps), split(s2, seps = seps))

A => tests/config.nims +1 -0

@@ 1,1 @@switch("path", "$projectDir/../src") \ No newline at end of file

A => tests/testDiff.nim +73 -0

import strutils
import unittest

import simplediff

# Each test binds the expected edit script to a named seq first, then
# compares it against the computed result in a single `check`.

suite "test bare diff":
  # These tests are from the doctests in simplediff's Python implementation
  test "bare diff on ints with equal start":
    let expected = @[
      Diff[int](kind: NoChange, tokens: @[1]),
      Diff[int](kind: Deletion, tokens: @[2]),
      Diff[int](kind: NoChange, tokens: @[3, 4])
    ]
    check diff(@[1, 2, 3, 4], @[1, 3, 4]) == expected

  test "bare diff on ints with deletion at start":
    let expected = @[
      Diff[int](kind: Deletion, tokens: @[1]),
      Diff[int](kind: NoChange, tokens: @[2, 3, 4]),
      Diff[int](kind: Insertion, tokens: @[1])
    ]
    check diff(@[1, 2, 3, 4], @[2, 3, 4, 1]) == expected

  test "bare diff on strings with words for tokens":
    let wordsBefore = split("The quick brown fox jumps over the lazy dog")
    let wordsAfter = split("The slow blue cheese drips over the lazy carrot")
    let expected = @[
      Diff[string](kind: NoChange, tokens: @["The"]),
      Diff[string](kind: Deletion, tokens: @["quick", "brown", "fox", "jumps"]),
      Diff[string](kind: Insertion, tokens: @["slow", "blue", "cheese", "drips"]),
      Diff[string](kind: NoChange, tokens: @["over", "the", "lazy"]),
      Diff[string](kind: Deletion, tokens: @["dog"]),
      Diff[string](kind: Insertion, tokens: @["carrot"])
    ]
    check diff(wordsBefore, wordsAfter) == expected

suite "test stringDiff":
  test "correct diff for identical one-line strings":
    let expected = @[
      Diff[string](kind: NoChange, tokens: @["abc"])
    ]
    check stringDiff("abc", "abc") == expected

  test "correct diff for identical multi-line strings":
    let expected = @[
      Diff[string](kind: NoChange, tokens: @["abc def", "123 456"])
    ]
    check stringDiff("abc def\n123 456", "abc def\n123 456") == expected

  test "correct diff for different one-line strings":
    let expected = @[
      Diff[string](kind: Deletion, tokens: @["abc"]),
      Diff[string](kind: Insertion, tokens: @["def"])
    ]
    check stringDiff("abc", "def") == expected

  test "correct diff for different multi-line strings":
    let expected = @[
      Diff[string](kind: Deletion, tokens: @["abc"]),
      Diff[string](kind: Insertion, tokens: @["abc 123"]),
      Diff[string](kind: NoChange, tokens: @["def"])
    ]
    check stringDiff("abc\ndef", "abc 123\ndef") == expected

  test "correct diff when splitting on a different character":
    let expected = @[
      Diff[string](kind: Deletion, tokens: @["abc"]),
      Diff[string](kind: Insertion, tokens: @["abc 123"]),
      Diff[string](kind: NoChange, tokens: @["def"])
    ]
    check stringDiff("abc;def", "abc 123;def", seps = {';'}) == expected

  test "correct diff when splitting on multiple characters":
    let expected = @[
      Diff[string](kind: NoChange, tokens: @["abc"]),
      Diff[string](kind: Deletion, tokens: @["def", "123"]),
      Diff[string](kind: Insertion, tokens: @["fed", "abc"])
    ]
    check stringDiff("abc;def,123", "abc;fed;abc", seps = {';', ','}) == expected