diff --git a/README.md b/README.md
index 1515416..2d33042 100644
--- a/README.md
+++ b/README.md
@@ -1,3 +1,10 @@
-# lcs-diff
+# Toy LCS-based Diff program
 
-Toy Longest-Common-Subsequence based diff and patch program, minimizing the edit script's sum of line-addition and line-deletions.
\ No newline at end of file
+Toy Longest-Common-Subsequence based diff and patch program, minimizing the edit script's sum of line-addition and line-deletions.
+
+Example usage:
+```bash
+python diff.py a1 b1 > patch1
+python patch.py a1 patch1 > c1 # patch is not in-place
+python diff.py b1 c1 # verify that b1 and c1 have the same content (no output)
+```
diff --git a/diff.py b/diff.py
new file mode 100644
index 0000000..823d48f
--- /dev/null
+++ b/diff.py
@@ -0,0 +1,73 @@
+"""Print an LCS-based diff of two text files in the "normal" diff format.
+
+Usage: python diff.py A B
+
+The edit script printed to stdout turns A into B with the fewest line
+additions plus line deletions. patch.py can apply it.
+"""
+import argparse
+
+parser = argparse.ArgumentParser(description="Compare two documents line by line")
+parser.add_argument('a', help="reference document")
+parser.add_argument('b', help="target document")
+args = parser.parse_args()
+
+# Split on '\n' rather than using splitlines(): a trailing newline then shows
+# up as a final empty "line", so its presence or absence is diffed like any
+# other line. The context managers close the files once they are read.
+with open(args.a, 'r') as fa:
+    la = fa.read().split('\n')
+with open(args.b, 'r') as fb:
+    lb = fb.read().split('\n')
+
+# Solve the Longest Common Subsequence (LCS) subproblem with dynamic programming
+# dp[i+1][j+1] stores the LCS length of the prefixes la[:i+1] and lb[:j+1];
+# row 0 and column 0 stand for the empty prefix and stay 0
+dp = [[0] * (len(lb)+1) for _ in range(len(la)+1)]
+for i in range(len(la)):
+    for j in range(len(lb)):
+        if la[i]==lb[j]:
+            dp[i+1][j+1] = dp[i][j] + 1
+        else:
+            dp[i+1][j+1] = max(dp[i][j+1], dp[i+1][j])
+
+# Backtrack to find one scheme to reduce both a and b to their LCS
+# Each entry (oi,i,oj,j) means: delete la[i+1..oi] and add lb[j+1..oj] (0-based, inclusive)
+diffs = []
+i,j = len(la)-1, len(lb)-1
+while i>=0 or j>=0: # current position (i+1, j+1), try to move to (0, 0)
+    oi, oj = i, j
+    while j >= 0 and dp[i+1][j+1]==dp[i+1][j]: # can safely delete lb[j]
+        j -= 1
+    while i >= 0 and dp[i+1][j+1]==dp[i][j+1]: # can safely delete la[i]
+        i -= 1
+    if i==oi and j==oj:
+        # neither line can be dropped without shrinking the LCS, so they match
+        assert dp[i+1][j+1]==dp[i][j]+1 and la[i]==lb[j]
+        i,j = i-1,j-1
+    else:
+        diffs.append((oi,i,oj,j))
+diffs.reverse() # hunks were collected back to front
+
+def describe(oi,i,oj,j):
+    """Print the hunk header (NaM, NdM or NcM form) for one diffs entry."""
+    # by diff convention line numbering starts from 1
+    oi,i,oj,j = oi+1,i+1,oj+1,j+1
+    def intv(x,y): # simplify expression if interval is one line
+        return str(x) if x==y else f"{x},{y}"
+    if i==oi:
+        print(f"{i}a" + intv(j+1,oj))
+    elif j==oj:
+        print(intv(i+1,oi) + f"d{j}")
+    else:
+        print(intv(i+1,oi) + "c" + intv(j+1,oj))
+
+for oi,i,oj,j in diffs:
+    # delete (i,oi], add (j,oj]
+    describe(oi,i,oj,j)
+    for p in range(i+1,oi+1):
+        print("< " + la[p])
+    if oi>i and oj>j:
+        print("---")
+    for p in range(j+1,oj+1):
+        print("> " + lb[p])
diff --git a/patch.py b/patch.py
new file mode 100644
index 0000000..ecbe1fc
--- /dev/null
+++ b/patch.py
@@ -0,0 +1,87 @@
+"""Apply a diff produced by diff.py to a document.
+
+Usage: python patch.py ORIGINAL PATCH
+
+The patched document is printed to stdout; ORIGINAL itself is not modified.
+"""
+import argparse
+
+parser = argparse.ArgumentParser(description="Prints to stdout the patched document")
+parser.add_argument('original', help='original document')
+parser.add_argument('patch', help='patch file')
+args = parser.parse_args()
+
+with open(args.original,'r') as fa:
+    la = fa.read().split('\n')
+with open(args.patch,'r') as fp:
+    lp = fp.read().split('\n')
+# a workspace for us to edit: ws[k] is the slot of 1-based line k of the original, a list that
+# 1) originally contains that line, 2) may be emptied if that line is removed, 3) may be where
+# we insert new content. ws[0] is an extra, initially empty slot for content inserted before the
+# first line ("0a..." hunks); indexing the lines from 0 instead would send such a hunk to
+# index -1, i.e. after the last line.
+ws = [[]] + [[l] for l in la]
+
+def str2intv(s):
+    """Parse "N" or "N,M" (1-based, inclusive) into the half-open slot range (N, M+1)."""
+    if "," in s:
+        x, y = s.split(',')
+    else:
+        x, y = s, s
+    return int(x), int(y)+1
+
+def ws_rm(desc, lp, nb):
+    """Empty the slots named by desc after checking them against the "< " lines of the patch.
+
+    nb is the index in lp of the hunk header. Returns the index in lp right after the last
+    "< " line, and the first slot that was emptied.
+    """
+    l,r = str2intv(desc)
+    for p in range(l,r):
+        nb += 1
+        assert lp[nb][:2] == "< "
+        assert ws[p][0] == lp[nb][2:],\
+            f"Patch file wants to delete '{lp[nb][2:]}', but originally line {p} is '{ws[p][0]}'"
+        ws[p] = list()
+    return nb + 1, l
+
+def ws_insert(p, desc, lp, nb):
+    """Append the "> " lines of a hunk to slot p; desc only determines how many there are.
+
+    nb is the index in lp of the line just before the first "> " line. Returns the index in lp
+    right after the last "> " line.
+    """
+    l,r = str2intv(desc)
+    for _ in range(l,r):
+        nb += 1
+        assert lp[nb][:2] == "> "
+        ws[p].append(lp[nb][2:])
+    return nb + 1
+
+nb = 0
+while nb < len(lp):
+    line = lp[nb]
+    if line:
+        assert line[0] != '<' and line[0] != '>', f"Expected description at line {nb}, but got {line}"
+    else: # empty line, e.g. the trailing one left by the final newline of the patch file
+        nb += 1
+        continue
+    idx_d = line.find('d')
+    if idx_d != -1:
+        nb, _ = ws_rm(line[:idx_d], lp, nb)
+        continue
+    idx_a = line.find('a')
+    if idx_a != -1:
+        p = int(line[:idx_a]) # "Na..." adds after 1-based line N, i.e. into slot N
+        nb = ws_insert(p, line[idx_a+1:], lp, nb)
+        continue
+    idx_c = line.find('c')
+    if idx_c != -1:
+        nb, p = ws_rm(line[:idx_c], lp, nb)
+        nb = ws_insert(p, line[idx_c+1:], lp, nb)
+        continue
+    # raise instead of `assert False`: under `python -O` a stripped assert would loop forever here
+    raise ValueError(f"Unreachable. Cannot parse line {nb}: {line}")
+
+flatws = [line for slot in ws for line in slot]
+print('\n'.join(flatws), end="") # Don't want to print additional new line