Rsync Algorithm In Python

This is a Python snippet, talking about algorithm, delta, diff and rsync

Rsync Algorithm In Python Add to Favorite

## {{{ Recipe 577022 (r16): Rsync Algorithm In Python 
#!/usr/bin/env python
# -*- coding: utf-8 -*-

__author__ = "James Eric Pruitt"

import os

from cStringIO import StringIO
from zlib import adler32
from collections import deque
from hashlib import md5
from tempfile import mkstemp
from shutil import move

MOD_ADLER = 4294967291 # The 203,280,221st prime.

def adler64(valarray, a = 1, b = 0):
    """
    Alder checksum but with 64 bits and modulus of 4 294 967 291.
    """
    for e in valarray:
        a = (e + a) % MOD_ADLER
        b = (b + a) % MOD_ADLER

    return (b << 32) | a, a, b

def rolladler(removed, new, a, b, blocksize = 4096):
    """
    Allows a new Adler value to be computed when supplied with data from a
    previous calculation.
    """
    a = (a - removed + new) % MOD_ADLER
    b = (b - removed * blocksize - 1 + a) % MOD_ADLER
    return (b << 32) | a, a, b

def generatedelta(iostream, remotesig, blocksize = 4096, iobuffer = 16384):
    """
    Generates a delta from the weak and strong checksums of the recipient file
    that can be applied using the applydelta function. When using this
    function, the blocksize must be the same as the blocksize used whenever
    you the dualchecksums function on the remote file.
    """
    if not((hasattr(iostream, "read"))):
        iostream = StringIO(iostream)

    weaksigs, md5sigs  = remotesig
    skip = 0

    delta = deque((blocksize, ))
    read = iostream.read(blocksize)
    databuffer = deque(read[-1])
    window = deque(map(ord, read))
    lastchecksum, a, b = adler64(window)
    deltasize = 0

    read = iostream.read(iobuffer)
    while read:
        for character in read:
            ascii = ord(character)
            if not skip:
                try:
                    weakhash = weaksigs.index(lastchecksum)
                    goodhash = md5sigs.index(
                        md5(''.join(map(chr, window))).hexdigest(), weakhash)
                    if weakhash == goodhash:
                        if len(databuffer) > 1:
                            databuffer.popleft()
                            delta.append(''.join(databuffer))
                            deltasize += len(databuffer)
                        delta.append(goodhash)
                        deltasize += blocksize
                        skip = blocksize - 1
                except ValueError:
                    pass
            else:
                databuffer = deque()
                skip -= 1

            window.append(ascii)
            oldest = window.popleft()
            databuffer.append(chr(oldest))
            lastchecksum, a, b = rolladler(oldest, ascii, a, b, blocksize)

        read = iostream.read(iobuffer)

    trailingbytes = iostream.tell() - deltasize
    if trailingbytes:
        iostream.seek(-trailingbytes, 1)
        delta.append(iostream.read())

    return delta

def dualchecksums(iostream, blocksize = 4096):
    """
    Calculates the weak and strong checksums for the recipient file.
    """
    if not((hasattr(iostream, "read"))):
        iostream = StringIO(iostream)

    lowhashes = deque()
    md5hashes = deque()
    read = iostream.read(blocksize)

    while read:
        hash, _, _ = adler64(map(ord, read))
        lowhashes.append(hash)
        md5hashes.append(md5(read).hexdigest()) # Using hex digest because
        read = iostream.read(blocksize)         # it takes up less space when
                                                # I serialize it with my code.
    return list(lowhashes), list(md5hashes)

def applydelta(localpath, delta, outputpath = None):
    """
    Applies a generated delta to the given path. If outputpath is specified,
    instead of overwriting the file specified in localpath, the file with the
    delta applied to it will instead be saved to outputpath.
    """
    blocksize = 0
    istream = open(localpath, 'rb')
    ohandle, tmppath = mkstemp(dir = os.path.dirname(localpath))

    for element in delta:
        if isinstance(element, int) and blocksize:
            istream.seek(element * blocksize)
            element = istream.read(blocksize)
        elif not blocksize:
            blocksize = element
            continue

        os.write(ohandle, element)

    istream.close()
    os.close(ohandle)

    if outputpath:
        move(tmppath, outputpath)
    else:
        move(tmppath, localpath)

if __name__ == "__main__":
    print "Testing..."

    # Test our rolling checksum.
    window_i = [15, 78, 12, 41]
    window_s = [78, 12, 41, 54]
    inita64, a_1, b_1 = adler64(window_i)
    shifted, a_2, b_2 = adler64(window_s)
    rolla64, a_s, b_s = rolladler(15, 54, a_1, b_1, blocksize = len(window_i))

    assert rolla64 == shifted
    assert a_2 == a_s
    assert b_2 == b_s

    from random import shuffle

    test_pairs = [
        ("AABBCCDDEEFFGG", "AABBCCDDEEFFGG"),
        ("AABBCCDDEEFFGG", "AABBCCDDEEFXXX"),
        ("AABBCCDDEEFFGG", "AABBCCDDEEFFXX"),
        ("AABBCCDDEEFFGG", "XXXBCCDDEEFFGG"),
        ("AABBCCDDEEFFGG", "XXBBCCDDEEFFGG"),
        ("AABBCCDDEEFFGG", "AABBXXXDEEFFGG"),
        ("AABBCCDDEEFFGG", "XXBBXXCDXEFFGG"),
        ("AABBCCDDEEFFGG", "AABBCCXXDEEFXX")
    ]

    # Fuzz test to hopefully cover most scenarios I might not have
    stack_a = map(ord, "888888887777777666666555554444333221")
    stack_b = list(stack_a)

    for n in xrange(0):
        shuffle(stack_a)
        shuffle(stack_b)
        test_pairs.append((''.join(map(chr, stack_a)),
                           ''.join(map(chr, stack_b))))

    for content_a, content_b in test_pairs:
        ahandle, apath = mkstemp(dir = os.path.dirname(__file__))
        bhandle, bpath = mkstemp(dir = os.path.dirname(__file__))
        _, outtestpath = mkstemp(dir = os.path.dirname(__file__))

        os.write(ahandle, content_a)
        os.write(bhandle, content_b)

        os.close(ahandle)
        os.close(bhandle)

        remote_signature = dualchecksums(open(bpath), 3)
        calculated_delta = generatedelta(open(apath),remote_signature, 3, 12)

        applydelta(bpath, calculated_delta, outtestpath)
        applydelta(bpath, calculated_delta)

        a = open(apath).read()
        b = open(bpath).read()
        o = open(outtestpath).read()

        os.remove(apath)
        os.remove(bpath)
        os.remove(outtestpath)

        try:
            assert a == o
            assert a == b
            print content_a ," converts ",content_b
            print "A:",a
            print "B:",b
            print "O:",o
            print "Delta:", calculated_delta
        except AssertionError:
            print "Test failed."
            exit(1)

    print "Test passed."
## End of recipe 577022 }}}
URL: http://samba.anu.edu.au/rsync/tech_report/node3.html

An implementation of the rsync algorithm in Python. As my rolling checksum, I just summed all of the ascii byte values in a given window. Even with this simple weak rolling checksum, computing and comparing the produced rolling checksums is still terribly slow. I'm fairly certain the speed could be reduced a fair amount but I haven't decided what the most efficient manner of doing this in Python alone would be.

Created by ThePeppersStudio (1689 days, 19.45 hours ago)

Do you want to leave a message? Please login first.