I've been working on my backup strategy for the notebook recently. The idea is to have full backups every month and incremental backups in between, as fine-grained as possible. As it's a mobile device, there's no point in time where it is guaranteed to be up, connected, and within reach of the backup server.
As I'm running Debian GNU/kFreeBSD on it, using ZFS and specifically zfs send comes quite naturally. I'm now generating a new file system snapshot every day (if the notebook happens to be online during that day) using cron:
@daily  zfs snapshot base/root@`date -I`
@daily  zfs snapshot base/home@`date -I`
@reboot zfs snapshot base/root@`date -I`
@reboot zfs snapshot base/home@`date -I`
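Note that date -I prints an ISO 8601 date, so snapshots are simply named by day, and the synchronization script below can detect month boundaries by comparing the first seven characters (YYYY-MM) of two snapshot names. The @reboot entries presumably catch days where the notebook was off at the usual cron time. A quick illustration (the date shown is a made-up example):

$ date -I
2013-06-02
# resulting snapshot name: base/root@2013-06-02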
When connected to the home network I'm synchronizing all incrementals that are not yet on the backup server. This uses zfs send together with gpg to encrypt the data, which is then pushed to some sftp storage. For the first snapshot of every month a full backup is created. As there doesn't seem to be a way to merge zfs send streams without importing everything into a zfs pool, I additionally create incremental streams against the first snapshot of the previous month, so I'm able to delete older full backups and daily snapshots and still keep coarse-grained backups for a longer period of time.
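Conceptually, each upload is just a pipeline like the following (a hand-rolled sketch with made-up snapshot dates; host, directory, and gpg flags are taken from the script's config below, which does the same thing through paramiko's sftp instead of ssh):

# monthly full backup of the first June snapshot
zfs send base/root@2013-06-01 \
    | gpg --batch --compress-algo ZLIB --sign --encrypt \
          --recipient 9FED5C6CE206B70A585770CA965522B9D49AE731 \
    | ssh root@botero.siccegge.de \
          'cat > /srv/backup/mitoraj/root@2013-06-01.full.zfs.gpg'

# daily incremental against the previous day's snapshot
zfs send -i 2013-06-01 base/root@2013-06-02 \
    | gpg --batch --compress-algo ZLIB --sign --encrypt \
          --recipient 9FED5C6CE206B70A585770CA965522B9D49AE731 \
    | ssh root@botero.siccegge.de \
          'cat > /srv/backup/mitoraj/root@2013-06-02.from.2013-06-01.zfs.gpg'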
#!/usr/bin/python
# -*- coding: utf-8 -*-

####################
# Config
SFTP_HOST = 'botero.siccegge.de'
SFTP_DIR = '/srv/backup/mitoraj'
SFTP_USER = 'root'
ZPOOL = 'base'
GPGUSER = '9FED5C6CE206B70A585770CA965522B9D49AE731'
#
####################

import subprocess
import os.path
import sys
import paramiko

term = {
    'green':  "\033[0;32m",
    'red':    "\033[0;31m",
    'yellow': "\033[0;33m",
    'purple': "\033[0;35m",
    'none':   "\033[0m",
    }

sftp = None

def print_colored(data, color):
    sys.stdout.write(term[color])
    sys.stdout.write(data)
    sys.stdout.write(term['none'])
    sys.stdout.write('\n')
    sys.stdout.flush()

def postprocess_datasets(datasets):
    # group snapshot names ("root@2013-06-02") by device, sorted by date
    devices = set([entry.split('@')[0] for entry in datasets])
    result = dict()
    for device in devices:
        result[device] = sorted([entry.split('@')[1]
                                 for entry in datasets
                                 if entry.startswith(device)])
    return result

def sftp_connect():
    global sftp

    host_keys = paramiko.util.load_host_keys(os.path.expanduser('~/.ssh/known_hosts'))
    hostkeytype = host_keys[SFTP_HOST].keys()[0]
    hostkey = host_keys[SFTP_HOST][hostkeytype]

    agent = paramiko.Agent()
    transport = paramiko.Transport((SFTP_HOST, 22))
    transport.connect(hostkey=hostkey)

    # try every key the ssh agent offers until one is accepted
    for key in agent.get_keys():
        try:
            transport.auth_publickey(SFTP_USER, key)
            break
        except paramiko.SSHException:
            continue

    sftp = paramiko.SFTPClient.from_transport(transport)
    sftp.chdir(SFTP_DIR)

def sftp_send(dataset, reference=None):
    # build the zfs send command; the incremental source, if any,
    # goes before the snapshot name
    zfscommand = ['sudo', 'zfs', 'send']
    if reference is not None:
        zfscommand = zfscommand + ['-i', reference]
    zfscommand = zfscommand + ['%s/%s' % (ZPOOL, dataset)]

    zfs = subprocess.Popen(zfscommand, stdout=subprocess.PIPE)

    gpgcommand = ['gpg', '--batch', '--compress-algo', 'ZLIB',
                  '--sign', '--encrypt', '--recipient', GPGUSER]
    gpg = subprocess.Popen(gpgcommand, stdout=subprocess.PIPE,
                           stdin=zfs.stdout, stderr=subprocess.PIPE)

    gpg.poll()
    if gpg.returncode not in [None, 0]:
        print_colored("Error:\n\n" + gpg.stderr.read(), 'red')
        return

    if reference is None:
        filename = '%s.full.zfs.gpg' % dataset
    else:
        filename = '%s.from.%s.zfs.gpg' % (dataset, reference)

    with sftp.open(filename, 'w') as remotefile:
        sys.stdout.write(term['purple'])
        while True:
            junk = gpg.stdout.read(1024*1024)
            if len(junk) == 0:
                break
            sys.stdout.write('#')
            sys.stdout.flush()
            remotefile.write(junk)
        print_colored(" DONE", 'green')

def synchronize(local_datasets, remote_datasets):
    for device in local_datasets.keys():
        current = ""
        lastmonth = None
        for dataset in local_datasets[device]:
            last = current
            current = dataset
            if device in remote_datasets:
                if dataset in remote_datasets[device]:
                    print_colored("%s@%s -- found on remote server" % (device, dataset), 'yellow')
                    # still track the first snapshot of each month so the
                    # monthly incremental has a reference after a restart
                    if lastmonth is None or lastmonth[:7] != dataset[:7]:
                        lastmonth = dataset
                    continue
            if last == '':
                print_colored("Initial synchronization for device %s" % device, 'green')
                sftp_send("%s@%s" % (device, dataset))
                lastmonth = dataset
                continue
            if last[:7] == dataset[:7]:
                # same month (YYYY-MM): daily incremental against the
                # previous snapshot
                print_colored("%s@%s -- incremental backup (reference: %s)" % (device, dataset, last), 'green')
                sftp_send("%s@%s" % (device, dataset), last)
            else:
                # new month: full backup plus an incremental against the
                # first snapshot of the previous month
                print_colored("%s@%s -- full backup" % (device, dataset), 'green')
                sftp_send("%s@%s" % (device, dataset))
                print_colored("%s@%s -- doing incremental backup" % (device, dataset), 'green')
                sftp_send("%s@%s" % (device, dataset), lastmonth)
                lastmonth = dataset

def get_remote_datasets():
    datasets = sftp.listdir()
    datasets = filter(lambda x: '@' in x, datasets)
    # strip the ".full.zfs.gpg" / ".from.<reference>.zfs.gpg" suffix
    datasets = [entry.split('.')[0] for entry in datasets]
    return postprocess_datasets(datasets)

def get_local_datasets():
    datasets = subprocess.check_output(['sudo', 'zfs', 'list',
                                        '-t', 'snapshot', '-H', '-o', 'name'])
    datasets = datasets.strip().split('\n')
    # strip the pool prefix ("base/")
    datasets = [entry[len(ZPOOL) + 1:] for entry in datasets]
    return postprocess_datasets(datasets)

def main():
    sftp_connect()
    synchronize(get_local_datasets(), get_remote_datasets())

if __name__ == '__main__':
    main()
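Restoring isn't handled by the script; it boils down to decrypting the streams (gpg also verifies the signature in the process) and replaying them in order with zfs receive. A rough sketch with hypothetical file names and a hypothetical target pool "restore":

# replay the monthly full stream first ...
gpg --decrypt root@2013-06-01.full.zfs.gpg | sudo zfs receive restore/root
# ... then each incremental, in date order
gpg --decrypt root@2013-06-02.from.2013-06-01.zfs.gpg | sudo zfs receive restore/root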
Rumors have it that btrfs has gained functionality similar to zfs send, so maybe I'll be able to extend that code and use it on my Linux nodes some future day (after migrating to btrfs there for a start).
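Should that pan out, the building blocks would presumably look roughly like this (a completely untested sketch; the snapshot paths are made up):

# create a read-only snapshot, as btrfs send only accepts read-only sources
btrfs subvolume snapshot -r /home /snapshots/home-2013-06-02
# incremental stream against yesterday's snapshot, encrypted as before
btrfs send -p /snapshots/home-2013-06-01 /snapshots/home-2013-06-02 \
    | gpg --batch --compress-algo ZLIB --sign --encrypt \
          --recipient 9FED5C6CE206B70A585770CA965522B9D49AE731 \
    > home-2013-06-02.btrfs.gpg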