I've been working on my backup strategy for the notebook recently. The idea is to have full backups every month and incremental backups in between, as fine-grained as possible. As it's a mobile device there's no point in time where it is guaranteed to be up, connected and within reach of the backup server.
As I'm running Debian GNU/kFreeBSD on it, using ZFS and specifically
zfs send comes quite naturally. I'm now generating a
new file system snapshot every day (if the notebook happens to be
online during that day) using cron.
@daily  zfs snapshot base/root@`date -I`
@daily  zfs snapshot base/home@`date -I`
@reboot zfs snapshot base/root@`date -I`
@reboot zfs snapshot base/home@`date -I`
When connected to the home network I'm synchronizing all
incrementals that are not yet on the backup server. This is using
zfs send together with gpg to encrypt the
data and then put it off to some sftp storage. For the first
snapshot every month a full backup is created. As there doesn't seem
to be a way to merge zfs send streams without
importing everything in a zfs pool I create additional incremental
streams to the first snapshot of last month so I'm able to
delete older full backups and daily snapshots and still keep
coarse-grained backups for a longer period of time.
#!/usr/bin/python
# -*- coding: utf-8 -*-
####################
# Config
SFTP_HOST = 'botero.siccegge.de'
SFTP_DIR = '/srv/backup/mitoraj'
SFTP_USER = 'root'
ZPOOL = 'base'
GPGUSER = '9FED5C6CE206B70A585770CA965522B9D49AE731'
#
####################
import subprocess
import os.path
import sys
import paramiko
term = {
'green': "\033[0;32m",
'red': "\033[0;31m",
'yellow': "\033[0;33m",
'purple': "\033[0;35m",
'none': "\033[0m",
}
sftp = None
def print_colored(data, color):
sys.stdout.write(term[color])
sys.stdout.write(data)
sys.stdout.write(term['none'])
sys.stdout.write('\n')
sys.stdout.flush()
def postprocess_datasets(datasets):
devices = set([entry.split('@')[0] for entry in datasets])
result = dict()
for device in devices:
result[device] = sorted([ entry.split('@')[1] for entry in datasets
if entry.startswith(device) ])
return result
def sftp_connect():
global sftp
host_keys = paramiko.util.load_host_keys(os.path.expanduser('~/.ssh/known_hosts'))
hostkeytype = host_keys[SFTP_HOST].keys()[0]
hostkey = host_keys[SFTP_HOST][hostkeytype]
agent = paramiko.Agent()
transport = paramiko.Transport((SFTP_HOST, 22))
transport.connect(hostkey=hostkey)
for key in agent.get_keys():
try:
transport.auth_publickey(SFTP_USER, key)
break
except paramiko.SSHException:
continue
sftp = paramiko.SFTPClient.from_transport(transport)
sftp.chdir(SFTP_DIR)
def sftp_send(dataset, reference=None):
zfscommand = ['sudo', 'zfs', 'send', '%s/%s' % (ZPOOL, dataset)]
if reference is not None:
zfscommand = zfscommand + ['-i', reference]
zfs = subprocess.Popen(zfscommand, stdout=subprocess.PIPE)
gpgcommand = [ 'gpg', '--batch', '--compress-algo', 'ZLIB',
'--sign', '--encrypt', '--recipient', GPGUSER ]
gpg = subprocess.Popen(gpgcommand, stdout=subprocess.PIPE,
stdin=zfs.stdout,
stderr=subprocess.PIPE)
gpg.poll()
if gpg.returncode not in [None, 0]:
print_colored("Error:\n\n" + gpg.stderr, 'red')
return
if reference is None:
filename = '%s.full.zfs.gpg' % dataset
else:
filename = '%s.from.%s.zfs.gpg' % (dataset, reference)
with sftp.open(filename, 'w') as remotefile:
sys.stdout.write(term['purple'])
while True:
junk = gpg.stdout.read(1024*1024)
if len(junk) == 0:
break
sys.stdout.write('#')
sys.stdout.flush()
remotefile.write(junk)
print_colored(" DONE", 'green')
def syncronize(local_datasets, remote_datasets):
for device in local_datasets.keys():
current = ""
for dataset in local_datasets[device]:
last = current
current = dataset
if device in remote_datasets:
if dataset in remote_datasets[device]:
print_colored("%s@%s -- found on remote server" % (device, dataset), 'yellow')
continue
if last == '':
print_colored("Initial syncronization for device %s" % device, 'green')
sftp_send("%s@%s" % (device, dataset))
lastmonth = dataset
continue
if last[:7] == dataset[:7]:
print_colored("%s@%s -- incremental backup (reference: %s)" %
(device, dataset, last), 'green')
sftp_send("%s@%s" % (device, dataset), last)
else:
print_colored("%s@%s -- full backup" % (device, dataset), 'green')
sftp_send("%s@%s" % (device, dataset))
print_colored("%s@%s -- doing incremental backup" % (device, dataset), 'green')
sftp_send("%s@%s" % (device, dataset), lastmonth)
lastmonth = dataset
def get_remote_datasets():
datasets = sftp.listdir()
datasets = filter(lambda x: '@' in x, datasets)
datasets = [ entry.split('.')[0] for entry in datasets ]
return postprocess_datasets(datasets)
def get_local_datasets():
datasets = subprocess.check_output(['sudo', 'zfs', 'list', '-t', 'snapshot', '-H', '-o', 'name'])
datasets = datasets.strip().split('\n')
datasets = [ entry[5:] for entry in datasets ]
return postprocess_datasets(datasets)
def main():
sftp_connect()
syncronize(get_local_datasets(), get_remote_datasets())
if __name__ == '__main__':
main()
Rumors have it, btrfs has gained similar functionality to zfs
send so maybe I'll be able to extend that code and use it on
my linux nodes some future day (after migrating to btrfs there for a
start).