Chunker permutation #5070

Open · wants to merge 5 commits into base: master
8 changes: 6 additions & 2 deletions docs/internals/data-structures.rst
@@ -624,8 +624,9 @@ can be used to tune the chunker parameters, the default is:
- HASH_MASK_BITS = 21 (target chunk size ~= 2^21 B = 2 MiB)
- HASH_WINDOW_SIZE = 4095 [B] (`0xFFF`)

The buzhash table is altered by XORing it with a seed randomly generated once
for the repository, and stored encrypted in the keyfile. This is to prevent
The buzhash table is altered by XORing it with a seed and shuffling its
elements. The XOR seed and shuffle pattern are randomly generated once for
the repository, and stored encrypted in the keyfile. This is to prevent
chunk size based fingerprinting attacks on your encrypted repo contents (to
guess what files you have based on a specific set of chunk sizes).
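A minimal Python sketch of the table setup described here (illustrative only; the actual code is the change to `buzhash_init_table` in `_chunker.c` further down in this diff):

```python
# Sketch: personalize the public buzhash base table with the two repository
# secrets, a 32-bit XOR seed and a 256-byte shuffle pattern (permutation).
def personalized_table(table_base, seed, permutation):
    # table_base: 256 public 32-bit constants; permutation: bytes of length 256
    return [table_base[permutation[i]] ^ seed for i in range(256)]
```

Two repositories with different seeds or permutations thus run completely different rolling-hash tables over the same data, so their chunk boundaries (and hence chunk sizes) differ.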

@@ -901,6 +902,9 @@ id_key
chunk_seed
the seed for the buzhash chunking table (signed 32 bit integer)

chunk_permutation
the permutation for shuffling the buzhash table (256 bytes)

These fields are packed using msgpack_. The utf-8 encoded passphrase
is processed with PBKDF2_ (SHA256_, 100000 iterations, random 256 bit salt)
to derive a 256 bit key encryption key (KEK).
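For illustration, the KEK derivation described here can be reproduced with the standard library alone (a sketch with a made-up passphrase; the real code lives in `borg.crypto.key`):

```python
import hashlib
import os

passphrase = "correct horse battery staple"   # example value only
salt = os.urandom(32)                         # random 256-bit salt
# PBKDF2-HMAC-SHA256 with 100000 iterations yields the 256-bit KEK
kek = hashlib.pbkdf2_hmac('sha256', passphrase.encode('utf-8'), salt, 100000, dklen=32)
assert len(kek) == 32
```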
4 changes: 2 additions & 2 deletions docs/internals/security.rst
@@ -407,8 +407,8 @@ buzhash chunker
+++++++++++++++

The buzhash chunker chunks according to the input data, the chunker's
parameters and the secret chunker seed (which all influence the chunk boundary
positions).
parameters and the secret chunker seed and permutation (which all influence the
chunk boundary positions).

Small files below some specific threshold (default: 512 KiB) result in only one
chunk (identical content / size as the original file), bigger files result in
9 changes: 5 additions & 4 deletions src/borg/_chunker.c
@@ -68,13 +68,13 @@ static uint32_t table_base[] =
size_t pagemask;

static uint32_t *
buzhash_init_table(uint32_t seed)
buzhash_init_table(uint32_t seed, unsigned char *permutation)
{
int i;
uint32_t *table = malloc(1024);
for(i = 0; i < 256; i++)
{
table[i] = table_base[i] ^ seed;
table[i] = table_base[permutation[i]] ^ seed;
}
return table;
}
@@ -112,13 +112,14 @@ typedef struct {
} Chunker;

static Chunker *
chunker_init(size_t window_size, uint32_t chunk_mask, size_t min_size, size_t max_size, uint32_t seed)
chunker_init(size_t window_size, uint32_t chunk_mask, size_t min_size, size_t max_size, uint32_t seed,
unsigned char *permutation)
{
Chunker *c = calloc(sizeof(Chunker), 1);
c->window_size = window_size;
c->chunk_mask = chunk_mask;
c->min_size = min_size;
c->table = buzhash_init_table(seed);
c->table = buzhash_init_table(seed, permutation);
c->buf_size = max_size;
c->data = malloc(c->buf_size);
c->fh = -1;
7 changes: 4 additions & 3 deletions src/borg/archive.py
@@ -320,7 +320,7 @@ def __init__(self, key, chunker_params=ITEMS_CHUNKER_PARAMS):
self.packer = msgpack.Packer()
self.chunks = []
self.key = key
self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed)
self.chunker = get_chunker(*chunker_params, seed=self.key.chunk_seed, permutation=self.key.chunk_permutation)

def add(self, item):
self.buffer.write(self.packer.pack(item.as_dict()))
@@ -1178,7 +1178,7 @@ def __init__(self, *, metadata_collector, cache, key,
self.hard_links = {}
self.stats = Statistics() # threading: done by cache (including progress)
self.cwd = os.getcwd()
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed)
self.chunker = get_chunker(*chunker_params, seed=key.chunk_seed, permutation=key.chunk_permutation)

@contextmanager
def create_helper(self, path, st, status=None, hardlinkable=True):
@@ -2102,7 +2102,8 @@ def create_target(self, archive, target_name=None):
cache=self.cache, key=self.key,
add_item=target.add_item, write_checkpoint=target.write_checkpoint,
checkpoint_interval=self.checkpoint_interval, rechunkify=target.recreate_rechunkify).process_file_chunks
target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed)
target.chunker = get_chunker(*target.chunker_params, seed=self.key.chunk_seed,
permutation=self.key.chunk_permutation)
return target

def create_target_archive(self, name):
30 changes: 23 additions & 7 deletions src/borg/chunker.pyx
@@ -10,14 +10,19 @@ cdef extern from "_chunker.c":
ctypedef int uint32_t
ctypedef struct _Chunker "Chunker":
pass
_Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size, uint32_t seed)
_Chunker *chunker_init(int window_size, int chunk_mask, int min_size, int max_size,
uint32_t seed, unsigned char *permutation)
void chunker_set_fd(_Chunker *chunker, object f, int fd)
void chunker_free(_Chunker *chunker)
object chunker_process(_Chunker *chunker)
uint32_t *buzhash_init_table(uint32_t seed)
uint32_t *buzhash_init_table(uint32_t seed, unsigned char *permutation)
uint32_t c_buzhash "buzhash"(unsigned char *data, size_t len, uint32_t *h)
uint32_t c_buzhash_update "buzhash_update"(uint32_t sum, unsigned char remove, unsigned char add, size_t len, uint32_t *h)

# The identity permutation of input by bytes, useful for maintaining
# backward compatibility with interfaces defined before input byte
# permutations were introduced.
null_permutation = bytes(range(256))

class ChunkerFixed:
"""
@@ -94,13 +99,14 @@ cdef class Chunker:
"""
cdef _Chunker *chunker

def __cinit__(self, int seed, int chunk_min_exp, int chunk_max_exp, int hash_mask_bits, int hash_window_size):
def __cinit__(self, int seed, unsigned char *permutation, int chunk_min_exp, int chunk_max_exp,
int hash_mask_bits, int hash_window_size):
min_size = 1 << chunk_min_exp
max_size = 1 << chunk_max_exp
# see chunker_process, first while loop condition, first term must be able to get True:
assert hash_window_size + min_size + 1 <= max_size, "too small max_size"
hash_mask = (1 << hash_mask_bits) - 1
self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff)
self.chunker = chunker_init(hash_window_size, hash_mask, min_size, max_size, seed & 0xffffffff, permutation)

def chunkify(self, fd, fh=-1):
"""
@@ -127,7 +133,8 @@ def get_chunker(algo, *params, **kw):
def get_chunker(algo, *params, **kw):
if algo == 'buzhash':
seed = kw['seed']
return Chunker(seed, *params)
perm = kw.get('permutation') or null_permutation
return Chunker(seed, perm, *params)
if algo == 'fixed':
return ChunkerFixed(*params)
raise TypeError('unsupported chunker algo %r' % algo)
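Based on the call sites changed in `archive.py`, usage looks roughly like this (the numeric parameters are assumed to be borg's usual buzhash defaults, shown only for illustration):

```python
# Old-style call without a permutation still works; kw.get('permutation')
# falls back to the identity ordering (null_permutation):
chunker = get_chunker('buzhash', 19, 23, 21, 4095, seed=key.chunk_seed)

# Keys carrying a chunk_permutation pass it through as well:
chunker = get_chunker('buzhash', 19, 23, 21, 4095,
                      seed=key.chunk_seed, permutation=key.chunk_permutation)
with open('some/file', 'rb') as f:
    for chunk in chunker.chunkify(f):
        ...  # process each chunk
```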
@@ -143,17 +150,26 @@ def max_chunk_size(algo, *params):


def buzhash(data, unsigned long seed):
return buzhash_perm(data, seed, null_permutation)


def buzhash_perm(data, unsigned long seed, unsigned char *permutation):
cdef uint32_t *table
cdef uint32_t sum
table = buzhash_init_table(seed & 0xffffffff)
table = buzhash_init_table(seed & 0xffffffff, permutation)
sum = c_buzhash(<const unsigned char *> data, len(data), table)
free(table)
return sum


def buzhash_update(uint32_t sum, unsigned char remove, unsigned char add, size_t len, unsigned long seed):
return buzhash_update_perm(sum, remove, add, len, seed, null_permutation)


def buzhash_update_perm(uint32_t sum, unsigned char remove, unsigned char add, size_t len,
unsigned long seed, unsigned char *permutation):
cdef uint32_t *table
table = buzhash_init_table(seed & 0xffffffff)
table = buzhash_init_table(seed & 0xffffffff, permutation)
sum = c_buzhash_update(sum, remove, add, len, table)
free(table)
return sum
41 changes: 40 additions & 1 deletion src/borg/crypto/key.py
@@ -161,6 +161,10 @@ class KeyBase:
# type: int
chunk_seed = None

# The input byte permutation for the buzhash chunker
# type: bytes
chunk_permutation = None

# Whether this *particular instance* is encrypted from a practical point of view,
# i.e. when it's using encryption with an empty passphrase, then
# that may be *technically* called encryption, but for all intents and purposes
@@ -266,6 +270,7 @@ class PlaintextKey(KeyBase):
STORAGE = KeyBlobStorage.NO_STORAGE

chunk_seed = 0
chunk_permutation = None
logically_encrypted = False

def __init__(self, repository):
@@ -345,6 +350,35 @@ def id_hash(self, data):
return hmac_sha256(self.id_key, data)


def _derive_byte_permutation(key_material):
"""
Derive a 256-byte permutation table from the key material

There are 256! possible permutations of a byte-indexed table, and
we want to make an unbiased choice. Since 256! is just under 2^1684
(it's 0xFF578F....) we derive 1684 pseudorandom bits from the key
material and treat it as a single large integer. There's only a 1 in
350 chance that this integer is >= 256!, in which case we try again.
"""
for attempt in range(10):
context = b"chunker input byte permutation, attempt %d" % attempt
key = hkdf_hmac_sha512(key_material, None, context, 211)
pool = int.from_bytes(key, "big")
pool >>= 4 # 211 bytes is 1688 bits, 4 bits more than we want
perm = list(range(256))
for i in range(256):
pool, offset = divmod(pool, 256-i)
j = i + offset
perm[i], perm[j] = perm[j], perm[i]

if pool == 0:
# the pool value was less than 256!, we have an unbiased choice
return bytes(perm)
[Review comment, Member] could also just be `break`.


# we're very unlikely to fall through to here. Just accept the biased permutation
return bytes(perm)
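A quick, standalone check of the arithmetic in the docstring (standard library only; independent of borg):

```python
import math

fact = math.factorial(256)
assert fact.bit_length() == 1684           # 256! is indeed just under 2^1684
assert hex(fact).startswith('0xff57')      # consistent with the 0xFF578F... above
# odds that a single 1684-bit draw is >= 256! and has to be retried:
retry_odds = 2 ** 1684 // (2 ** 1684 - fact)
assert retry_odds >= 350                   # i.e. at most a "1 in 350" chance, as stated
# ten rejected attempts in a row are then astronomically unlikely
```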


class AESKeyBase(KeyBase):
"""
Common base class shared by KeyfileKey and PassphraseKey
@@ -388,14 +422,17 @@ def decrypt(self, id, data, decompress=True):

def init_from_random_data(self, data=None):
if data is None:
data = os.urandom(100)
data = os.urandom(132)
self.enc_key = data[0:32]
self.enc_hmac_key = data[32:64]
self.id_key = data[64:96]
self.chunk_seed = bytes_to_int(data[96:100])
# Convert to signed int32
if self.chunk_seed & 0x80000000:
self.chunk_seed = self.chunk_seed - 0xffffffff - 1
if len(data) >= 132:
chunk_key = data[100:132]
self.chunk_permutation = _derive_byte_permutation(chunk_key)
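For reference, the unsigned-to-signed int32 conversion above behaves like this (a standalone illustration, not part of the change):

```python
for raw in (0x00000001, 0x7fffffff, 0x80000000, 0xffffffff):
    seed = raw
    if seed & 0x80000000:                 # same two's-complement fixup as above
        seed = seed - 0xffffffff - 1
    print(f'{raw:#010x} -> {seed}')
# 0x00000001 -> 1
# 0x7fffffff -> 2147483647
# 0x80000000 -> -2147483648
# 0xffffffff -> -1
```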

def init_ciphers(self, manifest_data=None):
self.cipher = self.CIPHERSUITE(mac_key=self.enc_hmac_key, enc_key=self.enc_key, header_len=1, aad_offset=1)
@@ -620,6 +657,7 @@ def _load(self, key_data, passphrase):
self.enc_hmac_key = key.enc_hmac_key
self.id_key = key.id_key
self.chunk_seed = key.chunk_seed
self.chunk_permutation = key.get('chunk_permutation')
self.tam_required = key.get('tam_required', tam_required(self.repository))
return True
return False
@@ -660,6 +698,7 @@ def _save(self, passphrase):
enc_hmac_key=self.enc_hmac_key,
id_key=self.id_key,
chunk_seed=self.chunk_seed,
chunk_permutation=self.chunk_permutation,
tam_required=self.tam_required,
)
data = self.encrypt_key_file(msgpack.packb(key.as_dict()), passphrase)
4 changes: 3 additions & 1 deletion src/borg/item.pyx
@@ -318,7 +318,8 @@ class Key(PropDict):
If a Key shall be serialized, give as_dict() method output to msgpack packer.
"""

VALID_KEYS = {'version', 'repository_id', 'enc_key', 'enc_hmac_key', 'id_key', 'chunk_seed', 'tam_required'} # str-typed keys
VALID_KEYS = {'version', 'repository_id', 'enc_key', 'enc_hmac_key', 'id_key', 'chunk_seed',
'chunk_permutation', 'tam_required'} # str-typed keys

__slots__ = ("_dict", ) # avoid setting attributes not supported by properties

@@ -328,6 +329,7 @@ class Key(PropDict):
enc_hmac_key = PropDict._make_property('enc_hmac_key', bytes)
id_key = PropDict._make_property('id_key', bytes)
chunk_seed = PropDict._make_property('chunk_seed', int)
chunk_permutation = PropDict._make_property('chunk_permutation', bytes)
tam_required = PropDict._make_property('tam_required', bool)


2 changes: 1 addition & 1 deletion src/borg/selftest.py
@@ -30,7 +30,7 @@
ChunkerTestCase,
]

SELFTEST_COUNT = 37
SELFTEST_COUNT = 38


class SelfTestResult(TestResult):
52 changes: 40 additions & 12 deletions src/borg/testsuite/chunker.py
@@ -1,13 +1,25 @@
from io import BytesIO

from ..chunker import ChunkerFixed, Chunker, get_chunker, buzhash, buzhash_update
from ..chunker import ChunkerFixed, Chunker, get_chunker, buzhash, buzhash_perm, buzhash_update, buzhash_update_perm
from ..constants import * # NOQA
from . import BaseTestCase

# Note: these tests are part of the self test, do not use or import py.test functionality here.
# See borg.selftest for details. If you add/remove test methods, update SELFTEST_COUNT


null_permutation = bytes(range(256))


def permutation_invert_case():
perm = list(range(256))
for up in "ABCDEFGHIJKLMNOPQRSTUVWXYZ":
low = up.lower()
perm[ord(low)] = ord(up)
perm[ord(up)] = ord(low)
return bytes(perm)


class ChunkerFixedTestCase(BaseTestCase):

def test_chunkify_just_blocks(self):
@@ -26,20 +38,21 @@ def test_chunkify_header_and_blocks(self):
class ChunkerTestCase(BaseTestCase):

def test_chunkify(self):
np = null_permutation
data = b'0' * int(1.5 * (1 << CHUNK_MAX_EXP)) + b'Y'
parts = [bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
parts = [bytes(c) for c in Chunker(0, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(data))]
self.assert_equal(len(parts), 2)
self.assert_equal(b''.join(parts), data)
self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
self.assert_equal([bytes(c) for c in Chunker(0, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in Chunker(1, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(1, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(1, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b''))], [])
self.assert_equal([bytes(c) for c in Chunker(0, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fooba', b'rboobaz', b'fooba', b'rboobaz', b'fooba', b'rboobaz'])
self.assert_equal([bytes(c) for c in Chunker(1, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'fo', b'obarb', b'oob', b'azf', b'oobarb', b'oob', b'azf', b'oobarb', b'oobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, np, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'ar', b'boobazfoob', b'ar', b'boobazfoob', b'ar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, np, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(1, np, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobar', b'boobazfo', b'obar', b'boobazfo', b'obar', b'boobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, np, 2, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foob', b'arboobaz', b'foob', b'arboobaz', b'foob', b'arboobaz'])
self.assert_equal([bytes(c) for c in Chunker(0, np, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz' * 3])
self.assert_equal([bytes(c) for c in Chunker(1, np, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarbo', b'obazfoobar', b'boobazfo', b'obarboobaz'])
self.assert_equal([bytes(c) for c in Chunker(2, np, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])

def test_buzhash(self):
self.assert_equal(buzhash(b'abcdefghijklmnop', 0), 3795437769)
@@ -48,6 +61,21 @@ def test_buzhash(self):
# Test with more than 31 bytes to make sure our barrel_shift macro works correctly
self.assert_equal(buzhash(b'abcdefghijklmnopqrstuvwxyzabcdefghijklmnopqrstuvwxyz', 0), 566521248)

def test_permutation(self):
p = permutation_invert_case()

# a non-null permutation should spoil these test cases copied from the methods above
self.assert_not_equal([bytes(c) for c in Chunker(2, p, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'foobarboobaz' * 3))], [b'foobarboobaz', b'foobarboobaz', b'foobarboobaz'])
self.assert_not_equal(buzhash_perm(b'abcdefghijklmnop', 0, p), 3795437769)

# inverting the case of the input should compensate for the permutation
self.assert_equal([bytes(c) for c in Chunker(0, p, 1, CHUNK_MAX_EXP, 2, 2).chunkify(BytesIO(b'FOOBARBOOBAZ' * 3))], [b'FOOBA', b'RBOOBAZ', b'FOOBA', b'RBOOBAZ', b'FOOBA', b'RBOOBAZ'])
self.assert_equal([bytes(c) for c in Chunker(2, p, 3, CHUNK_MAX_EXP, 2, 3).chunkify(BytesIO(b'FOOBARBOOBAZ' * 3))], [b'FOOBARBOOBAZ', b'FOOBARBOOBAZ', b'FOOBARBOOBAZ'])
self.assert_equal(buzhash_perm(b'ABCDEFGHIJKLMNOP', 0, p), 3795437769)
self.assert_equal(buzhash_perm(b'ABCDEFGHIJKLMNOP', 1, p), 3795400502)
self.assert_equal(buzhash_perm(b'ABCDEFGHIJKLMNOP', 1, p),
buzhash_update_perm(buzhash_perm(b'xABCDEFGHIJKLMNO', 1, p), ord('x'), ord('P'), 16, 1, p))

def test_small_reads(self):
class SmallReadFile:
input = b'a' * (20 + 1)