Skip to content

Commit

Permalink
Merge pull request #4097 from tybug/draw-bytes-min-max-size-2
Browse files Browse the repository at this point in the history
Add support for variable-width bytes in the IR
  • Loading branch information
Zac-HD authored Sep 5, 2024
2 parents eaafdfc + 531bdf0 commit 342017a
Show file tree
Hide file tree
Showing 22 changed files with 322 additions and 208 deletions.
3 changes: 3 additions & 0 deletions hypothesis-python/RELEASE.rst
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
RELEASE_TYPE: minor

This release adds support for variable-width bytes in our IR layer (:issue:`3921`), which should mean improved performance anywhere you use :func:`~hypothesis.strategies.binary`. If you maintain an alternative backend as part of our (for now explicitly unstable) :ref:`alternative-backends`, this release changes the ``draw_*`` interface and may be a breaking change for you.
86 changes: 58 additions & 28 deletions hypothesis-python/src/hypothesis/internal/conjecture/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,12 @@ class FloatKWargs(TypedDict):
class StringKWargs(TypedDict):
    """IR kwargs describing a single string draw."""

    intervals: IntervalSet
    min_size: int
    # Always an int: "unbounded" draws use COLLECTION_DEFAULT_MAX_SIZE
    # instead of None.
    max_size: int


class BytesKWargs(TypedDict):
    """IR kwargs describing a single bytes draw.

    Bytes are variable-width: the drawn value has a length between
    min_size and max_size inclusive.
    """

    min_size: int
    max_size: int


class BooleanKWargs(TypedDict):
Expand Down Expand Up @@ -206,7 +207,7 @@ def structural_coverage(label: int) -> StructuralCoverageTag:
FLOAT_INIT_LOGIC_CACHE = LRUCache(4096)
POOLED_KWARGS_CACHE = LRUCache(4096)

DRAW_STRING_DEFAULT_MAX_SIZE = 10**10 # "arbitrarily large"
COLLECTION_DEFAULT_MAX_SIZE = 10**10 # "arbitrarily large"


class Example:
Expand Down Expand Up @@ -1036,7 +1037,7 @@ def trivial(self):
return self.value == (minimal_char * self.kwargs["min_size"])
if self.ir_type == "bytes":
# smallest size and all-zero value.
return len(self.value) == self.kwargs["size"] and not any(self.value)
return len(self.value) == self.kwargs["min_size"] and not any(self.value)

raise NotImplementedError(f"unhandled ir_type {self.ir_type}")

Expand Down Expand Up @@ -1095,7 +1096,9 @@ def ir_value_permitted(value, ir_type, kwargs):
return False
return all(ord(c) in kwargs["intervals"] for c in value)
elif ir_type == "bytes":
return len(value) == kwargs["size"]
if len(value) < kwargs["min_size"]:
return False
return kwargs["max_size"] is None or len(value) <= kwargs["max_size"]
elif ir_type == "boolean":
if kwargs["p"] <= 2 ** (-64):
return value is False
Expand Down Expand Up @@ -1314,15 +1317,20 @@ def draw_string(
intervals: IntervalSet,
*,
min_size: int = 0,
max_size: Optional[int] = None,
max_size: int = COLLECTION_DEFAULT_MAX_SIZE,
forced: Optional[str] = None,
fake_forced: bool = False,
) -> str:
raise NotImplementedError

@abc.abstractmethod
def draw_bytes(
    self,
    min_size: int = 0,
    max_size: int = COLLECTION_DEFAULT_MAX_SIZE,
    *,
    forced: Optional[bytes] = None,
    fake_forced: bool = False,
) -> bytes:
    """Draw a bytes value whose length is in [min_size, max_size].

    When ``forced`` is not None, implementations must return exactly that
    value (``fake_forced`` adjusts how the forced draw is recorded —
    NOTE(review): exact semantics live in the concrete providers).
    """
    raise NotImplementedError

Expand Down Expand Up @@ -1606,14 +1614,10 @@ def draw_string(
intervals: IntervalSet,
*,
min_size: int = 0,
max_size: Optional[int] = None,
max_size: int = COLLECTION_DEFAULT_MAX_SIZE,
forced: Optional[str] = None,
fake_forced: bool = False,
) -> str:
if max_size is None:
max_size = DRAW_STRING_DEFAULT_MAX_SIZE

assert forced is None or min_size <= len(forced) <= max_size
assert self._cd is not None

average_size = min(
Expand Down Expand Up @@ -1663,17 +1667,40 @@ def draw_string(
return "".join(chars)

def draw_bytes(
    self,
    min_size: int = 0,
    max_size: int = COLLECTION_DEFAULT_MAX_SIZE,
    *,
    forced: Optional[bytes] = None,
    fake_forced: bool = False,
) -> bytes:
    """Draw a bytes value with length in [min_size, max_size].

    The length is chosen via ``many`` (biased towards ``average_size``) and
    each byte is then drawn as one 8-bit integer. When ``forced`` is given,
    the length and every individual byte are forced to match it.
    """
    # same length invariant that draw_string asserts for its forced value.
    assert forced is None or min_size <= len(forced) <= max_size
    assert self._cd is not None

    buf = bytearray()
    # Aim a little above min_size, but never more than halfway towards
    # max_size, so the default (huge) max_size stays cheap to draw.
    average_size = min(
        max(min_size * 2, min_size + 5),
        0.5 * (min_size + max_size),
    )
    elements = many(
        self._cd,
        min_size=min_size,
        max_size=max_size,
        average_size=average_size,
        forced=None if forced is None else len(forced),
        fake_forced=fake_forced,
        observe=False,
    )
    while elements.more():
        forced_i: Optional[int] = None
        if forced is not None:
            # implicit conversion from bytes to int by indexing here
            forced_i = forced[elements.count - 1]

        buf += self._cd.draw_bits(
            8, forced=forced_i, fake_forced=fake_forced
        ).to_bytes(1, "big")

    return bytes(buf)

def _draw_float(
self,
Expand Down Expand Up @@ -2216,12 +2243,13 @@ def draw_string(
intervals: IntervalSet,
*,
min_size: int = 0,
max_size: Optional[int] = None,
max_size: int = COLLECTION_DEFAULT_MAX_SIZE,
forced: Optional[str] = None,
fake_forced: bool = False,
observe: bool = True,
) -> str:
assert forced is None or min_size <= len(forced)
assert forced is None or min_size <= len(forced) <= max_size
assert min_size >= 0

kwargs: StringKWargs = self._pooled_kwargs(
"string",
Expand Down Expand Up @@ -2255,17 +2283,19 @@ def draw_string(

def draw_bytes(
self,
# TODO move to min_size and max_size here.
size: int,
min_size: int = 0,
max_size: int = COLLECTION_DEFAULT_MAX_SIZE,
*,
forced: Optional[bytes] = None,
fake_forced: bool = False,
observe: bool = True,
) -> bytes:
assert forced is None or len(forced) == size
assert size >= 0
assert forced is None or min_size <= len(forced) <= max_size
assert min_size >= 0

kwargs: BytesKWargs = self._pooled_kwargs("bytes", {"size": size})
kwargs: BytesKWargs = self._pooled_kwargs(
"bytes", {"min_size": min_size, "max_size": max_size}
)

if self.ir_tree_nodes is not None and observe:
node_value = self._pop_ir_tree_node("bytes", kwargs, forced=forced)
Expand Down
71 changes: 35 additions & 36 deletions hypothesis-python/src/hypothesis/internal/conjecture/datatree.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,9 +146,31 @@ def _repr_pretty_(self, p, cycle):
MAX_CHILDREN_EFFECTIVELY_INFINITE = 100_000


def compute_max_children(ir_type, kwargs):
from hypothesis.internal.conjecture.data import DRAW_STRING_DEFAULT_MAX_SIZE
def _count_distinct_strings(*, alphabet_size, min_size, max_size):
# We want to estimate if we're going to have more children than
# MAX_CHILDREN_EFFECTIVELY_INFINITE, without computing a potentially
# extremely expensive pow. We'll check if the number of strings in
# the largest string size alone is enough to put us over this limit.
# We'll also employ a trick of estimating against log, which is cheaper
# than computing a pow.
#
# x = max_size
# y = alphabet_size
# n = MAX_CHILDREN_EFFECTIVELY_INFINITE
#
# x**y > n
# <=> log(x**y) > log(n)
# <=> y * log(x) > log(n)
definitely_too_large = max_size * math.log(alphabet_size) > math.log(
MAX_CHILDREN_EFFECTIVELY_INFINITE
)
if definitely_too_large:
return MAX_CHILDREN_EFFECTIVELY_INFINITE

return sum(alphabet_size**k for k in range(min_size, max_size + 1))


def compute_max_children(ir_type, kwargs):
if ir_type == "integer":
min_value = kwargs["min_value"]
max_value = kwargs["max_value"]
Expand Down Expand Up @@ -178,50 +200,27 @@ def compute_max_children(ir_type, kwargs):
return 1
return 2
elif ir_type == "bytes":
return 2 ** (8 * kwargs["size"])
return _count_distinct_strings(
alphabet_size=2**8, min_size=kwargs["min_size"], max_size=kwargs["max_size"]
)
elif ir_type == "string":
min_size = kwargs["min_size"]
max_size = kwargs["max_size"]
intervals = kwargs["intervals"]

if max_size is None:
max_size = DRAW_STRING_DEFAULT_MAX_SIZE

if len(intervals) == 0:
# Special-case the empty alphabet to avoid an error in math.log(0).
# Only possibility is the empty string.
return 1

# We want to estimate if we're going to have more children than
# MAX_CHILDREN_EFFECTIVELY_INFINITE, without computing a potentially
# extremely expensive pow. We'll check if the number of strings in
# the largest string size alone is enough to put us over this limit.
# We'll also employ a trick of estimating against log, which is cheaper
# than computing a pow.
#
# x = max_size
# y = len(intervals)
# n = MAX_CHILDREN_EFFECTIVELY_INFINITE
#
# x**y > n
# <=> log(x**y) > log(n)
# <=> y * log(x) > log(n)

# avoid math.log(1) == 0 and incorrectly failing the below estimate,
# even when we definitely are too large.
if len(intervals) == 1:
definitely_too_large = max_size > MAX_CHILDREN_EFFECTIVELY_INFINITE
else:
definitely_too_large = max_size * math.log(len(intervals)) > math.log(
MAX_CHILDREN_EFFECTIVELY_INFINITE
)

if definitely_too_large:
# avoid math.log(1) == 0 and incorrectly failing our effectively_infinite
# estimate, even when we definitely are too large.
if len(intervals) == 1 and max_size > MAX_CHILDREN_EFFECTIVELY_INFINITE:
return MAX_CHILDREN_EFFECTIVELY_INFINITE

# number of strings of length k, for each k in [min_size, max_size].
return sum(len(intervals) ** k for k in range(min_size, max_size + 1))

return _count_distinct_strings(
alphabet_size=len(intervals), min_size=min_size, max_size=max_size
)
elif ir_type == "float":
min_value = kwargs["min_value"]
max_value = kwargs["max_value"]
Expand Down Expand Up @@ -306,8 +305,8 @@ def all_children(ir_type, kwargs):
else:
yield from [False, True]
if ir_type == "bytes":
size = kwargs["size"]
yield from (int_to_bytes(i, size) for i in range(2 ** (8 * size)))
for size in range(kwargs["min_size"], kwargs["max_size"] + 1):
yield from (int_to_bytes(i, size) for i in range(2 ** (8 * size)))
if ir_type == "string":
min_size = kwargs["min_size"]
max_size = kwargs["max_size"]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -1075,10 +1075,9 @@ def try_shrinking_nodes(self, nodes, n):
return False # pragma: no cover

if node.ir_type in {"string", "bytes"}:
size_kwarg = "min_size" if node.ir_type == "string" else "size"
# if the size *increased*, we would have to guess what to pad with
# in order to try fixing up this attempt. Just give up.
if node.kwargs[size_kwarg] <= attempt_kwargs[size_kwarg]:
if node.kwargs["min_size"] <= attempt_kwargs["min_size"]:
return False
# the size decreased in our attempt. Try again, but replace with
# the min_size that we would have gotten, and truncate the value
Expand All @@ -1089,7 +1088,7 @@ def try_shrinking_nodes(self, nodes, n):
initial_attempt[node.index].copy(
with_kwargs=attempt_kwargs,
with_value=initial_attempt[node.index].value[
: attempt_kwargs[size_kwarg]
: attempt_kwargs["min_size"]
],
)
]
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -8,17 +8,16 @@
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

from hypothesis.internal.compat import int_from_bytes, int_to_bytes
from hypothesis.internal.conjecture.shrinking.collection import Collection
from hypothesis.internal.conjecture.shrinking.integer import Integer


class Bytes(Collection):
    """Shrinker for variable-width bytes values.

    Delegates to the generic Collection shrinker, treating the bytes as a
    sequence of integers in [0, 255]: b"\\x00\\x02" can shrink either
    element-wise (towards b"\\x00\\x01") or by dropping elements (towards
    b"\\x02").
    """

    def __init__(self, initial, predicate, **kwargs):
        super().__init__(
            # implicit conversion from bytes to list of integers here
            list(initial),
            lambda val: predicate(bytes(val)),
            ElementShrinker=Integer,
            **kwargs,
        )
Loading

0 comments on commit 342017a

Please sign in to comment.