扫码登录,获取cookies

This commit is contained in:
2026-03-09 16:10:29 +08:00
parent 754e720ba7
commit 8229208165
7775 changed files with 1150053 additions and 208 deletions

View File

@@ -0,0 +1,9 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.

View File

@@ -0,0 +1,160 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
from collections import defaultdict
from random import Random
from typing import Callable, Dict, Iterable, List, Optional, Sequence
from hypothesis.internal.conjecture.junkdrawer import LazySequenceCopy, pop_random
def prefix_selection_order(
    prefix: Sequence[int],
) -> Callable[[int, int], Iterable[int]]:
    """Select choices starting from ``prefix``,
    preferring to move left then wrapping around
    to the right.

    Returns a function ``selection_order(depth, n)`` that yields every
    index in ``range(n)`` exactly once: first ``prefix[depth]`` (clamped
    to ``n - 1``) counting down to 0, then ``n - 1`` counting down to just
    above the starting point. Depths beyond ``len(prefix)`` simply iterate
    right-to-left.
    """

    def selection_order(depth: int, n: int) -> Iterable[int]:
        if depth < len(prefix):
            i = prefix[depth]
            # Clamp out-of-range prefix entries to the last valid index.
            if i >= n:
                i = n - 1
            yield from range(i, -1, -1)
            yield from range(n - 1, i, -1)
        else:
            # Past the end of the prefix there is no preferred start point.
            yield from range(n - 1, -1, -1)

    return selection_order
def random_selection_order(random: Random) -> Callable[[int, int], Iterable[int]]:
    """Select choices uniformly at random.

    Returns a selection-order function compatible with
    ``prefix_selection_order``: it ignores ``depth`` and yields the
    indices ``0 .. n - 1`` exactly once each, in random order drawn
    from ``random``.
    """

    def selection_order(depth: int, n: int) -> Iterable[int]:
        # LazySequenceCopy avoids materialising range(n) up front;
        # pop_random removes and returns a random remaining element.
        pending = LazySequenceCopy(range(n))
        while pending:
            yield pop_random(random, pending)

    return selection_order
class Chooser:
    """A source of nondeterminism for use in shrink passes."""

    def __init__(
        self,
        tree: "ChoiceTree",
        selection_order: Callable[[int, int], Iterable[int]],
    ):
        # Order in which candidate indices are tried at each depth.
        self.__selection_order = selection_order
        # Path of tree nodes visited so far; always exactly one longer
        # than __choices (it includes the root).
        self.__node_trail = [tree.root]
        # Index chosen at each level of the trail.
        self.__choices: "List[int]" = []
        # Set by finish(); no further choices are allowed afterwards.
        self.__finished = False

    def choose(
        self,
        values: Sequence[int],
        condition: Callable[[int], bool] = lambda x: True,
    ) -> int:
        """Return some element of values satisfying the condition
        that will not lead to an exhausted branch, or raise DeadBranch
        if no such element exists.
        """
        assert not self.__finished
        node = self.__node_trail[-1]
        if node.live_child_count is None:
            # First visit to this node: all len(values) children are live.
            node.live_child_count = len(values)
            node.n = len(values)
        assert node.live_child_count > 0 or len(values) == 0
        for i in self.__selection_order(len(self.__choices), len(values)):
            if node.live_child_count == 0:
                break
            if not node.children[i].exhausted:
                v = values[i]
                if condition(v):
                    self.__choices.append(i)
                    self.__node_trail.append(node.children[i])
                    return v
                else:
                    # Rejected by the condition: mark this child dead so
                    # it is never offered again.
                    node.children[i] = DeadNode
                    node.live_child_count -= 1
        assert node.live_child_count == 0
        raise DeadBranch

    def finish(self) -> Sequence[int]:
        """Record the decisions made in the underlying tree and return
        a prefix that can be used for the next Chooser to be used."""
        self.__finished = True
        assert len(self.__node_trail) == len(self.__choices) + 1
        result = tuple(self.__choices)
        # The node we stopped at is now fully explored.
        self.__node_trail[-1].live_child_count = 0
        # Propagate exhaustion upwards, replacing exhausted subtrees with
        # the DeadNode sentinel so their memory can be reclaimed.
        while len(self.__node_trail) > 1 and self.__node_trail[-1].exhausted:
            self.__node_trail.pop()
            assert len(self.__node_trail) == len(self.__choices)
            i = self.__choices.pop()
            target = self.__node_trail[-1]
            target.children[i] = DeadNode
            assert target.live_child_count is not None
            target.live_child_count -= 1
        return result
class ChoiceTree:
    """Records sequences of choices made during shrinking so that we
    can track what parts of a pass have run. Used to create Chooser
    objects that are the main interface that a pass uses to make
    decisions about what to do.
    """

    def __init__(self) -> None:
        self.root = TreeNode()

    @property
    def exhausted(self) -> bool:
        # True once every possible sequence of choices has been explored.
        return self.root.exhausted

    def step(
        self,
        selection_order: Callable[[int, int], Iterable[int]],
        f: Callable[[Chooser], None],
    ) -> Sequence[int]:
        """Run ``f`` with a fresh Chooser and record the decisions it made.

        Returns the sequence of choices, suitable for seeding the next
        step's selection order.
        """
        assert not self.exhausted
        chooser = Chooser(self, selection_order)
        try:
            f(chooser)
        except DeadBranch:
            # f ran out of live options; the partial path is still recorded
            # by finish() below.
            pass
        return chooser.finish()
class TreeNode:
    """A single node of a ChoiceTree.

    Children are created lazily on first access. ``live_child_count``
    stays ``None`` until the node is first visited and the number of
    available options is known; ``n`` records that option count.
    """

    def __init__(self) -> None:
        self.children: Dict[int, TreeNode] = defaultdict(TreeNode)
        self.live_child_count: "Optional[int]" = None
        self.n: "Optional[int]" = None

    @property
    def exhausted(self) -> bool:
        """True once every child of this node has been fully explored."""
        # An unvisited node (count still None) is never exhausted.
        count = self.live_child_count
        return count is not None and count == 0
# Shared sentinel used to replace fully-explored children: a permanently
# exhausted node (live_child_count is 0 from the start).
DeadNode = TreeNode()
DeadNode.live_child_count = 0
class DeadBranch(Exception):
    """Raised when every remaining option at a choice point would lead
    into an already-exhausted part of the tree."""

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,427 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
import attr
from hypothesis.errors import Flaky, HypothesisException, StopTest
from hypothesis.internal.compat import int_to_bytes
from hypothesis.internal.conjecture.data import (
ConjectureData,
DataObserver,
Status,
bits_to_bytes,
)
from hypothesis.internal.conjecture.junkdrawer import IntList
class PreviouslyUnseenBehaviour(HypothesisException):
    """Raised when simulating a test function walks off the recorded
    tree, i.e. data generation did something we have not seen before."""

    pass
def inconsistent_generation():
    """Raise ``Flaky`` to report that data generation was nondeterministic
    between runs (always raises; never returns)."""
    raise Flaky(
        "Inconsistent data generation! Data generation behaved differently "
        "between different runs. Is your data generation depending on external "
        "state?"
    )
EMPTY: frozenset = frozenset()
@attr.s(slots=True)
class Killed:
    """Represents a transition to part of the tree which has been marked as
    "killed", meaning we want to treat it as not worth exploring, so it will
    be treated as if it were completely explored for the purposes of
    exhaustion."""

    # The TreeNode reached after passing through this killed transition.
    next_node = attr.ib()
@attr.s(slots=True)
class Branch:
    """Represents a transition where multiple choices can be made as to what
    to draw."""

    # Number of bits drawn at this branch point.
    bit_length = attr.ib()
    # Maps each value seen drawn here to its child TreeNode.
    children = attr.ib(repr=False)

    @property
    def max_children(self):
        # A draw of ``bit_length`` bits has 2 ** bit_length possible values.
        return 1 << self.bit_length
@attr.s(slots=True, frozen=True)
class Conclusion:
    """Represents a transition to a finished state."""

    # Final Status of the test run that ended here.
    status = attr.ib()
    # Origin of the failure when status is INTERESTING, else None —
    # TODO confirm against callers; not visible from this file.
    interesting_origin = attr.ib()
@attr.s(slots=True)
class TreeNode:
    """Node in a tree that corresponds to previous interactions with
    a ``ConjectureData`` object according to some fixed test function.
    This is functionally a variant patricia trie.
    See https://en.wikipedia.org/wiki/Radix_tree for the general idea,
    but what this means in particular here is that we have a very deep
    but very lightly branching tree and rather than store this as a fully
    recursive structure we flatten prefixes and long branches into
    lists. This significantly compacts the storage requirements.
    A single ``TreeNode`` corresponds to a previously seen sequence
    of calls to ``ConjectureData`` which we have never seen branch,
    followed by a ``transition`` which describes what happens next.
    """

    # Records the previous sequence of calls to ``data.draw_bits``,
    # with the ``n_bits`` argument going in ``bit_lengths`` and the
    # values seen in ``values``. These should always have the same
    # length.
    bit_lengths = attr.ib(factory=IntList)
    values = attr.ib(factory=IntList)

    # The indices of the calls to ``draw_bits`` that we have stored
    # where ``forced`` is not None. Stored as None if no indices
    # have been forced, purely for space saving reasons (we force
    # quite rarely).
    __forced = attr.ib(default=None, init=False)

    # What happens next after observing this sequence of calls.
    # Either:
    #
    # * ``None``, indicating we don't know yet.
    # * A ``Branch`` object indicating that there is a ``draw_bits``
    #   call that we have seen take multiple outcomes there.
    # * A ``Conclusion`` object indicating that ``conclude_test``
    #   was called here.
    transition = attr.ib(default=None)

    # A tree node is exhausted if every possible sequence of
    # draws below it has been explored. We store this information
    # on a field and update it when performing operations that
    # could change the answer.
    #
    # A node may start exhausted, e.g. because it leads
    # immediately to a conclusion, but can only go from
    # non-exhausted to exhausted when one of its children
    # becomes exhausted or it is marked as a conclusion.
    #
    # Therefore we only need to check whether we need to update
    # this field when the node is first created in ``split_at``
    # or when we have walked a path through this node to a
    # conclusion in ``TreeRecordingObserver``.
    is_exhausted = attr.ib(default=False, init=False)

    @property
    def forced(self):
        """The set of forced indices, or the shared EMPTY set if none."""
        if not self.__forced:
            return EMPTY
        return self.__forced

    def mark_forced(self, i):
        """Note that the value at index ``i`` was forced."""
        assert 0 <= i < len(self.values)
        if self.__forced is None:
            self.__forced = set()
        self.__forced.add(i)

    def split_at(self, i):
        """Splits the tree so that it can incorporate
        a decision at the ``draw_bits`` call corresponding
        to position ``i``, or raises ``Flaky`` if that was
        meant to be a forced node."""
        if i in self.forced:
            inconsistent_generation()
        assert not self.is_exhausted
        key = self.values[i]
        # Everything after position i moves into a new child node,
        # which inherits our old transition.
        child = TreeNode(
            bit_lengths=self.bit_lengths[i + 1 :],
            values=self.values[i + 1 :],
            transition=self.transition,
        )
        self.transition = Branch(bit_length=self.bit_lengths[i], children={key: child})
        if self.__forced is not None:
            # Forced indices past the split shift down into the child;
            # earlier ones stay here.
            child.__forced = {j - i - 1 for j in self.__forced if j > i}
            self.__forced = {j for j in self.__forced if j < i}
        child.check_exhausted()
        del self.values[i:]
        del self.bit_lengths[i:]
        assert len(self.values) == len(self.bit_lengths) == i

    def check_exhausted(self):
        """Recalculates ``self.is_exhausted`` if necessary then returns
        it."""
        # Only a node whose draws are all forced and whose transition is
        # known can possibly be exhausted; see the is_exhausted comment.
        if (
            not self.is_exhausted
            and len(self.forced) == len(self.values)
            and self.transition is not None
        ):
            if isinstance(self.transition, (Conclusion, Killed)):
                self.is_exhausted = True
            elif len(self.transition.children) == self.transition.max_children:
                self.is_exhausted = all(
                    v.is_exhausted for v in self.transition.children.values()
                )
        return self.is_exhausted
class DataTree:
    """Tracks the tree structure of a collection of ConjectureData
    objects, for use in ConjectureRunner."""

    def __init__(self) -> None:
        self.root = TreeNode()

    @property
    def is_exhausted(self):
        """Returns True if every possible node is dead and thus the language
        described must have been fully explored."""
        return self.root.is_exhausted

    def generate_novel_prefix(self, random):
        """Generate a short random string that (after rewriting) is not
        a prefix of any buffer previously added to the tree.
        The resulting prefix is essentially arbitrary - it would be nice
        for it to be uniform at random, but previous attempts to do that
        have proven too expensive.
        """
        assert not self.is_exhausted
        novel_prefix = bytearray()

        def append_int(n_bits, value):
            # Encode ``value`` using the same fixed-width byte encoding
            # that draw_bits consumes.
            novel_prefix.extend(int_to_bytes(value, bits_to_bytes(n_bits)))

        current_node = self.root
        while True:
            assert not current_node.is_exhausted
            for i, (n_bits, value) in enumerate(
                zip(current_node.bit_lengths, current_node.values)
            ):
                if i in current_node.forced:
                    append_int(n_bits, value)
                else:
                    # Draw any value other than the one previously seen
                    # at this position.
                    while True:
                        k = random.getrandbits(n_bits)
                        if k != value:
                            append_int(n_bits, k)
                            break
                    # We've now found a value that is allowed to
                    # vary, so what follows is not fixed.
                    return bytes(novel_prefix)
            else:
                # for-else: we walked the whole recorded prefix without
                # varying anything, so consult the transition.
                assert not isinstance(current_node.transition, (Conclusion, Killed))
                if current_node.transition is None:
                    return bytes(novel_prefix)
                branch = current_node.transition
                assert isinstance(branch, Branch)
                n_bits = branch.bit_length
                check_counter = 0
                while True:
                    k = random.getrandbits(n_bits)
                    try:
                        child = branch.children[k]
                    except KeyError:
                        # A value never seen at this branch: the prefix is
                        # novel from here on.
                        append_int(n_bits, k)
                        return bytes(novel_prefix)
                    if not child.is_exhausted:
                        append_int(n_bits, k)
                        current_node = child
                        break
                    check_counter += 1
                    # We don't expect this assertion to ever fire, but coverage
                    # wants the loop inside to run if you have branch checking
                    # on, hence the pragma.
                    assert (  # pragma: no cover
                        check_counter != 1000
                        or len(branch.children) < (2**n_bits)
                        or any(not v.is_exhausted for v in branch.children.values())
                    )

    def rewrite(self, buffer):
        """Use previously seen ConjectureData objects to return a tuple of
        the rewritten buffer and the status we would get from running that
        buffer with the test function. If the status cannot be predicted
        from the existing values it will be None."""
        buffer = bytes(buffer)
        data = ConjectureData.for_buffer(buffer)
        try:
            self.simulate_test_function(data)
            return (data.buffer, data.status)
        except PreviouslyUnseenBehaviour:
            return (buffer, None)

    def simulate_test_function(self, data):
        """Run a simulated version of the test function recorded by
        this tree. Note that this does not currently call ``stop_example``
        or ``start_example`` as these are not currently recorded in the
        tree. This will likely change in future."""
        node = self.root
        try:
            while True:
                # Replay the flattened draws recorded on this node; any
                # deviation means the tree cannot predict this buffer.
                for i, (n_bits, previous) in enumerate(
                    zip(node.bit_lengths, node.values)
                ):
                    v = data.draw_bits(
                        n_bits, forced=node.values[i] if i in node.forced else None
                    )
                    if v != previous:
                        raise PreviouslyUnseenBehaviour
                if isinstance(node.transition, Conclusion):
                    t = node.transition
                    # conclude_test raises StopTest, exiting the loop below.
                    data.conclude_test(t.status, t.interesting_origin)
                elif node.transition is None:
                    raise PreviouslyUnseenBehaviour
                elif isinstance(node.transition, Branch):
                    v = data.draw_bits(node.transition.bit_length)
                    try:
                        node = node.transition.children[v]
                    except KeyError as err:
                        raise PreviouslyUnseenBehaviour from err
                else:
                    assert isinstance(node.transition, Killed)
                    data.observer.kill_branch()
                    node = node.transition.next_node
        except StopTest:
            pass

    def new_observer(self):
        """Create an observer that records a fresh run into this tree."""
        return TreeRecordingObserver(self)
class TreeRecordingObserver(DataObserver):
    """DataObserver that records the draws of a single test-function run
    into a DataTree, splitting nodes where behaviour diverges."""

    def __init__(self, tree):
        # Node whose recorded draws we are currently walking/extending.
        self.__current_node = tree.root
        # Position within __current_node's flattened draw lists.
        self.__index_in_current_node = 0
        # All distinct nodes visited, in order; used by __update_exhausted.
        self.__trail = [self.__current_node]
        self.killed = False

    def draw_bits(self, n_bits, forced, value):
        i = self.__index_in_current_node
        self.__index_in_current_node += 1
        node = self.__current_node
        assert len(node.bit_lengths) == len(node.values)
        if i < len(node.bit_lengths):
            # We are replaying draws this node has already recorded.
            if n_bits != node.bit_lengths[i]:
                inconsistent_generation()
            # Note that we don't check whether a previously
            # forced value is now free. That will be caught
            # if we ever split the node there, but otherwise
            # may pass silently. This is acceptable because it
            # means we skip a hash set lookup on every
            # draw and that's a pretty niche failure mode.
            if forced and i not in node.forced:
                inconsistent_generation()
            if value != node.values[i]:
                # Divergence: split the node here and start a fresh child
                # for the new value.
                node.split_at(i)
                assert i == len(node.values)
                new_node = TreeNode()
                branch = node.transition
                branch.children[value] = new_node
                self.__current_node = new_node
                self.__index_in_current_node = 0
        else:
            # We have walked past this node's recorded draws.
            trans = node.transition
            if trans is None:
                # Unexplored territory: extend the node in place.
                node.bit_lengths.append(n_bits)
                node.values.append(value)
                if forced:
                    node.mark_forced(i)
            elif isinstance(trans, Conclusion):
                assert trans.status != Status.OVERRUN
                # We tried to draw where history says we should have
                # stopped
                inconsistent_generation()
            else:
                assert isinstance(trans, Branch), trans
                if n_bits != trans.bit_length:
                    inconsistent_generation()
                try:
                    self.__current_node = trans.children[value]
                except KeyError:
                    self.__current_node = trans.children.setdefault(value, TreeNode())
                self.__index_in_current_node = 0
        if self.__trail[-1] is not self.__current_node:
            self.__trail.append(self.__current_node)

    def kill_branch(self):
        """Mark this part of the tree as not worth re-exploring."""
        if self.killed:
            return
        self.killed = True
        if self.__index_in_current_node < len(self.__current_node.values) or (
            self.__current_node.transition is not None
            and not isinstance(self.__current_node.transition, Killed)
        ):
            inconsistent_generation()
        if self.__current_node.transition is None:
            self.__current_node.transition = Killed(TreeNode())
            self.__update_exhausted()
        self.__current_node = self.__current_node.transition.next_node
        self.__index_in_current_node = 0
        self.__trail.append(self.__current_node)

    def conclude_test(self, status, interesting_origin):
        """Says that ``status`` occurred at node ``node``. This updates the
        node if necessary and checks for consistency."""
        if status == Status.OVERRUN:
            return
        i = self.__index_in_current_node
        node = self.__current_node
        if i < len(node.values) or isinstance(node.transition, Branch):
            inconsistent_generation()
        new_transition = Conclusion(status, interesting_origin)
        if node.transition is not None and node.transition != new_transition:
            # As an, I'm afraid, horrible bodge, we deliberately ignore flakiness
            # where tests go from interesting to valid, because it's much easier
            # to produce good error messages for these further up the stack.
            if isinstance(node.transition, Conclusion) and (
                node.transition.status != Status.INTERESTING
                or new_transition.status != Status.VALID
            ):
                raise Flaky(
                    f"Inconsistent test results! Test case was {node.transition!r} "
                    f"on first run but {new_transition!r} on second"
                )
        else:
            node.transition = new_transition
        assert node is self.__trail[-1]
        node.check_exhausted()
        assert len(node.values) > 0 or node.check_exhausted()
        if not self.killed:
            self.__update_exhausted()

    def __update_exhausted(self):
        for t in reversed(self.__trail):
            # Any node we've traversed might have now become exhausted.
            # We check from the right. As soon as we hit a node that
            # isn't exhausted, this automatically implies that all of
            # its parents are not exhausted, so we stop.
            if not t.check_exhausted():
                break

View File

@@ -0,0 +1,674 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
import threading
from collections import Counter, defaultdict, deque
from math import inf
from hypothesis.internal.reflection import proxies
def cached(fn):
    """Method decorator that memoises ``fn`` per (thread, args) pair.

    Note the hand-mangled name ``_DFA__cache``: this decorator is defined
    outside the DFA class but reaches into its private per-thread cache
    helper, so it only works on methods of DFA (or subclasses).
    """

    @proxies(fn)
    def wrapped(self, *args):
        cache = self._DFA__cache(fn.__name__)
        try:
            return cache[args]
        except KeyError:
            # Compute once and store; setdefault keeps the first value if
            # a recursive call populated the entry in the meantime.
            return cache.setdefault(args, fn(self, *args))

    return wrapped
class DFA:
    """Base class for implementations of deterministic finite
    automata.
    This is abstract to allow for the possibility of states
    being calculated lazily as we traverse the DFA (which
    we make heavy use of in our L* implementation - see
    lstar.py for details).
    States can be of any hashable type.
    """

    def __init__(self):
        # Per-thread caches: each thread gets its own dict per method name.
        self.__caches = threading.local()

    def __cache(self, name):
        # Return (creating on first use) this thread's cache dict for
        # the method called ``name``.
        try:
            cache = getattr(self.__caches, name)
        except AttributeError:
            cache = {}
            setattr(self.__caches, name, cache)
        return cache

    @property
    def start(self):
        """Returns the starting state."""
        raise NotImplementedError

    def is_accepting(self, i):
        """Returns if state ``i`` is an accepting one."""
        raise NotImplementedError

    def transition(self, i, c):
        """Returns the state that i transitions to on reading
        character c from a string."""
        raise NotImplementedError

    @property
    def alphabet(self):
        # By default the alphabet is all byte values.
        return range(256)

    def transitions(self, i):
        """Iterates over all pairs (byte, state) of transitions
        which do not lead to dead states."""
        for c, j in self.raw_transitions(i):
            if not self.is_dead(j):
                yield c, j

    @cached
    def transition_counts(self, state):
        # For each distinct live successor of ``state``, count how many
        # characters lead to it. Used by count_strings.
        counts = Counter()
        for _, j in self.transitions(state):
            counts[j] += 1
        return list(counts.items())

    def matches(self, s):
        """Returns whether the string ``s`` is accepted
        by this automaton."""
        i = self.start
        for c in s:
            i = self.transition(i, c)
        return self.is_accepting(i)

    def all_matching_regions(self, string):
        """Return all pairs ``(u, v)`` such that ``self.matches(string[u:v])``."""
        # Stack format: (k, state, indices). After reading ``k`` characters
        # starting from any i in ``indices`` the DFA would be at ``state``.
        stack = [(0, self.start, range(len(string)))]
        results = []
        while stack:
            k, state, indices = stack.pop()
            # If the state is dead, abort early - no point continuing on
            # from here where there will be no more matches.
            if self.is_dead(state):
                continue
            # If the state is accepting, then every one of these indices
            # has a matching region of length ``k`` starting from it.
            if self.is_accepting(state):
                results.extend([(i, i + k) for i in indices])
            next_by_state = defaultdict(list)
            for i in indices:
                if i + k < len(string):
                    c = string[i + k]
                    next_by_state[self.transition(state, c)].append(i)
            for next_state, next_indices in next_by_state.items():
                stack.append((k + 1, next_state, next_indices))
        return results

    def max_length(self, i):
        """Returns the maximum length of a string that is
        accepted when starting from i."""
        if self.is_dead(i):
            return 0
        cache = self.__cache("max_length")
        try:
            return cache[i]
        except KeyError:
            pass
        # Naively we can calculate this as 1 longer than the
        # max length of the non-dead states this can immediately
        # transition to, but a) We don't want unbounded recursion
        # because that's how you get RecursionErrors and b) This
        # makes it hard to look for cycles. So we basically do
        # the recursion explicitly with a stack, but we maintain
        # a parallel set that tracks what's already on the stack
        # so that when we encounter a loop we can immediately
        # determine that the max length here is infinite.
        stack = [i]
        stack_set = {i}

        def pop():
            """Remove the top element from the stack, maintaining
            the stack set appropriately."""
            assert len(stack) == len(stack_set)
            j = stack.pop()
            stack_set.remove(j)
            assert len(stack) == len(stack_set)

        while stack:
            j = stack[-1]
            assert not self.is_dead(j)
            # If any of the children have infinite max_length we don't
            # need to check all of them to know that this state does
            # too.
            if any(cache.get(k) == inf for k in self.successor_states(j)):
                cache[j] = inf
                pop()
                continue
            # Recurse to the first child node that we have not yet
            # calculated max_length for.
            for k in self.successor_states(j):
                if k in stack_set:
                    # k is part of a loop and is known to be live
                    # (since we never push dead states on the stack),
                    # so it can reach strings of unbounded length.
                    assert not self.is_dead(k)
                    cache[k] = inf
                    break
                elif k not in cache and not self.is_dead(k):
                    stack.append(k)
                    stack_set.add(k)
                    break
            else:
                # All of j's successors have a known max_length or are dead,
                # so we can now compute a max_length for j itself.
                cache[j] = max(
                    (
                        1 + cache[k]
                        for k in self.successor_states(j)
                        if not self.is_dead(k)
                    ),
                    default=0,
                )
                # j is live so it must either be accepting or have a live child.
                assert self.is_accepting(j) or cache[j] > 0
                pop()
        return cache[i]

    @cached
    def has_strings(self, state, length):
        """Returns if any strings of length ``length`` are accepted when
        starting from state ``state``."""
        assert length >= 0
        # NOTE(review): this also maintains a manual cache in addition to
        # the @cached decorator, so that intermediate (state, n) answers
        # computed below are reused across calls.
        cache = self.__cache("has_strings")
        try:
            return cache[state, length]
        except KeyError:
            pass
        # First pass: collect all (state, n) subproblems reachable from
        # the query that are not yet cached.
        pending = [(state, length)]
        seen = set()
        i = 0
        while i < len(pending):
            s, n = pending[i]
            i += 1
            if n > 0:
                for t in self.successor_states(s):
                    key = (t, n - 1)
                    if key not in cache and key not in seen:
                        pending.append(key)
                        seen.add(key)
        # Second pass: solve subproblems in reverse discovery order, so
        # each answer only depends on already-computed ones.
        while pending:
            s, n = pending.pop()
            if n == 0:
                cache[s, n] = self.is_accepting(s)
            else:
                cache[s, n] = any(
                    cache.get((t, n - 1)) for t in self.successor_states(s)
                )
        return cache[state, length]

    def count_strings(self, state, length):
        """Returns the number of strings of length ``length``
        that are accepted when starting from state ``state``."""
        assert length >= 0
        cache = self.__cache("count_strings")
        try:
            return cache[state, length]
        except KeyError:
            pass
        # Same two-pass dynamic programming scheme as has_strings, but
        # summing counts weighted by transition multiplicities.
        pending = [(state, length)]
        seen = set()
        i = 0
        while i < len(pending):
            s, n = pending[i]
            i += 1
            if n > 0:
                for t in self.successor_states(s):
                    key = (t, n - 1)
                    if key not in cache and key not in seen:
                        pending.append(key)
                        seen.add(key)
        while pending:
            s, n = pending.pop()
            if n == 0:
                cache[s, n] = int(self.is_accepting(s))
            else:
                cache[s, n] = sum(
                    cache[t, n - 1] * k for t, k in self.transition_counts(s)
                )
        return cache[state, length]

    @cached
    def successor_states(self, state):
        """Returns all of the distinct states that can be reached via one
        transition from ``state``, in the lexicographic order of the
        smallest character that reaches them."""
        seen = set()
        result = []
        for _, j in self.raw_transitions(state):
            if j not in seen:
                seen.add(j)
                result.append(j)
        return tuple(result)

    def is_dead(self, state):
        """Returns True if no strings can be accepted
        when starting from ``state``."""
        return not self.is_live(state)

    def is_live(self, state):
        """Returns True if any strings can be accepted
        when starting from ``state``."""
        if self.is_accepting(state):
            return True
        # We work this out by calculating is_live for all nodes
        # reachable from state which have not already had it calculated.
        cache = self.__cache("is_live")
        try:
            return cache[state]
        except KeyError:
            pass
        # roots are states that we know already must be live,
        # either because we have previously calculated them to
        # be or because they are an accepting state.
        roots = set()
        # We maintain a backwards graph where ``j in backwards_graph[k]``
        # if there is a transition from j to k. Thus if a key in this
        # graph is live, so must all its values be.
        backwards_graph = defaultdict(set)
        # First we find all reachable nodes from i which have not
        # already been cached, noting any which are roots and
        # populating the backwards graph.
        explored = set()
        queue = deque([state])
        while queue:
            j = queue.popleft()
            if cache.get(j, self.is_accepting(j)):
                # If j can be immediately determined to be live
                # then there is no point in exploring beneath it,
                # because any effect of states below it is screened
                # off by the known answer for j.
                roots.add(j)
                continue
            if j in cache:
                # Likewise if j is known to be dead then there is
                # no point exploring beneath it because we know
                # that all nodes reachable from it must be dead.
                continue
            if j in explored:
                continue
            explored.add(j)
            for k in self.successor_states(j):
                backwards_graph[k].add(j)
                queue.append(k)
        # Walk the backwards graph from the live roots: everything that
        # can reach a root is live, everything else explored is dead.
        marked_live = set()
        queue = deque(roots)
        while queue:
            j = queue.popleft()
            if j in marked_live:
                continue
            marked_live.add(j)
            for k in backwards_graph[j]:
                queue.append(k)
        for j in explored:
            cache[j] = j in marked_live
        return cache[state]

    def all_matching_strings_of_length(self, k):
        """Yields all matching strings whose length is ``k``, in ascending
        lexicographic order."""
        if k == 0:
            if self.is_accepting(self.start):
                yield b""
            return
        if not self.has_strings(self.start, k):
            return
        # This tracks a path through the DFA. We alternate between growing
        # it until it has length ``k`` and is in an accepting state, then
        # yielding that as a result, then modifying it so that the next
        # time we do that it will yield the lexicographically next matching
        # string.
        path = bytearray()
        # Tracks the states that are visited by following ``path`` from the
        # starting point.
        states = [self.start]
        while True:
            # First we build up our current best prefix to the lexicographically
            # first string starting with it.
            while len(path) < k:
                state = states[-1]
                for c, j in self.transitions(state):
                    if self.has_strings(j, k - len(path) - 1):
                        states.append(j)
                        path.append(c)
                        break
                else:
                    raise NotImplementedError("Should be unreachable")
            assert self.is_accepting(states[-1])
            assert len(states) == len(path) + 1
            yield bytes(path)
            # Now we want to replace this string with the prefix that will
            # cause us to extend to its lexicographic successor. This can
            # be thought of as just repeatedly moving to the next lexicographic
            # successor until we find a matching string, but we're able to
            # use our length counts to jump over long sequences where there
            # cannot be a match.
            while True:
                # As long as we are in this loop we are trying to move to
                # the successor of the current string.
                # If we've removed the entire prefix then we're done - no
                # successor is possible.
                if not path:
                    return
                if path[-1] == 255:
                    # If our last element is maximal then the we have to "carry
                    # the one" - our lexicographic successor must be incremented
                    # earlier than this.
                    path.pop()
                    states.pop()
                else:
                    # Otherwise increment by one.
                    path[-1] += 1
                    states[-1] = self.transition(states[-2], path[-1])
                    # If there are no strings of the right length starting from
                    # this prefix we need to keep going. Otherwise, this is
                    # the right place to be and we break out of our loop of
                    # trying to find the successor because it starts here.
                    if self.count_strings(states[-1], k - len(path)) > 0:
                        break

    def all_matching_strings(self, min_length=0):
        """Iterate over all strings matched by this automaton
        in shortlex-ascending order."""
        # max_length might be infinite, hence the while loop
        max_length = self.max_length(self.start)
        length = min_length
        while length <= max_length:
            yield from self.all_matching_strings_of_length(length)
            length += 1

    def raw_transitions(self, i):
        # Yields (character, destination) for every character in the
        # alphabet, including transitions into dead states.
        for c in self.alphabet:
            j = self.transition(i, c)
            yield c, j

    def canonicalise(self):
        """Return a canonical version of ``self`` as a ConcreteDFA.
        The DFA is not minimized, but nodes are sorted and relabelled
        and dead nodes are pruned, so two minimized DFAs for the same
        language will end up with identical canonical representatives.
        This is mildly important because it means that the output of
        L* should produce the same canonical DFA regardless of what
        order we happen to have run it in.
        """
        # We map all states to their index of appearance in depth
        # first search. This both is useful for canonicalising and
        # also allows for states that aren't integers.
        state_map = {}
        reverse_state_map = []
        accepting = set()
        seen = set()
        queue = deque([self.start])
        while queue:
            state = queue.popleft()
            if state in state_map:
                continue
            i = len(reverse_state_map)
            if self.is_accepting(state):
                accepting.add(i)
            reverse_state_map.append(state)
            state_map[state] = i
            for _, j in self.transitions(state):
                if j in seen:
                    continue
                seen.add(j)
                queue.append(j)
        transitions = [
            {c: state_map[s] for c, s in self.transitions(t)} for t in reverse_state_map
        ]
        result = ConcreteDFA(transitions, accepting)
        assert self.equivalent(result)
        return result

    def equivalent(self, other):
        """Checks whether this DFA and other match precisely the same
        language.
        Uses the classic algorithm of Hopcroft and Karp (more or less):
        Hopcroft, John E. A linear algorithm for testing equivalence
        of finite automata. Vol. 114. Defense Technical Information Center, 1971.
        """
        # The basic idea of this algorithm is that we repeatedly
        # merge states that would be equivalent if the two start
        # states were. This starts by merging the two start states,
        # and whenever we merge two states merging all pairs of
        # states that are reachable by following the same character
        # from that point.
        #
        # Whenever we merge two states, we check if one of them
        # is accepting and the other non-accepting. If so, we have
        # obtained a contradiction and have made a bad merge, so
        # the two start states must not have been equivalent in the
        # first place and we return False.
        #
        # If the languages matched are different then some string
        # is contained in one but not the other. By looking at
        # the pairs of states visited by traversing the string in
        # each automaton in parallel, we eventually come to a pair
        # of states that would have to be merged by this algorithm
        # where one is accepting and the other is not. Thus this
        # algorithm always returns False as a result of a bad merge
        # if the two languages are not the same.
        #
        # If we successfully complete all merges without a contradiction
        # we can thus safely return True.
        # We maintain a union/find table for tracking merges of states.
        table = {}

        def find(s):
            # Follow the chain to the representative, then compress the
            # path so future lookups are O(1).
            trail = [s]
            while trail[-1] in table and table[trail[-1]] != trail[-1]:
                trail.append(table[trail[-1]])
            for t in trail:
                table[t] = trail[-1]
            return trail[-1]

        def union(s, t):
            s = find(s)
            t = find(t)
            table[s] = t

        alphabet = sorted(set(self.alphabet) | set(other.alphabet))
        queue = deque([((self.start, other.start))])
        while queue:
            self_state, other_state = queue.popleft()
            # We use a DFA/state pair for keys because the same value
            # may represent a different state in each DFA.
            self_key = (self, self_state)
            other_key = (other, other_state)
            # We have already merged these, no need to remerge.
            if find(self_key) == find(other_key):
                continue
            # We have found a contradiction, therefore the two DFAs must
            # not be equivalent.
            if self.is_accepting(self_state) != other.is_accepting(other_state):
                return False
            # Merge the two states
            union(self_key, other_key)
            # And also queue any logical consequences of merging those
            # two states for merging.
            for c in alphabet:
                queue.append(
                    (self.transition(self_state, c), other.transition(other_state, c))
                )
        return True
# Sentinel used in place of an integer state index for the implicit "dead"
# state: ConcreteDFA.transition maps it to itself for every character, and
# (having no integer label) it is never in the accepting set.
DEAD = "DEAD"
class ConcreteDFA(DFA):
    """A concrete representation of a DFA in terms of an explicit list
    of states."""

    def __init__(self, transitions, accepting, start=0):
        """
        * ``transitions`` is a list where transitions[i] represents the
          valid transitions out of state ``i``. Elements may be either dicts
          (in which case they map characters to other states) or lists. If they
          are a list they may contain tuples of length 2 or 3. A tuple ``(c, j)``
          indicates that this state transitions to state ``j`` given ``c``. A
          tuple ``(u, v, j)`` indicates this state transitions to state ``j``
          given any ``c`` with ``u <= c <= v``.
        * ``accepting`` is a set containing the integer labels of accepting
          states.
        * ``start`` is the integer label of the starting state.
        """
        super().__init__()
        self.__start = start
        self.__accepting = accepting
        # Copy so that the in-place list->dict conversion performed in
        # ``transition`` cannot mutate the caller's list.
        self.__transitions = list(transitions)

    def __repr__(self):
        transitions = []
        # Particularly for including in source code it's nice to have the more
        # compact repr, so where possible we convert to the tuple based representation
        # which can represent ranges more compactly.
        for i in range(len(self.__transitions)):
            table = []
            for c, j in self.transitions(i):
                # Either start a new run, or extend the current run when the
                # destination matches and the character is contiguous.
                if not table or j != table[-1][-1] or c != table[-1][1] + 1:
                    table.append([c, c, j])
                else:
                    table[-1][1] = c
            transitions.append([(u, j) if u == v else (u, v, j) for u, v, j in table])

        start = "" if self.__start == 0 else f", start={self.__start!r}"

        return f"ConcreteDFA({transitions!r}, {self.__accepting!r}{start})"

    @property
    def start(self):
        # Integer label of the starting state.
        return self.__start

    def is_accepting(self, i):
        # Membership in the accepting set; DEAD is never a member.
        return i in self.__accepting

    def transition(self, state, char):
        """Returns the state that i transitions to on reading
        character c from a string."""
        if state == DEAD:
            return DEAD

        table = self.__transitions[state]

        # Given long transition tables we convert them to
        # dictionaries for more efficient lookup.
        if not isinstance(table, dict) and len(table) >= 5:
            new_table = {}
            for t in table:
                if len(t) == 2:
                    new_table[t[0]] = t[1]
                else:
                    u, v, j = t
                    for c in range(u, v + 1):
                        new_table[c] = j
            # Cache the expanded form so subsequent lookups are O(1).
            self.__transitions[state] = new_table
            table = new_table

        if isinstance(table, dict):
            try:
                return self.__transitions[state][char]
            except KeyError:
                return DEAD
        else:
            # Short tables: a linear scan over (c, j) / (u, v, j) entries.
            for t in table:
                if len(t) == 2:
                    if t[0] == char:
                        return t[1]
                else:
                    u, v, j = t
                    if u <= char <= v:
                        return j
            return DEAD

    def raw_transitions(self, i):
        # Yield (character, destination) pairs for state ``i``, expanding
        # any (u, v, j) range entries into individual characters.
        if i == DEAD:
            return
        transitions = self.__transitions[i]
        if isinstance(transitions, dict):
            yield from sorted(transitions.items())
        else:
            for t in transitions:
                if len(t) == 2:
                    yield t
                else:
                    u, v, j = t
                    for c in range(u, v + 1):
                        yield c, j

View File

@@ -0,0 +1,498 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
from bisect import bisect_right, insort
from collections import Counter
import attr
from hypothesis.errors import InvalidState
from hypothesis.internal.conjecture.dfa import DFA, cached
from hypothesis.internal.conjecture.junkdrawer import (
IntList,
NotFound,
SelfOrganisingList,
find_integer,
)
"""
This module contains an implementation of the L* algorithm
for learning a deterministic finite automaton based on an
unknown membership function and a series of examples of
strings that may or may not satisfy it.
The two relevant papers for understanding this are:
* Angluin, Dana. "Learning regular sets from queries and counterexamples."
Information and computation 75.2 (1987): 87-106.
* Rivest, Ronald L., and Robert E. Schapire. "Inference of finite automata
using homing sequences." Information and Computation 103.2 (1993): 299-347.
Note that we only use the material from section 4.5 "Improving Angluin's L*
algorithm" (page 318), and all of the rest of the material on homing
sequences can be skipped.
The former explains the core algorithm, the latter a modification
we use (which we have further modified) which allows it to
be implemented more efficiently.
Although we continue to call this L*, we in fact depart heavily from it to the
point where honestly this is an entirely different algorithm and we should come
up with a better name.
We have several major departures from the papers:
1. We learn the automaton lazily as we traverse it. This is particularly
valuable because if we make many corrections on the same string we only
have to learn the transitions that correspond to the string we are
correcting on.
2. We make use of our ``find_integer`` method rather than a binary search
as proposed in the Rivest and Schapire paper, as we expect that
usually most strings will be mispredicted near the beginning.
3. We try to learn a smaller alphabet of "interestingly distinct"
values. e.g. if all bytes larger than two result in an invalid
string, there is no point in distinguishing those bytes. In aid
of this we learn a single canonicalisation table which maps integers
to smaller integers that we currently think are equivalent, and learn
their inequivalence where necessary. This may require more learning
steps, as at each stage in the process we might learn either an
inequivalent pair of integers or a new experiment, but it may greatly
reduce the number of membership queries we have to make.
In addition, we have a totally different approach for mapping a string to its
canonical representative, which will be explained below inline. The general gist
is that our implementation is much more willing to make mistakes: It will often
create a DFA that is demonstrably wrong, based on information that it already
has, but where it is too expensive to discover that before it causes us to
make a mistake.
A note on performance: This code is not really fast enough for
us to ever want to run in production on large strings, and this
is somewhat intrinsic. We should only use it in testing or for
learning languages offline that we can record for later use.
"""
@attr.s(slots=True)
class DistinguishedState:
    """Relevant information for a state that we have witnessed as definitely
    distinct from ones we have previously seen so far."""

    # Index of this state in the learner's list of states
    index: int = attr.ib()

    # A string that witnesses this state (i.e. when starting from the origin
    # and following this string you will end up in this state). Always a
    # byte string (the origin state is labelled b"").
    label: bytes = attr.ib()

    # A boolean as to whether this is an accepting state.
    accepting: bool = attr.ib()

    # A list of experiments that it is necessary to run to determine whether
    # a string is in this state. This is stored as a dict mapping experiments
    # to their expected result. A string is only considered to lead to this
    # state if ``all(learner.member(s + experiment) == result for experiment,
    # result in self.experiments.items())``.
    experiments: dict = attr.ib()

    # A cache of transitions out of this state, mapping bytes to the states
    # that they lead to.
    transitions: dict = attr.ib(factory=dict)
class LStar:
    """This class holds the state for learning a DFA. The current DFA can be
    accessed as the ``dfa`` member of this class. Such a DFA becomes invalid
    as soon as ``learn`` has been called, and should only be used until the
    next call to ``learn``.

    Note that many of the DFA methods are on this class, but it is not itself
    a DFA. The reason for this is that it stores mutable state which can cause
    the structure of the learned DFA to change in potentially arbitrary ways,
    making all cached properties become nonsense.
    """

    def __init__(self, member):
        """``member`` is the membership oracle: a function taking a byte
        string and returning True iff it is in the language to learn."""
        self.experiments = []
        self.__experiment_set = set()
        # Maps each byte to the canonical representative we currently
        # believe is equivalent to it.
        self.normalizer = IntegerNormalizer()

        # Membership queries may be expensive, so every answer is memoised.
        self.__member_cache = {}
        self.__member = member

        # Incremented whenever the learned model changes; see ``generation``.
        self.__generation = 0

        # A list of all state objects that correspond to strings we have
        # seen and can demonstrate map to unique states.
        self.__states = [
            DistinguishedState(
                index=0,
                label=b"",
                accepting=self.member(b""),
                experiments={b"": self.member(b"")},
            )
        ]

        # When we're trying to figure out what state a string leads to we will
        # end up searching to find a suitable candidate. By putting states in
        # a self-organising list we ideally minimise the number of lookups.
        self.__self_organising_states = SelfOrganisingList(self.__states)

        self.start = 0

        self.__dfa_changed()

    def __dfa_changed(self):
        """Note that something has changed, updating the generation
        and resetting any cached state."""
        self.__generation += 1
        self.dfa = LearnedDFA(self)

    def is_accepting(self, i):
        """Equivalent to ``self.dfa.is_accepting(i)``"""
        return self.__states[i].accepting

    def label(self, i):
        """Returns the string label for state ``i``."""
        return self.__states[i].label

    def transition(self, i, c):
        """Equivalent to ``self.dfa.transition(i, c)``"""
        c = self.normalizer.normalize(c)
        state = self.__states[i]
        try:
            return state.transitions[c]
        except KeyError:
            pass

        # The state that we transition to when reading ``c`` is reached by
        # this string, because this state is reached by state.label. We thus
        # want our candidate for the transition to be some state with a label
        # equivalent to this string.
        #
        # We find such a state by looking for one such that all of its listed
        # experiments agree on the result for its state label and this string.
        string = state.label + bytes([c])

        # We keep track of some useful experiments for distinguishing this
        # string from other states, as this both allows us to more accurately
        # select the state to map to and, if necessary, create the new state
        # that this string corresponds to with a decent set of starting
        # experiments.
        accumulated = {}
        # NOTE(review): ``counts`` is written below but never read here —
        # looks vestigial; confirm before removing.
        counts = Counter()

        def equivalent(t):
            """Checks if ``string`` could possibly lead to state ``t``."""
            for e, expected in accumulated.items():
                if self.member(t.label + e) != expected:
                    counts[e] += 1
                    return False

            for e, expected in t.experiments.items():
                result = self.member(string + e)
                if result != expected:
                    # We expect most experiments to return False so if we add
                    # only True ones to our collection of essential experiments
                    # we keep the size way down and select only ones that are
                    # likely to provide useful information in future.
                    if result:
                        accumulated[e] = result
                    return False
            return True

        try:
            destination = self.__self_organising_states.find(equivalent)
        except NotFound:
            # No existing state is consistent with ``string``, so it
            # witnesses a brand new state.
            i = len(self.__states)
            destination = DistinguishedState(
                index=i,
                label=string,
                experiments=accumulated,
                accepting=self.member(string),
            )
            self.__states.append(destination)
            self.__self_organising_states.add(destination)
        state.transitions[c] = destination.index
        return destination.index

    def member(self, s):
        """Check whether this string is a member of the language
        to be learned."""
        try:
            return self.__member_cache[s]
        except KeyError:
            result = self.__member(s)
            self.__member_cache[s] = result
        return result

    @property
    def generation(self):
        """Return an integer value that will be incremented
        every time the DFA we predict changes."""
        return self.__generation

    def learn(self, string):
        """Learn to give the correct answer on this string.
        That is, after this method completes we will have
        ``self.dfa.matches(s) == self.member(s)``.

        Note that we do not guarantee that this will remain
        true in the event that learn is called again with
        a different string. It is in principle possible that
        future learning will cause us to make a mistake on
        this string. However, repeatedly calling learn on
        each of a set of strings until the generation stops
        changing is guaranteed to terminate.
        """
        string = bytes(string)
        correct_outcome = self.member(string)

        # We don't want to check this inside the loop because it potentially
        # causes us to evaluate more of the states than we actually need to,
        # but if our model is mostly correct then this will be faster because
        # we only need to evaluate strings that are of the form
        # ``state + experiment``, which will generally be cached and/or needed
        # later.
        if self.dfa.matches(string) == correct_outcome:
            return

        # In the papers they assume that we only run this process
        # once, but this is silly - often when you've got a messy
        # string it will be wrong for many different reasons.
        #
        # Thus we iterate this to a fixed point where we repair
        # the DFA by repeatedly adding experiments until the DFA
        # agrees with the membership function on this string.

        # First we make sure that normalization is not the source of the
        # failure to match.
        while True:
            normalized = bytes(self.normalizer.normalize(c) for c in string)

            # We can correctly replace the string with its normalized version
            # so normalization is not the problem here.
            if self.member(normalized) == correct_outcome:
                string = normalized
                break

            alphabet = sorted(set(string), reverse=True)
            target = string
            for a in alphabet:

                def replace(b):
                    if a == b:
                        return target
                    return bytes(b if c == a else c for c in target)

                self.normalizer.distinguish(a, lambda x: self.member(replace(x)))
                target = replace(self.normalizer.normalize(a))
                assert self.member(target) == correct_outcome
            assert target != normalized
            self.__dfa_changed()

        if self.dfa.matches(string) == correct_outcome:
            return

        # Now we know normalization is correct we can attempt to determine if
        # any of our transitions are wrong.
        while True:
            dfa = self.dfa

            states = [dfa.start]

            def seems_right(n):
                """After reading n characters from s, do we seem to be
                in the right state?

                We determine this by replacing the first n characters
                of s with the label of the state we expect to be in.
                If we are in the right state, that will replace a substring
                with an equivalent one so must produce the same answer.
                """
                if n > len(string):
                    return False

                # Populate enough of the states list to know where we are.
                while n >= len(states):
                    states.append(dfa.transition(states[-1], string[len(states) - 1]))

                return self.member(dfa.label(states[n]) + string[n:]) == correct_outcome

            assert seems_right(0)

            n = find_integer(seems_right)

            # We got to the end without ever finding ourself in a bad
            # state, so we must correctly match this string.
            if n == len(string):
                assert dfa.matches(string) == correct_outcome
                break

            # Reading n characters does not put us in a bad state but
            # reading n + 1 does. This means that the remainder of
            # the string that we have not read yet is an experiment
            # that allows us to distinguish the state that we ended
            # up in from the state that we should have ended up in.

            source = states[n]
            character = string[n]
            wrong_destination = states[n + 1]

            # We've made an error in transitioning from ``source`` to
            # ``wrong_destination`` via ``character``. We now need to update
            # the DFA so that this transition no longer occurs. Note that we
            # do not guarantee that the transition is *correct* after this,
            # only that we don't make this particular error.
            assert self.transition(source, character) == wrong_destination

            labels_wrong_destination = self.dfa.label(wrong_destination)
            labels_correct_destination = self.dfa.label(source) + bytes([character])

            ex = string[n + 1 :]

            assert self.member(labels_wrong_destination + ex) != self.member(
                labels_correct_destination + ex
            )

            # Adding this experiment causes us to distinguish the wrong
            # destination from the correct one.
            self.__states[wrong_destination].experiments[ex] = self.member(
                labels_wrong_destination + ex
            )

            # We now clear the cached details that caused us to make this error
            # so that when we recalculate this transition we get to a
            # (hopefully now correct) different state.
            del self.__states[source].transitions[character]
            self.__dfa_changed()

            # We immediately recalculate the transition so that we can check
            # that it has changed as we expect it to have.
            new_destination = self.transition(source, string[n])
            assert new_destination != wrong_destination
class LearnedDFA(DFA):
    """This implements a lazily calculated DFA where states
    are labelled by some string that reaches them, and are
    distinguished by a membership test and a set of experiments."""

    def __init__(self, lstar):
        super().__init__()
        # The learner whose current model this DFA mirrors.
        self.__lstar = lstar
        # Generation at creation time, used to detect staleness: the learner
        # bumps its generation whenever the model changes.
        self.__generation = lstar.generation

    def __check_changed(self):
        # Every public accessor calls this so a stale DFA fails loudly
        # instead of silently answering from an outdated model.
        if self.__generation != self.__lstar.generation:
            raise InvalidState(
                "The underlying L* model has changed, so this DFA is no longer valid. "
                "If you want to preserve a previously learned DFA for posterity, call "
                "canonicalise() on it first."
            )

    def label(self, i):
        """Return the witness string for state ``i``."""
        self.__check_changed()
        return self.__lstar.label(i)

    @property
    def start(self):
        self.__check_changed()
        return self.__lstar.start

    def is_accepting(self, i):
        self.__check_changed()
        return self.__lstar.is_accepting(i)

    def transition(self, i, c):
        self.__check_changed()
        return self.__lstar.transition(i, c)

    @cached
    def successor_states(self, state):
        """Returns all of the distinct states that can be reached via one
        transition from ``state``, in the lexicographic order of the
        smallest character that reaches them."""
        seen = set()
        result = []
        # Only canonical representatives need checking: all other characters
        # are normalized onto one of these by the learner.
        for c in self.__lstar.normalizer.representatives():
            j = self.transition(state, c)
            if j not in seen:
                seen.add(j)
                result.append(j)
        return tuple(result)
class IntegerNormalizer:
    """A class for replacing non-negative integers with a
    "canonical" value that is equivalent for all relevant
    purposes."""

    def __init__(self):
        # We store canonical values as a sorted list of integers
        # with each value being treated as equivalent to the largest
        # integer in the list that is below it.
        self.__values = IntList([0])
        # Memoised value -> canonical lookups; cleared whenever the
        # canonical set changes (see ``distinguish``).
        self.__cache = {}

    def __repr__(self):
        return f"IntegerNormalizer({list(self.__values)!r})"

    def __copy__(self):
        result = IntegerNormalizer()
        result.__values = IntList(self.__values)
        return result

    def representatives(self):
        """Yield the canonical values in increasing order."""
        yield from self.__values

    def normalize(self, value):
        """Return the canonical integer considered equivalent
        to ``value``."""
        try:
            return self.__cache[value]
        except KeyError:
            pass

        # The canonical value is the largest stored value <= ``value``;
        # i >= 0 always holds because 0 is in the list and values are
        # non-negative.
        i = bisect_right(self.__values, value) - 1
        assert i >= 0

        return self.__cache.setdefault(value, self.__values[i])

    def distinguish(self, value, test):
        """Checks whether ``test`` gives the same answer for
        ``value`` and ``self.normalize(value)``. If it does
        not, updates the list of canonical values so that
        it does.

        Returns True if and only if this makes a change to
        the underlying canonical values."""
        canonical = self.normalize(value)
        if canonical == value:
            return False

        value_test = test(value)

        if test(canonical) == value_test:
            return False

        # The canonical set is about to change, so every memoised
        # normalization may now be wrong.
        self.__cache.clear()

        def can_lower(k):
            # True if value - k still behaves like ``value`` under ``test``
            # and stays strictly above the old canonical representative.
            new_canon = value - k
            if new_canon <= canonical:
                return False
            return test(new_canon) == value_test

        # Find the smallest representative that still agrees with ``value``.
        new_canon = value - find_integer(can_lower)

        assert new_canon not in self.__values
        insort(self.__values, new_canon)

        assert self.normalize(value) == new_canon

        return True

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,219 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
from array import array
from hypothesis.internal.floats import float_to_int, int_to_float
"""
This module implements support for arbitrary floating point numbers in
Conjecture. It doesn't make any attempt to get a good distribution, only to
get a format that will shrink well.
It works by defining an encoding of non-negative floating point numbers
(including NaN values with a zero sign bit) that has good lexical shrinking
properties.
This encoding is a tagged union of two separate encodings for floating point
numbers, with the tag being the first bit of 64 and the remaining 63-bits being
the payload.
If the tag bit is 0, the next 7 bits are ignored, and the remaining 7 bytes are
interpreted as a 7 byte integer in big-endian order and then converted to a
float (there is some redundancy here, as 7 * 8 = 56, which is larger than the
largest integer that floating point numbers can represent exactly, so multiple
encodings may map to the same float).
If the tag bit is 1, we instead use something that is closer to the normal
representation of floats (and can represent every non-negative float exactly)
but has a better ordering:
1. NaNs are ordered after everything else.
2. Infinity is ordered after every finite number.
3. The sign is ignored unless two floating point numbers are identical in
absolute magnitude. In that case, the positive is ordered before the
negative.
4. Positive floating point numbers are ordered first by int(x) where
encoding(x) < encoding(y) if int(x) < int(y).
5. If int(x) == int(y) then x and y are sorted towards lower denominators of
their fractional parts.
The format of this encoding of floating point goes as follows:
[exponent] [mantissa]
Each of these is the same size their equivalent in IEEE floating point, but are
in a different format.
We translate exponents as follows:
1. The maximum exponent (2 ** 11 - 1) is left unchanged.
2. We reorder the remaining exponents so that all of the positive exponents
are first, in increasing order, followed by all of the negative
exponents in decreasing order (where positive/negative is done by the
unbiased exponent e - 1023).
We translate the mantissa as follows:
1. If the unbiased exponent is <= 0 we reverse the mantissa bitwise.
2. If the unbiased exponent is >= 52 we leave it alone.
3. If the unbiased exponent is in the range [1, 51] then we reverse the
low k bits, where k is 52 - unbiased exponent.
The low bits correspond to the fractional part of the floating point number.
Reversing it bitwise means that we try to minimize the low bits, which kills
off the higher powers of 2 in the fraction first.
"""
MAX_EXPONENT = 0x7FF  # all-ones 11-bit exponent: infinities and NaNs
BIAS = 1023  # IEEE 754 double-precision exponent bias

MAX_POSITIVE_EXPONENT = MAX_EXPONENT - 1 - BIAS


def exponent_key(e: int) -> float:
    """Sort key for raw 11-bit exponents, implementing the ordering from the
    module docstring: non-negative unbiased exponents first (ascending), then
    negative ones (descending), with the special maximum exponent last."""
    if e == MAX_EXPONENT:
        return float("inf")
    unbiased = e - BIAS
    # Any offset above the largest positive unbiased exponent works here;
    # 10000 pushes every negative-unbiased key past all the positive ones.
    return unbiased if unbiased >= 0 else 10000 - unbiased


# ENCODING_TABLE[i] is the i-th "simplest" exponent under exponent_key;
# DECODING_TABLE is its inverse permutation.
ENCODING_TABLE = array("H", sorted(range(MAX_EXPONENT + 1), key=exponent_key))
DECODING_TABLE = array("H", [0]) * len(ENCODING_TABLE)

for index, exp in enumerate(ENCODING_TABLE):
    DECODING_TABLE[exp] = index
del index, exp
def decode_exponent(e: int) -> int:
    """Take draw_bits(11) and turn it into a suitable floating point exponent
    such that lexicographically simpler leads to simpler floats."""
    assert 0 <= e <= MAX_EXPONENT
    # Table lookup: position in the "simplicity" ordering -> raw exponent.
    return ENCODING_TABLE[e]
def encode_exponent(e: int) -> int:
    """Take a floating point exponent and turn it back into the equivalent
    result from conjecture."""
    assert 0 <= e <= MAX_EXPONENT
    # Inverse of decode_exponent: raw exponent -> position in the ordering.
    return DECODING_TABLE[e]
def reverse_byte(b: int) -> int:
    """Return ``b`` (0..255) with its eight bits in reversed order."""
    out = 0
    for shift in range(8):
        # The bit at position ``shift`` moves to position ``7 - shift``.
        if (b >> shift) & 1:
            out |= 0x80 >> shift
    return out


# Table mapping individual bytes to the equivalent byte with the bits of the
# byte reversed. e.g. 1=0b1 is mapped to 0b10000000=0x80=128. We use this
# precalculated table to simplify calculating the bitwise reversal of a longer
# integer.
REVERSE_BITS_TABLE = bytearray(map(reverse_byte, range(256)))


def reverse64(v: int) -> int:
    """Reverse a 64-bit integer bitwise.

    The word is treated as eight bytes: each byte is bit-reversed via
    REVERSE_BITS_TABLE and then placed at the mirrored byte position,
    so byte k (counting from the low end) lands at byte 7 - k.
    """
    assert v.bit_length() <= 64
    result = 0
    for shift in range(0, 64, 8):
        result |= REVERSE_BITS_TABLE[(v >> shift) & 0xFF] << (56 - shift)
    return result
# Mask selecting the low 52 bits of an IEEE 754 double: the mantissa field.
MANTISSA_MASK = (1 << 52) - 1
def reverse_bits(x: int, n: int) -> int:
    """Reverse the low ``n`` bits of ``x``; ``x`` must fit in ``n`` bits."""
    assert x.bit_length() <= n <= 64
    # Reverse within a full 64-bit word, then shift down so only the n
    # positions of interest remain.
    x = reverse64(x)
    x >>= 64 - n
    return x
def update_mantissa(unbiased_exponent: int, mantissa: int) -> int:
    """Apply the mantissa transformation from the module docstring: reverse
    whichever low-order bits encode the fractional part of the number, so
    that lexicographically smaller mantissas mean simpler fractions."""
    if unbiased_exponent <= 0:
        # The value is below 2, so all 52 mantissa bits are fractional.
        mantissa = reverse_bits(mantissa, 52)
    elif unbiased_exponent <= 51:
        # Only the low 52 - e bits are fractional; reverse just those and
        # leave the integral high bits untouched.
        n_fractional_bits = 52 - unbiased_exponent
        fractional_part = mantissa & ((1 << n_fractional_bits) - 1)
        mantissa ^= fractional_part
        mantissa |= reverse_bits(fractional_part, n_fractional_bits)
    # For unbiased_exponent >= 52 there is no fractional part: unchanged.
    return mantissa
def lex_to_float(i: int) -> float:
    """Decode a 64-bit lexicographic encoding into a non-negative float.

    The top bit is the tag described in the module docstring: 1 selects
    the exponent/mantissa form, 0 the plain 7-byte integer form.
    """
    assert i.bit_length() <= 64
    has_fractional_part = i >> 63
    if has_fractional_part:
        exponent = (i >> 52) & ((1 << 11) - 1)
        exponent = decode_exponent(exponent)

        mantissa = i & MANTISSA_MASK
        mantissa = update_mantissa(exponent - BIAS, mantissa)

        assert mantissa.bit_length() <= 52

        return int_to_float((exponent << 52) | mantissa)
    else:
        # Tag bit 0: the low 56 bits (7 bytes) are read as an integer and
        # the 7 bits just below the tag are ignored.
        integral_part = i & ((1 << 56) - 1)
        return float(integral_part)
def float_to_lex(f: float) -> int:
    """Encode the non-negative float ``f`` as a 64-bit integer such that
    lexicographically smaller encodings correspond to simpler floats."""
    if is_simple(f):
        assert f >= 0
        # Small integral floats use the tag-0 (plain integer) encoding.
        return int(f)
    return base_float_to_lex(f)
def base_float_to_lex(f: float) -> int:
    """Encode ``f`` using the tag-1 (exponent/mantissa) branch of the
    format; the inverse of the tagged branch of ``lex_to_float``."""
    i = float_to_int(f)
    # Drop the sign bit; this encoding only represents non-negative values.
    i &= (1 << 63) - 1
    exponent = i >> 52
    mantissa = i & MANTISSA_MASK
    mantissa = update_mantissa(exponent - BIAS, mantissa)
    exponent = encode_exponent(exponent)

    assert mantissa.bit_length() <= 52
    return (1 << 63) | (exponent << 52) | mantissa
def is_simple(f: float) -> bool:
    """Check whether ``f`` can be represented by the "simple" tag-0
    encoding: an integral value whose magnitude fits in the 56 bits of
    payload available to the integer branch of the format.

    Returns False for NaN (``int(nan)`` raises ValueError) and for the
    infinities (``int(inf)`` raises OverflowError).
    """
    # Fix: the return annotation previously said ``int`` even though the
    # function only ever produces booleans.
    try:
        i = int(f)
    except (ValueError, OverflowError):
        # nan -> ValueError, +/-inf -> OverflowError; neither is simple.
        return False
    if i != f:
        # f has a fractional part, so it cannot be stored as an integer.
        return False
    return i.bit_length() <= 56

View File

@@ -0,0 +1,399 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
"""A module for miscellaneous useful bits and bobs that don't
obviously belong anywhere else. If you spot a better home for
anything that lives here, please move it."""
import array
import sys
import warnings
from random import Random
from typing import (
Callable,
Dict,
Generic,
Iterable,
Iterator,
List,
Optional,
Sequence,
Tuple,
TypeVar,
Union,
overload,
)
from hypothesis.errors import HypothesisWarning
# array.array typecodes tried in increasing order of element width; "O" is
# not a real typecode but signals "fall back to a plain list of objects"
# (special-cased in array_or_list below).
ARRAY_CODES = ["B", "H", "I", "L", "Q", "O"]
def array_or_list(
    code: str, contents: Iterable[int]
) -> "Union[List[int], array.ArrayType[int]]":
    """Build storage for ``contents``: a plain list for the pseudo-typecode
    ``"O"`` (arbitrary Python objects), otherwise an ``array.array`` of the
    given typecode."""
    return list(contents) if code == "O" else array.array(code, contents)
def replace_all(
    buffer: Sequence[int],
    replacements: Iterable[Tuple[int, int, Sequence[int]]],
) -> bytes:
    """Substitute multiple replacement values into a buffer.

    ``replacements`` is an iterable of ``(start, end, value)`` triples,
    assumed to be non-overlapping and in ascending order: each one replaces
    ``buffer[start:end]`` with ``value``. Returns the patched bytes.
    """
    out = bytearray()
    cursor = 0  # position in ``buffer`` up to which we have copied
    delta = 0  # net change in length from the replacements applied so far
    for start, end, value in replacements:
        out.extend(buffer[cursor:start])
        out.extend(value)
        cursor = end
        delta += len(value) - (end - start)
    out.extend(buffer[cursor:])
    assert len(out) == len(buffer) + delta
    return bytes(out)
# Maps each array typecode to the next wider one, used when a stored value
# overflows the current element size (no entry for "O", the widest form).
NEXT_ARRAY_CODE = dict(zip(ARRAY_CODES, ARRAY_CODES[1:]))
class IntList(Sequence[int]):
    """Class for storing a list of non-negative integers compactly.

    We store them as the smallest size integer array we can get
    away with. When we try to add an integer that is too large,
    we upgrade the array to the smallest word size needed to store
    the new value."""

    __slots__ = ("__underlying",)

    # Either an array.array of some unsigned typecode, or (for values too
    # large for "Q") a plain list of Python ints.
    __underlying: "Union[List[int], array.ArrayType[int]]"

    def __init__(self, values: Sequence[int] = ()):
        # Try the narrowest element type first, widening on OverflowError.
        for code in ARRAY_CODES:
            try:
                underlying = array_or_list(code, values)
                break
            except OverflowError:
                pass
        else:  # pragma: no cover
            raise AssertionError(f"Could not create storage for {values!r}")
        if isinstance(underlying, list):
            # The list fallback ("O") accepts anything, so validate by hand
            # that every value is a non-negative int.
            for v in underlying:
                if not isinstance(v, int) or v < 0:
                    raise ValueError(f"Could not create IntList for {values!r}")
        self.__underlying = underlying

    @classmethod
    def of_length(cls, n: int) -> "IntList":
        """Return an IntList containing ``n`` zeros."""
        return cls(array_or_list("B", [0]) * n)

    def count(self, value: int) -> int:
        return self.__underlying.count(value)

    def __repr__(self):
        return f"IntList({list(self.__underlying)!r})"

    def __len__(self):
        return len(self.__underlying)

    @overload
    def __getitem__(self, i: int) -> int:
        ...  # pragma: no cover

    @overload
    def __getitem__(self, i: slice) -> "IntList":
        ...  # pragma: no cover

    def __getitem__(self, i: Union[int, slice]) -> "Union[int, IntList]":
        if isinstance(i, slice):
            return IntList(self.__underlying[i])
        return self.__underlying[i]

    def __delitem__(self, i: int) -> None:
        del self.__underlying[i]

    def insert(self, i: int, v: int) -> None:
        self.__underlying.insert(i, v)

    def __iter__(self) -> Iterator[int]:
        return iter(self.__underlying)

    def __eq__(self, other: object) -> bool:
        if self is other:
            return True
        if not isinstance(other, IntList):
            return NotImplemented
        return self.__underlying == other.__underlying

    def __ne__(self, other: object) -> bool:
        if self is other:
            return False
        if not isinstance(other, IntList):
            return NotImplemented
        return self.__underlying != other.__underlying

    def append(self, n: int) -> None:
        # Append a zero (always representable), then let __setitem__ take
        # care of any storage upgrade that ``n`` requires.
        i = len(self)
        self.__underlying.append(0)
        self[i] = n

    def __setitem__(self, i: int, n: int) -> None:
        while True:
            try:
                self.__underlying[i] = n
                return
            except OverflowError:
                # ``n`` does not fit the current element width: widen and
                # retry (a negative n would also overflow, hence the assert).
                assert n > 0
                self.__upgrade()

    def extend(self, ls: Iterable[int]) -> None:
        for n in ls:
            self.append(n)

    def __upgrade(self) -> None:
        # Move to the next wider typecode (possibly ending at a plain list).
        assert isinstance(self.__underlying, array.array)
        code = NEXT_ARRAY_CODE[self.__underlying.typecode]
        self.__underlying = array_or_list(code, self.__underlying)
def binary_search(lo: int, hi: int, f: Callable[[int], bool]) -> int:
    """Binary search in [lo, hi) for the n with f(n) == f(lo) but
    f(n + 1) != f(lo).

    It is implicitly assumed, and will not be checked, that f(hi) != f(lo).
    """
    target = f(lo)
    while hi - lo > 1:
        mid = lo + (hi - lo) // 2
        if f(mid) == target:
            lo = mid
        else:
            hi = mid
    return lo
def uniform(random: Random, n: int) -> bytes:
    """Return ``n`` bytes drawn uniformly at random from ``random``."""
    bits = random.getrandbits(8 * n)
    return bits.to_bytes(n, "big")
# Type variable for the generic helpers defined in this module.
T = TypeVar("T")
class LazySequenceCopy:
    """A "copy" of a sequence that records mutations in an overlay dict
    rather than touching the underlying sequence, so it behaves as if you
    could do list(x) in O(1) time. Only the subset of the list API that is
    needed so far is implemented; there is no reason in principle the rest
    could not be added."""

    __mask: Optional[Dict[int, int]]

    def __init__(self, values: Sequence[int]):
        self.__base = values
        self.__size = len(values)
        # Created lazily on first write; maps index -> overridden value.
        self.__mask = None

    def __len__(self) -> int:
        return self.__size

    def pop(self) -> int:
        """Remove and return the last element."""
        if not self.__size:
            raise IndexError("Cannot pop from empty list")
        last = self[-1]
        self.__size -= 1
        # Drop any override for the removed slot so a later append-style
        # reuse of the index cannot see a stale value.
        if self.__mask is not None:
            self.__mask.pop(self.__size, None)
        return last

    def __getitem__(self, i: int) -> int:
        i = self.__check_index(i)
        if self.__mask is not None and i in self.__mask:
            return self.__mask[i]
        return self.__base[i]

    def __setitem__(self, i: int, v: int) -> None:
        i = self.__check_index(i)
        if self.__mask is None:
            self.__mask = {}
        self.__mask[i] = v

    def __check_index(self, i: int) -> int:
        """Validate ``i`` against the current length and normalise negative
        indices to their non-negative equivalent."""
        n = self.__size
        if not -n <= i < n:
            raise IndexError(f"Index {i} out of range [0, {n})")
        return i + n if i < 0 else i
def clamp(lower: int, value: int, upper: int) -> int:
    """Clamp ``value`` into the range [lower, upper].

    The upper cap is applied first and the lower bound second, exactly as
    in ``max(lower, min(value, upper))``, so ``lower`` wins if the bounds
    are inverted.
    """
    capped = min(value, upper)
    return max(lower, capped)
def swap(ls: LazySequenceCopy, i: int, j: int) -> None:
    """Exchange the elements at positions ``i`` and ``j`` of ``ls``."""
    if i != j:
        ls[i], ls[j] = ls[j], ls[i]
def stack_depth_of_caller() -> int:
    """Return the stack depth of this function's caller.

    From https://stackoverflow.com/a/47956089/9297601 , this walks frame
    objects directly, which is a simple but much faster alternative to
    ``len(inspect.stack(0))``. Used together with get/set recursionlimit to
    make stack overflows non-flaky; see
    https://github.com/HypothesisWorks/hypothesis/issues/2494 for details.
    """
    depth = 1
    frame = sys._getframe(2)
    while frame is not None:
        depth += 1
        frame = frame.f_back
    return depth
class ensure_free_stackframes:
    """Context manager that ensures there are at least N free stackframes (for
    a reasonable value of N).
    """

    def __enter__(self):
        # Measure how deep we already are, then raise the interpreter's
        # recursion limit by a fixed margin above that point.
        cur_depth = stack_depth_of_caller()
        self.old_maxdepth = sys.getrecursionlimit()
        # The default CPython recursionlimit is 1000, but pytest seems to bump
        # it to 3000 during test execution. Let's make it something reasonable:
        self.new_maxdepth = cur_depth + 2000
        # Because we add to the recursion limit, to be good citizens we also
        # add a check for unbounded recursion. The default limit is typically
        # 1000/3000, so this can only ever trigger if something really strange
        # is happening and it's hard to imagine an
        # intentionally-deeply-recursive use of this code.
        assert cur_depth <= 1000, (
            "Hypothesis would usually add %d to the stack depth of %d here, "
            "but we are already much deeper than expected. Aborting now, to "
            "avoid extending the stack limit in an infinite loop..."
            % (self.new_maxdepth - self.old_maxdepth, self.old_maxdepth)
        )
        sys.setrecursionlimit(self.new_maxdepth)

    def __exit__(self, *args, **kwargs):
        # Only restore the previous limit if nobody else changed it while we
        # held it raised; otherwise restoring would clobber their change.
        if self.new_maxdepth == sys.getrecursionlimit():
            sys.setrecursionlimit(self.old_maxdepth)
        else:  # pragma: no cover
            warnings.warn(
                "The recursion limit will not be reset, since it was changed "
                "from another thread or during execution of a test.",
                HypothesisWarning,
                stacklevel=2,
            )
def find_integer(f: Callable[[int], bool]) -> int:
    """Find a (hopefully large) integer n with f(n) True and f(n + 1) False.

    f(0) is assumed to be True and is never evaluated.
    """
    # Check the small values one by one first: when the answer is tiny,
    # anything cleverer only wastes calls (e.g. probing 2 before 0 doubles
    # the work if the answer is 0).
    for probe in range(1, 5):
        if not f(probe):
            return probe - 1
    # f(4) holds. Probe upwards by doubling until we overshoot, keeping the
    # invariants: f(lo) is known True, f(hi) is known False once the loop
    # exits.
    lo, hi = 4, 5
    while f(hi):
        lo, hi = hi, hi * 2
    # Binary search the gap down to adjacent values, at which point lo is
    # the answer: f(lo) and not f(lo + 1).
    while hi - lo > 1:
        mid = lo + (hi - lo) // 2
        if f(mid):
            lo = mid
        else:
            hi = mid
    return lo
def pop_random(random: Random, seq: LazySequenceCopy) -> int:
    """Remove and return a uniformly random element of ``seq``.

    Runs in O(1) by swapping the chosen element to the end before popping,
    which leaves the remaining elements in an arbitrary order.
    """
    chosen = random.randrange(0, len(seq))
    last = len(seq) - 1
    swap(seq, chosen, last)
    return seq.pop()
class NotFound(Exception):
    """Raised by ``SelfOrganisingList.find`` when no element satisfies the
    given condition."""

    pass
class SelfOrganisingList(Generic[T]):
    """A collection supporting predicate-based retrieval with the
    move-to-front heuristic.

    Retrieving an item that satisfies an arbitrary predicate requires a
    linear scan, but the cost of that scan depends heavily on where the
    matching item sits. To get lucky as often as possible we reorder on
    every successful lookup, using the simplest heuristic: move the found
    item to the "front" of the scan order. Since we iterate in reverse,
    the front is physically the back of the underlying list.
    """

    def __init__(self, values: Iterable[T] = ()) -> None:
        self.__values = list(values)

    def __repr__(self) -> str:
        return f"SelfOrganisingList({self.__values!r})"

    def add(self, value: T) -> None:
        """Add a value to this list."""
        self.__values.append(value)

    def find(self, condition: Callable[[T], bool]) -> T:
        """Return some value such that ``condition(value)`` is True, moving
        it to the front of the scan order. Raises ``NotFound`` if no value
        satisfies the condition."""
        values = self.__values
        for index in range(len(values) - 1, -1, -1):
            candidate = values[index]
            if condition(candidate):
                del values[index]
                values.append(candidate)
                return candidate
        raise NotFound("No values satisfying condition")

View File

@@ -0,0 +1,168 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
from hypothesis.internal.compat import int_from_bytes, int_to_bytes
from hypothesis.internal.conjecture.data import Status
from hypothesis.internal.conjecture.engine import BUFFER_SIZE
from hypothesis.internal.conjecture.junkdrawer import find_integer
from hypothesis.internal.conjecture.pareto import NO_SCORE
class Optimiser:
    """A fairly basic optimiser designed to increase the value of scores for
    targeted property-based testing.

    This implements a fairly naive hill climbing algorithm based on randomly
    regenerating parts of the test case to attempt to improve the result. It is
    not expected to produce amazing results, because it is designed to be run
    in a fairly small testing budget, so it prioritises finding easy wins and
    bailing out quickly if that doesn't work.

    For more information about targeted property-based testing, see
    Löscher, Andreas, and Konstantinos Sagonas. "Targeted property-based
    testing." Proceedings of the 26th ACM SIGSOFT International Symposium on
    Software Testing and Analysis. ACM, 2017.
    """

    def __init__(self, engine, data, target, max_improvements=100):
        """Optimise ``target`` starting from ``data``. Will stop either when
        we seem to have found a local maximum or when the target score has
        been improved ``max_improvements`` times. This limit is in place to
        deal with the fact that the target score may not be bounded above."""
        self.engine = engine
        self.current_data = data
        self.target = target
        self.max_improvements = max_improvements
        # Count of strict score improvements made so far.
        self.improvements = 0

    def run(self):
        # Entry point: run the hill-climbing loop to completion.
        self.hill_climb()

    def score_function(self, data):
        # Score of ``data`` for our target label; NO_SCORE when this test
        # case recorded no observation for the target.
        return data.target_observations.get(self.target, NO_SCORE)

    @property
    def current_score(self):
        # Score of the best test case found so far.
        return self.score_function(self.current_data)

    def consider_new_test_data(self, data):
        """Consider a new data object as a candidate target. If it is better
        than the current one, return True."""
        if data.status < Status.VALID:
            return False
        score = self.score_function(data)
        if score < self.current_score:
            return False
        if score > self.current_score:
            self.improvements += 1
            self.current_data = data
            return True
        assert score == self.current_score
        # We allow transitions that leave the score unchanged as long as they
        # don't increase the buffer size. This gives us a certain amount of
        # freedom for lateral moves that will take us out of local maxima.
        if len(data.buffer) <= len(self.current_data.buffer):
            self.current_data = data
            return True
        return False

    def hill_climb(self):
        """The main hill climbing loop where we actually do the work: Take
        data, and attempt to improve its score for target. select_example takes
        a data object and returns an index to an example where we should focus
        our efforts."""
        blocks_examined = set()
        prev = None
        i = len(self.current_data.blocks) - 1
        while i >= 0 and self.improvements <= self.max_improvements:
            # Whenever the best test case changes, restart the scan from its
            # last block (its shape may have changed arbitrarily).
            if prev is not self.current_data:
                i = len(self.current_data.blocks) - 1
                prev = self.current_data
            # Each block is only attempted once per shape of the best data;
            # the decrement here is what advances the scan.
            if i in blocks_examined:
                i -= 1
                continue
            blocks_examined.add(i)
            data = self.current_data
            block = data.blocks[i]
            prefix = data.buffer[: block.start]
            existing = data.buffer[block.start : block.end]
            existing_as_int = int_from_bytes(existing)
            max_int_value = (256 ** len(existing)) - 1
            # Already at the maximum representable value: nothing to raise.
            if existing_as_int == max_int_value:
                continue

            def attempt_replace(v):
                """Try replacing the current block in the current best test case
                with an integer of value i. Note that we use the *current*
                best and not the one we started with. This helps ensure that
                if we luck into a good draw when making random choices we get
                to keep the good bits."""
                if v < 0 or v > max_int_value:
                    return False
                v_as_bytes = int_to_bytes(v, len(existing))
                # We make a couple attempts at replacement. This only matters
                # if we end up growing the buffer - otherwise we exit the loop
                # early - but in the event that there *is* some randomized
                # component we want to give it a couple of tries to succeed.
                for _ in range(3):
                    attempt = self.engine.cached_test_function(
                        prefix
                        + v_as_bytes
                        + self.current_data.buffer[block.end :]
                        + bytes(BUFFER_SIZE),
                    )
                    if self.consider_new_test_data(attempt):
                        return True
                    if attempt.status < Status.INVALID or len(attempt.buffer) == len(
                        self.current_data.buffer
                    ):
                        return False
                    # The attempt changed the buffer length: try to transplant
                    # each grown/shrunk example back into the current best.
                    # NOTE: this ``i`` is local to attempt_replace and does not
                    # touch the outer loop index.
                    for i, ex in enumerate(self.current_data.examples):
                        if ex.start >= block.end:
                            break
                        if ex.end <= block.start:
                            continue
                        ex_attempt = attempt.examples[i]
                        if ex.length == ex_attempt.length:
                            continue
                        replacement = attempt.buffer[ex_attempt.start : ex_attempt.end]
                        if self.consider_new_test_data(
                            self.engine.cached_test_function(
                                prefix
                                + replacement
                                + self.current_data.buffer[ex.end :]
                            )
                        ):
                            return True
                return False

            # We unconditionally scan both upwards and downwards. The reason
            # for this is that we allow "lateral" moves that don't increase the
            # score but instead leave it constant. All else being equal we'd
            # like to leave the test case closer to shrunk, so afterwards we
            # try lowering the value towards zero even if we've just raised it.
            if not attempt_replace(max_int_value):
                find_integer(lambda k: attempt_replace(k + existing_as_int))
            # Re-read the block: the upward pass may have changed the best data.
            existing = self.current_data.buffer[block.start : block.end]
            existing_as_int = int_from_bytes(existing)
            if not attempt_replace(0):
                find_integer(lambda k: attempt_replace(existing_as_int - k))

View File

@@ -0,0 +1,339 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
from enum import Enum
from sortedcontainers import SortedList
from hypothesis.internal.conjecture.data import ConjectureData, ConjectureResult, Status
from hypothesis.internal.conjecture.junkdrawer import LazySequenceCopy, swap
from hypothesis.internal.conjecture.shrinker import sort_key
NO_SCORE = float("-inf")
class DominanceRelation(Enum):
    """Outcome of comparing two test cases for pareto dominance."""

    NO_DOMINANCE = 0  # incomparable: neither is at least as good in every way
    EQUAL = 1  # identical buffers
    LEFT_DOMINATES = 2
    RIGHT_DOMINATES = 3
def dominance(left, right):
    """Returns the dominance relation between ``left`` and ``right``, according
    to the rules that one ConjectureResult dominates another if and only if it
    is better in every way.

    The things we currently consider to be "better" are:

    * Something that is smaller in shrinking order is better.
    * Something that has higher status is better.
    * Each ``interesting_origin`` is treated as its own score, so if two
      interesting examples have different origins then neither dominates
      the other.
    * For each target observation, a higher score is better.

    In "normal" operation where there are no bugs or target observations, the
    pareto front only has one element (the smallest valid test case), but for
    more structured or failing tests it can be useful to track, and future work
    will depend on it more."""
    if left.buffer == right.buffer:
        return DominanceRelation.EQUAL
    # Canonicalise so that the rest of the function can assume left is the
    # shortlex-smaller of the two: swap the arguments, recurse once, and map
    # the answer back.
    if sort_key(right.buffer) < sort_key(left.buffer):
        result = dominance(left=right, right=left)
        if result == DominanceRelation.LEFT_DOMINATES:
            return DominanceRelation.RIGHT_DOMINATES
        else:
            # Because we have sort_key(left) < sort_key(right) the only options
            # are that right is better than left or that the two are
            # incomparable.
            assert result == DominanceRelation.NO_DOMINANCE
            return result
    # Either left is better or there is no dominance relationship.
    assert sort_key(left.buffer) < sort_key(right.buffer)
    # The right is more interesting
    if left.status < right.status:
        return DominanceRelation.NO_DOMINANCE
    # Right covers a tag that left doesn't, so left isn't better everywhere.
    if not right.tags.issubset(left.tags):
        return DominanceRelation.NO_DOMINANCE
    # Things that are interesting for different reasons are incomparable in
    # the dominance relationship.
    if (
        left.status == Status.INTERESTING
        and left.interesting_origin != right.interesting_origin
    ):
        return DominanceRelation.NO_DOMINANCE
    # Left must score at least as high as right on every observed target;
    # a missing observation counts as NO_SCORE (-inf).
    for target in set(left.target_observations) | set(right.target_observations):
        left_score = left.target_observations.get(target, NO_SCORE)
        right_score = right.target_observations.get(target, NO_SCORE)
        if right_score > left_score:
            return DominanceRelation.NO_DOMINANCE
    return DominanceRelation.LEFT_DOMINATES
class ParetoFront:
    """Maintains an approximate pareto front of ConjectureData objects. That
    is, we try to maintain a collection of objects such that no element of the
    collection is pareto dominated by any other. In practice we don't quite
    manage that, because doing so is computationally very expensive. Instead
    we maintain a random sample of data objects that are "rarely" dominated by
    any other element of the collection (roughly, no more than about 10%).

    Only valid test cases are considered to belong to the pareto front - any
    test case with a status less than valid is discarded.

    Note that the pareto front is potentially quite large, and currently this
    will store the entire front in memory. This is bounded by the number of
    valid examples we run, which is max_examples in normal execution, and
    currently we do not support workflows with large values of max_examples
    very well anyway, so this isn't a major issue. In future we may wish to
    implement some sort of paging out to disk so that we can work with larger
    fronts.

    Additionally, because this is only an approximate pareto front, there are
    scenarios where it can be much larger than the actual pareto front. There
    isn't a huge amount we can do about this - checking an exact pareto front
    is intrinsically quadratic.

    "Most" of the time we should be relatively close to the true pareto front,
    say within an order of magnitude, but it's not hard to construct scenarios
    where this is not the case. e.g. suppose we enumerate all valid test cases
    in increasing shortlex order as s_1, ..., s_n, ... and have scores f and
    g such that f(s_i) = min(i, N) and g(s_i) = 1 if i >= N, then the pareto
    front is the set {s_1, ..., s_N}, but the only element of the front that
    will dominate s_i when i > N is s_N, which we select with probability
    1 / N. A better data structure could solve this, but at the cost of more
    expensive operations and higher per element memory use, so we'll wait to
    see how much of a problem this is in practice before we try that.
    """

    def __init__(self, random):
        self.__random = random
        # Callbacks invoked with each result evicted because it was dominated.
        self.__eviction_listeners = []
        # The front itself, kept sorted in shortlex order of buffers.
        self.front = SortedList(key=lambda d: sort_key(d.buffer))
        # The result currently being added (if any); evicting it mid-add must
        # not fire eviction listeners.
        self.__pending = None

    def add(self, data):
        """Attempts to add ``data`` to the pareto front. Returns True if
        ``data`` is now in the front, including if data is already in the
        collection, and False otherwise"""
        data = data.as_result()
        if data.status < Status.VALID:
            return False
        if not self.front:
            self.front.add(data)
            return True
        if data in self.front:
            return True
        # We add data to the pareto front by adding it unconditionally and then
        # doing a certain amount of randomized "clear down" - testing a random
        # set of elements (currently 10) to see if they are dominated by
        # something else in the collection. If they are, we remove them.
        self.front.add(data)
        assert self.__pending is None
        try:
            self.__pending = data
            # We maintain a set of the current exact pareto front of the
            # values we've sampled so far. When we sample a new element we
            # either add it to this exact pareto front or remove it from the
            # collection entirely.
            front = LazySequenceCopy(self.front)
            # We track which values we are going to remove and remove them all
            # at the end so the shape of the front doesn't change while we're
            # using it.
            to_remove = []
            # We now iteratively sample elements from the approximate pareto
            # front to check whether they should be retained. When the set of
            # dominators gets too large we have sampled at least 10 elements
            # and it gets too expensive to continue, so we consider that enough
            # due diligence.
            i = self.front.index(data)
            # First we attempt to look for values that must be removed by the
            # addition of the data. These are necessarily to the right of it
            # in the list.
            failures = 0
            while i + 1 < len(front) and failures < 10:
                j = self.__random.randrange(i + 1, len(front))
                swap(front, j, len(front) - 1)
                candidate = front.pop()
                dom = dominance(data, candidate)
                assert dom != DominanceRelation.RIGHT_DOMINATES
                if dom == DominanceRelation.LEFT_DOMINATES:
                    to_remove.append(candidate)
                    failures = 0
                else:
                    failures += 1
            # Now we look at the points up to where we put data in to see if
            # it is dominated. While we're here we spend some time looking for
            # anything else that might be dominated too, compacting down parts
            # of the list.
            dominators = [data]
            while i >= 0 and len(dominators) < 10:
                swap(front, i, self.__random.randint(0, i))
                candidate = front[i]
                already_replaced = False
                j = 0
                while j < len(dominators):
                    v = dominators[j]
                    dom = dominance(candidate, v)
                    if dom == DominanceRelation.LEFT_DOMINATES:
                        # candidate beats v: replace v in place the first
                        # time, then drop any further dominated entries.
                        if not already_replaced:
                            already_replaced = True
                            dominators[j] = candidate
                            j += 1
                        else:
                            dominators[j], dominators[-1] = (
                                dominators[-1],
                                dominators[j],
                            )
                            dominators.pop()
                            to_remove.append(v)
                    elif dom == DominanceRelation.RIGHT_DOMINATES:
                        to_remove.append(candidate)
                        break
                    elif dom == DominanceRelation.EQUAL:
                        break
                    else:
                        j += 1
                else:
                    # The while loop fell through without a break, so the
                    # candidate is incomparable to every dominator: keep it.
                    dominators.append(candidate)
                i -= 1
            for v in to_remove:
                self.__remove(v)
            return data in self.front
        finally:
            self.__pending = None

    def on_evict(self, f):
        """Register a listener function that will be called with data when it
        gets removed from the front because something else dominates it."""
        self.__eviction_listeners.append(f)

    def __contains__(self, data):
        return isinstance(data, (ConjectureData, ConjectureResult)) and (
            data.as_result() in self.front
        )

    def __iter__(self):
        return iter(self.front)

    def __getitem__(self, i):
        return self.front[i]

    def __len__(self):
        return len(self.front)

    def __remove(self, data):
        # Remove ``data`` from the front (a no-op if it's already gone) and
        # notify eviction listeners - unless it's the value currently being
        # added, whose mid-add eviction is not an interesting event.
        try:
            self.front.remove(data)
        except ValueError:
            return
        if data is not self.__pending:
            for f in self.__eviction_listeners:
                f(data)
class ParetoOptimiser:
    """Class for managing optimisation of the pareto front. That is, given the
    current best known pareto front, this class runs an optimisation process
    that attempts to bring it closer to the actual pareto front.

    Currently this is fairly basic and only handles pareto optimisation that
    works by reducing the test case in the shortlex order. We expect it will
    grow more powerful over time.
    """

    def __init__(self, engine):
        self.__engine = engine
        # The engine's shared ParetoFront instance.
        self.front = self.__engine.pareto_front

    def run(self):
        # Buffers we have already shrunk to, so we don't redo work.
        seen = set()
        # We iterate backwards through the pareto front, using the shrinker to
        # (hopefully) replace each example with a smaller one. Note that it's
        # important that we start from the end for two reasons: Firstly, by
        # doing it this way we ensure that any new front members we discover
        # during optimisation will also get optimised (because they will be
        # inserted into the part of the front that we haven't visited yet),
        # and secondly we generally expect that we will not finish this process
        # in a single run, because it's relatively expensive in terms of our
        # example budget, and by starting from the end we ensure that each time
        # we run the tests we improve the pareto front because we work on the
        # bits that we haven't covered yet.
        i = len(self.front) - 1
        prev = None
        while i >= 0 and not self.__engine.interesting_examples:
            assert self.front
            # The front may have shrunk since the last iteration; re-clamp.
            i = min(i, len(self.front) - 1)
            target = self.front[i]
            if target.buffer in seen:
                i -= 1
                continue
            assert target is not prev
            prev = target

            def allow_transition(source, destination):
                """Shrink to data that strictly pareto dominates the current
                best value we've seen, which is the current target of the
                shrinker.

                Note that during shrinking we may discover other smaller
                examples that this function will reject and will get added to
                the front. This is fine, because they will be processed on
                later iterations of this loop."""
                if dominance(destination, source) == DominanceRelation.LEFT_DOMINATES:
                    # If ``destination`` dominates ``source`` then ``source``
                    # must be dominated in the front - either ``destination`` is in
                    # the front, or it was not added to it because it was
                    # dominated by something in it.,
                    try:
                        self.front.front.remove(source)
                    except ValueError:
                        pass
                    return True
                return False

            shrunk = self.__engine.shrink(target, allow_transition=allow_transition)
            seen.add(shrunk.buffer)
            # Note that the front may have changed shape arbitrarily when
            # we ran the shrinker. If it didn't change shape then this is
            # i - 1. If it did change shape then this is the largest value
            # in the front which is smaller than the previous target, so
            # is the correct place to resume from. In particular note that the
            # size of the front might have grown because of slippage during the
            # shrink, but all of the newly introduced elements will be smaller
            # than `target`, so will be covered by this iteration.
            i = self.front.front.bisect_left(target)

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,16 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
from hypothesis.internal.conjecture.shrinking.floats import Float
from hypothesis.internal.conjecture.shrinking.integer import Integer
from hypothesis.internal.conjecture.shrinking.lexical import Lexical
from hypothesis.internal.conjecture.shrinking.ordering import Ordering
__all__ = ["Lexical", "Integer", "Ordering", "Float"]

View File

@@ -0,0 +1,175 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
"""This module implements various useful common functions for shrinking tasks."""
class Shrinker:
    """A Shrinker object manages a single value and a predicate it should
    satisfy, and attempts to improve it in some direction, making it smaller
    and simpler."""

    def __init__(
        self,
        initial,
        predicate,
        random,
        *,
        full=False,
        debug=False,
        name=None,
        **kwargs,
    ):
        # Extra keyword arguments are handed to the subclass's setup() hook.
        self.setup(**kwargs)
        self.current = self.make_immutable(initial)
        self.initial = self.current
        self.random = random
        # When True, run() keeps iterating until a full pass makes no change.
        self.full = full
        # Number of successful shrinks so far.
        self.changes = 0
        self.name = name
        self.__predicate = predicate
        # Every candidate ever evaluated; doubles as the call counter.
        self.__seen = set()
        self.debugging_enabled = debug

    @property
    def calls(self):
        # Number of distinct candidates evaluated so far.
        return len(self.__seen)

    def __repr__(self):
        return "{}({}initial={!r}, current={!r})".format(
            type(self).__name__,
            "" if self.name is None else f"{self.name!r}, ",
            self.initial,
            self.current,
        )

    def setup(self, **kwargs):
        """Runs initial setup code.

        Convenience function for children that doesn't require messing
        with the signature of init.
        """

    def delegate(self, other_class, convert_to, convert_from, **kwargs):
        """Delegates shrinking to another shrinker class, by converting the
        current value to and from it with provided functions."""
        self.call_shrinker(
            other_class,
            convert_to(self.current),
            lambda v: self.consider(convert_from(v)),
            **kwargs,
        )

    def call_shrinker(self, other_class, initial, predicate, **kwargs):
        """Calls another shrinker class, passing through the relevant context
        variables.

        Note we explicitly do not pass through full.
        """
        return other_class.shrink(initial, predicate, random=self.random, **kwargs)

    def debug(self, *args):
        # Print a diagnostic line when debugging was requested at construction.
        if self.debugging_enabled:
            print("DEBUG", self, *args)

    @classmethod
    def shrink(cls, initial, predicate, **kwargs):
        """Shrink the value ``initial`` subject to the constraint that it
        satisfies ``predicate``.

        Returns the shrunk value.
        """
        shrinker = cls(initial, predicate, **kwargs)
        shrinker.run()
        return shrinker.current

    def run(self):
        """Run for an appropriate number of steps to improve the current value.

        If self.full is True, will run until no further improvements can
        be found.
        """
        if self.short_circuit():
            return
        if self.full:
            # Repeat passes until one completes with no new changes.
            prev = -1
            while self.changes != prev:
                prev = self.changes
                self.run_step()
        else:
            self.run_step()
        self.debug("COMPLETE")

    def incorporate(self, value):
        """Try using ``value`` as a possible candidate improvement.

        Return True if it works.
        """
        value = self.make_immutable(value)
        self.check_invariants(value)
        if not self.left_is_better(value, self.current):
            # ``value == value`` excludes NaN-like values that compare
            # unequal to themselves from the debug log.
            if value != self.current and (value == value):
                self.debug(f"Rejected {value!r} as worse than {self.current=}")
            return False
        if value in self.__seen:
            return False
        self.__seen.add(value)
        if self.__predicate(value):
            self.debug(f"shrinking to {value!r}")
            self.changes += 1
            self.current = value
            return True
        return False

    def consider(self, value):
        """Returns True if make_immutable(value) == self.current after calling
        self.incorporate(value)."""
        value = self.make_immutable(value)
        if value == self.current:
            return True
        return self.incorporate(value)

    def make_immutable(self, value):
        """Convert value into an immutable (and hashable) representation of
        itself.

        It is these immutable versions that the shrinker will work on.

        Defaults to just returning the value.
        """
        return value

    def check_invariants(self, value):
        """Make appropriate assertions about the value to ensure that it is
        valid for this shrinker.

        Must be implemented by subclasses.
        """
        raise NotImplementedError

    def short_circuit(self):
        """Possibly attempt to do some shrinking.

        If this returns True, the ``run`` method will terminate early
        without doing any more work.
        """
        raise NotImplementedError

    def left_is_better(self, left, right):
        """Returns True if the left is strictly simpler than the right
        according to the standards of this shrinker."""
        raise NotImplementedError

    def run_step(self):
        """Run a single step of the main shrink loop, attempting to improve the
        current value."""
        raise NotImplementedError

View File

@@ -0,0 +1,338 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
"""
This is a module for learning new DFAs that help normalize test
functions. That is, given a test function that sometimes shrinks
to one thing and sometimes another, this module is designed to
help learn new DFA-based shrink passes that will cause it to
always shrink to the same thing.
"""
import hashlib
import math
from itertools import islice
from pathlib import Path
from hypothesis import HealthCheck, settings
from hypothesis.errors import HypothesisException
from hypothesis.internal.conjecture.data import ConjectureResult, Status
from hypothesis.internal.conjecture.dfa.lstar import LStar
from hypothesis.internal.conjecture.shrinking.learned_dfas import (
SHRINKING_DFAS,
__file__ as _learned_dfa_file,
)
learned_dfa_file = Path(_learned_dfa_file)
class FailedToNormalise(HypothesisException):
    """Raised when this module fails to normalise a test function (i.e. to
    make it always shrink to the same result)."""

    pass
def update_learned_dfas():
    """Write any modifications to the SHRINKING_DFAS dictionary
    back to the learned DFAs file."""
    source = learned_dfa_file.read_text(encoding="utf-8")
    lines = source.splitlines()
    # Everything after the marker is regenerated from scratch.
    marker = lines.index("# AUTOGENERATED BEGINS")
    del lines[marker + 1 :]
    lines.extend(["", "# fmt: off", ""])
    for name, dfa in sorted(SHRINKING_DFAS.items()):
        lines.append(f"SHRINKING_DFAS[{name!r}] = {dfa!r} # noqa: E501")
    lines.extend(["", "# fmt: on"])
    new_source = "\n".join(lines) + "\n"
    # Avoid touching the file (and its mtime) when nothing changed.
    if new_source != source:
        learned_dfa_file.write_text(new_source, encoding="utf-8")
def learn_a_new_dfa(runner, u, v, predicate):
    """Given two buffers ``u`` and ``v``, learn a DFA that will
    allow the shrinker to normalise them better. ``u`` and ``v``
    should not currently shrink to the same test case when calling
    this function.

    Returns the canonicalised learned DFA, after verifying that a
    shrink pass built from it lets the shrinker improve on ``v``.
    """
    # Imported here to avoid a circular import at module load time.
    from hypothesis.internal.conjecture.shrinker import dfa_replacement, sort_key

    assert predicate(runner.cached_test_function(u))
    assert predicate(runner.cached_test_function(v))

    u_shrunk = fully_shrink(runner, u, predicate)
    v_shrunk = fully_shrink(runner, v, predicate)

    # Normalise naming so that ``u`` is the shortlex-smaller of the two.
    u, v = sorted((u_shrunk.buffer, v_shrunk.buffer), key=sort_key)

    assert u != v

    assert not v.startswith(u)

    # We would like to avoid using LStar on large strings as its
    # behaviour can be quadratic or worse. In order to help achieve
    # this we peel off a common prefix and suffix of the two final
    # results and just learn the internal bit where they differ.
    #
    # This potentially reduces the length quite far if there's
    # just one tricky bit of control flow we're struggling to
    # reduce inside a strategy somewhere and the rest of the
    # test function reduces fine.
    if v.endswith(u):
        # Degenerate case: all of ``u`` is a suffix of ``v``, so the
        # differing "core" of ``u`` is empty.
        prefix = b""
        suffix = u
        u_core = b""
        assert len(u) > 0
        v_core = v[: -len(u)]
    else:
        # Find the longest common prefix...
        i = 0
        while u[i] == v[i]:
            i += 1
        prefix = u[:i]
        assert u.startswith(prefix)
        assert v.startswith(prefix)

        # ...and the longest common suffix that does not overlap the prefix.
        i = 1
        while u[-i] == v[-i]:
            i += 1
        suffix = u[max(len(prefix), len(u) + 1 - i) :]
        assert u.endswith(suffix)
        assert v.endswith(suffix)

        u_core = u[len(prefix) : len(u) - len(suffix)]
        v_core = v[len(prefix) : len(v) - len(suffix)]

    assert u == prefix + u_core + suffix, (list(u), list(v))
    assert v == prefix + v_core + suffix, (list(u), list(v))

    better = runner.cached_test_function(u)
    worse = runner.cached_test_function(v)

    allow_discards = worse.has_discards or better.has_discards

    def is_valid_core(s):
        # Membership predicate for the language we want L* to learn:
        # cores which, spliced between prefix and suffix, still satisfy
        # the shrinking predicate cleanly.
        if not (len(u_core) <= len(s) <= len(v_core)):
            return False
        buf = prefix + s + suffix
        result = runner.cached_test_function(buf)
        return (
            predicate(result)
            # Because we're often using this to learn strategies
            # rather than entire complex test functions, it's
            # important that our replacements are precise and
            # don't leave the rest of the test case in a weird
            # state.
            and result.buffer == buf
            # Because the shrinker is good at removing discarded
            # data, unless we need discards to allow one or both
            # of u and v to result in valid shrinks, we don't
            # count attempts that have them as valid. This will
            # cause us to match fewer strings, which will make
            # the resulting shrink pass more efficient when run
            # on test functions it wasn't really intended for.
            and (allow_discards or not result.has_discards)
        )

    assert sort_key(u_core) < sort_key(v_core)

    assert is_valid_core(u_core)
    assert is_valid_core(v_core)

    learner = LStar(is_valid_core)

    prev = -1
    # Iterate until the learner stops updating (its generation counter is
    # bumped every time it refines its hypothesis DFA).
    while learner.generation != prev:
        prev = learner.generation

        learner.learn(u_core)
        learner.learn(v_core)

        # L* has a tendency to learn DFAs which wrap around to
        # the beginning. We don't want to it to do that unless
        # it's accurate, so we use these as examples to show
        # check going around the DFA twice.
        learner.learn(u_core * 2)
        learner.learn(v_core * 2)

        if learner.dfa.max_length(learner.dfa.start) > len(v_core):
            # The language we learn is finite and bounded above
            # by the length of v_core. This is important in order
            # to keep our shrink passes reasonably efficient -
            # otherwise they can match far too much. So whenever
            # we learn a DFA that could match a string longer
            # than len(v_core) we fix it by finding the first
            # string longer than v_core and learning that as
            # a correction.
            x = next(learner.dfa.all_matching_strings(min_length=len(v_core) + 1))
            assert not is_valid_core(x)
            learner.learn(x)
            assert not learner.dfa.matches(x)
            assert learner.generation != prev
        else:
            # We mostly care about getting the right answer on the
            # minimal test case, but because we're doing this offline
            # anyway we might as well spend a little more time trying
            # small examples to make sure the learner gets them right.
            for x in islice(learner.dfa.all_matching_strings(), 100):
                if not is_valid_core(x):
                    learner.learn(x)
                    assert learner.generation != prev
                    break

    # We've now successfully learned a DFA that works for shrinking
    # our failed normalisation further. Canonicalise it into a concrete
    # DFA so we can save it for later.
    new_dfa = learner.dfa.canonicalise()

    assert math.isfinite(new_dfa.max_length(new_dfa.start))

    # Sanity-check that a shrink pass based on this DFA actually improves
    # on ``v`` before handing it back to the caller.
    shrinker = runner.new_shrinker(runner.cached_test_function(v), predicate)

    assert (len(prefix), len(v) - len(suffix)) in shrinker.matching_regions(new_dfa)

    name = "tmp-dfa-" + repr(new_dfa)

    shrinker.extra_dfas[name] = new_dfa

    shrinker.fixate_shrink_passes([dfa_replacement(name)])

    assert sort_key(shrinker.buffer) < sort_key(v)

    return new_dfa
def fully_shrink(runner, test_case, predicate):
    """Shrink ``test_case`` under ``predicate`` until a fixed point.

    ``test_case`` may be a raw buffer, in which case it is first run to
    obtain a result. Returns the final, fully-shrunk result.
    """
    if not isinstance(test_case, ConjectureResult):
        test_case = runner.cached_test_function(test_case)
    current = test_case
    while True:
        improved = runner.shrink(current, predicate)
        if improved.buffer == current.buffer:
            # No further progress: we have reached a fixed point.
            return current
        current = improved
def normalize(
    base_name,
    test_function,
    *,
    required_successes=100,
    allowed_to_update=False,
    max_dfas=10,
    random=None,
):
    """Attempt to ensure that this test function successfully normalizes - i.e.
    whenever it declares a test case to be interesting, we are able
    to shrink that to the same interesting test case (which logically should
    be the shortlex minimal interesting test case, though we may not be able
    to detect if it is).

    Will run until we have seen ``required_successes`` many interesting test
    cases in a row normalize to the same value.

    If ``allowed_to_update`` is True, whenever we fail to normalize we will
    learn a new DFA-based shrink pass that allows us to make progress. Any
    learned DFAs will be written back into the learned DFA file at the end
    of this function. If ``allowed_to_update`` is False, this will raise an
    error as soon as it encounters a failure to normalize.

    Additionally, if more than ``max_dfas`` DFAs are required to normalize
    this test function, this function will raise an error - it's essentially
    designed for small patches that other shrink passes don't cover, and
    if it's learning too many patches then you need a better shrink pass
    than this can provide.
    """
    # Need import inside the function to avoid circular imports
    from hypothesis.internal.conjecture.engine import BUFFER_SIZE, ConjectureRunner

    runner = ConjectureRunner(
        test_function,
        settings=settings(database=None, suppress_health_check=list(HealthCheck)),
        ignore_limits=True,
        random=random,
    )
    # Interesting origins we have already shrunk at least once.
    seen = set()

    dfas_added = 0

    found_interesting = False
    consecutive_successes = 0
    failures_to_find_interesting = 0
    while consecutive_successes < required_successes:
        # Generate a fresh random test case and check whether it is interesting.
        attempt = runner.cached_test_function(b"", extend=BUFFER_SIZE)
        if attempt.status < Status.INTERESTING:
            failures_to_find_interesting += 1
            assert (
                found_interesting or failures_to_find_interesting <= 1000
            ), "Test function seems to have no interesting test cases"
            continue

        found_interesting = True

        target = attempt.interesting_origin

        # NOTE: rebound each iteration; only used within the same iteration,
        # so the closure over ``target`` is safe.
        def shrinking_predicate(d):
            return d.status == Status.INTERESTING and d.interesting_origin == target

        if target not in seen:
            # First time we've seen this origin: establish a baseline shrink
            # and move on without counting it as a success or failure.
            seen.add(target)
            runner.shrink(attempt, shrinking_predicate)
            continue

        previous = fully_shrink(
            runner, runner.interesting_examples[target], shrinking_predicate
        )
        current = fully_shrink(runner, attempt, shrinking_predicate)

        if current.buffer == previous.buffer:
            consecutive_successes += 1
            continue

        # Normalisation failed: the two runs shrank to different buffers.
        consecutive_successes = 0

        if not allowed_to_update:
            raise FailedToNormalise(
                f"Shrinker failed to normalize {previous.buffer!r} to "
                f"{current.buffer!r} and we are not allowed to learn new DFAs."
            )

        if dfas_added >= max_dfas:
            raise FailedToNormalise(
                f"Test function is too hard to learn: Added {dfas_added} "
                "DFAs and still not done."
            )

        dfas_added += 1

        new_dfa = learn_a_new_dfa(
            runner, previous.buffer, current.buffer, shrinking_predicate
        )

        name = base_name + "-" + hashlib.sha256(repr(new_dfa).encode()).hexdigest()[:10]

        # If there is a name collision this DFA should already be being
        # used for shrinking, so we should have already been able to shrink
        # v further.
        assert name not in SHRINKING_DFAS

        SHRINKING_DFAS[name] = new_dfa

    if dfas_added > 0:
        # We've learned one or more DFAs in the course of normalising, so now
        # we update the file to record those for posterity.
        update_learned_dfas()

View File

@@ -0,0 +1,90 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
import math
import sys
from hypothesis.internal.conjecture.floats import float_to_lex
from hypothesis.internal.conjecture.shrinking.common import Shrinker
from hypothesis.internal.conjecture.shrinking.integer import Integer
# Above 2 ** 53 a float can no longer represent every integer exactly, so
# past this bound we avoid shrinking floats via their integer form.
MAX_PRECISE_INTEGER = 2**53
class Float(Shrinker):
    """Shrinker for non-negative floats, minimising their lexical encoding."""

    def setup(self):
        self.NAN = math.nan
        self.debugging_enabled = True

    def make_immutable(self, f):
        f = float(f)
        if math.isnan(f):
            # Always use the same NAN so it works properly in self.seen
            f = self.NAN
        return f

    def check_invariants(self, value):
        # We only handle positive floats because we encode the sign separately
        # anyway. (Written as ``not (value < 0)`` so that NaN also passes.)
        assert not (value < 0)

    def left_is_better(self, left, right):
        # Order floats by their lexical byte encoding, not their numeric value.
        return float_to_lex(left) < float_to_lex(right)

    def short_circuit(self):
        # We check for a bunch of standard "large" floats. If we're currently
        # worse than them and the shrink downwards doesn't help, abort early
        # because there's not much useful we can do here.
        for candidate in (sys.float_info.max, math.inf, math.nan):
            self.consider(candidate)

        # If we're stuck at a nasty float don't try to shrink it further.
        if not math.isfinite(self.current):
            return True

        # If its too large to represent as an integer, bail out here. It's
        # better to try shrinking it in the main representation.
        return self.current >= MAX_PRECISE_INTEGER

    def run_step(self):
        # Each attempt below is a small numeric change that corresponds to a
        # large change in the lexical representation, letting the shrink move
        # past obstacles like integer boundaries and exponent shifts.

        # First, try dropping precision bits by rounding the scaled value,
        # ordered from least precise (integer) to more precise. A successful
        # self.consider short-circuits later duplicates cheaply; the loop
        # bound caps how many fractional bits we attempt to keep.
        for precision in range(10):
            factor = 2**precision
            scaled = self.current * factor  # note: self.current may change in loop
            self.consider(math.floor(scaled) / factor)
            self.consider(math.ceil(scaled) / factor)

        if self.consider(int(self.current)):
            self.debug("Just an integer now")
            self.delegate(Integer, convert_to=int, convert_from=float)
            return

        # Now try to minimize the top part of the fraction as an integer. This
        # basically splits the float as k + x with 0 <= x < 1 and minimizes
        # k as an integer, but without the precision issues that would have.
        numerator, denominator = self.current.as_integer_ratio()
        whole, remainder = divmod(numerator, denominator)
        self.call_shrinker(
            Integer,
            whole,
            lambda k: self.consider((k * denominator + remainder) / denominator),
        )

View File

@@ -0,0 +1,75 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
from hypothesis.internal.conjecture.junkdrawer import find_integer
from hypothesis.internal.conjecture.shrinking.common import Shrinker
"""
This module implements a shrinker for non-negative integers.
"""
class Integer(Shrinker):
    """Attempts to find a smaller non-negative integer.

    Guaranteed to try ``0``, ``1``, ``initial - 1`` and ``initial - 2``;
    everything beyond those is an optimisation.
    """

    def short_circuit(self):
        # Trying 0 and 1 up front satisfies the cheap guarantees immediately.
        if self.consider(0) or self.consider(1):
            return True
        self.mask_high_bits()
        if self.size > 8:
            # see if we can squeeze the integer into a single byte.
            self.consider(self.current >> (self.size - 8))
            self.consider(self.current & 0xFF)
        return self.current == 2

    def check_invariants(self, value):
        assert value >= 0

    def left_is_better(self, left, right):
        return left < right

    def run_step(self):
        self.shift_right()
        self.shrink_by_multiples(2)
        self.shrink_by_multiples(1)

    def shift_right(self):
        # Adaptively find the largest acceptable right-shift of the value.
        base = self.current

        def shifted_is_good(k):
            return k <= self.size and self.consider(base >> k)

        find_integer(shifted_is_good)

    def mask_high_bits(self):
        # Adaptively find how many of the top bits can be cleared at once.
        base = self.current
        width = base.bit_length()

        def without_top_bits(k):
            if k >= width:
                return False
            return self.consider(base & ((1 << (width - k)) - 1))

        find_integer(without_top_bits)

    @property
    def size(self):
        # Number of significant bits in the current value.
        return self.current.bit_length()

    def shrink_by_multiples(self, k):
        # Adaptively subtract as many multiples of ``k`` as possible.
        # Returns True if any subtraction succeeded.
        base = self.current

        def subtracted_is_good(n):
            candidate = base - n * k
            return candidate >= 0 and self.consider(candidate)

        return find_integer(subtracted_is_good) > 0

View File

@@ -0,0 +1,32 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
from hypothesis.internal.conjecture.dfa import ConcreteDFA
SHRINKING_DFAS = {}
# Note: Everything below the following line is auto generated.
# Any code added after this point will be deleted by an automated
# process. Don't write code below this point.
#
# AUTOGENERATED BEGINS
# fmt: off
SHRINKING_DFAS['datetimes()-d66625c3b7'] = ConcreteDFA([[(0, 1), (1, 255, 2)], [(0, 3), (1, 255, 4)], [(0, 255, 4)], [(0, 5), (1, 255, 6)], [(0, 255, 6)], [(5, 255, 7)], [(0, 255, 7)], []], {7}) # noqa: E501
SHRINKING_DFAS['emails()-fde8f71142'] = ConcreteDFA([[(0, 1), (1, 255, 2)], [(0, 255, 2)], []], {2}) # noqa: E501
SHRINKING_DFAS['floats()-58ab5aefc9'] = ConcreteDFA([[(1, 1), (2, 255, 2)], [(1, 3)], [(0, 1, 3)], []], {3}) # noqa: E501
SHRINKING_DFAS['floats()-6b86629f89'] = ConcreteDFA([[(3, 1), (4, 255, 2)], [(1, 3)], [(0, 1, 3)], []], {3}) # noqa: E501
SHRINKING_DFAS['floats()-aa8aef1e72'] = ConcreteDFA([[(2, 1), (3, 255, 2)], [(1, 3)], [(0, 1, 3)], []], {3}) # noqa: E501
SHRINKING_DFAS['floats()-bf71ffe70f'] = ConcreteDFA([[(4, 1), (5, 255, 2)], [(1, 3)], [(0, 1, 3)], []], {3}) # noqa: E501
SHRINKING_DFAS['text()-05c917b389'] = ConcreteDFA([[(0, 1), (1, 8, 2)], [(9, 255, 3)], [(0, 255, 4)], [], [(0, 255, 5)], [(0, 255, 3)]], {3}) # noqa: E501
SHRINKING_DFAS['text()-807e5f9650'] = ConcreteDFA([[(0, 8, 1), (9, 255, 2)], [(1, 8, 3)], [(1, 8, 3)], [(0, 4)], [(0, 255, 5)], []], {2, 5}) # noqa: E501
# fmt: on

View File

@@ -0,0 +1,59 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
from hypothesis.internal.compat import int_from_bytes, int_to_bytes
from hypothesis.internal.conjecture.shrinking.common import Shrinker
from hypothesis.internal.conjecture.shrinking.integer import Integer
from hypothesis.internal.conjecture.shrinking.ordering import Ordering
"""
This module implements a lexicographic minimizer for blocks of bytes.
"""
class Lexical(Shrinker):
    """Lexicographic minimizer for fixed-length blocks of bytes.

    Combines integer minimization of the whole block with partial
    sorting of its bytes.
    """

    def make_immutable(self, value):
        return bytes(value)

    @property
    def size(self):
        # Length in bytes; fixed for the lifetime of this shrink.
        return len(self.current)

    def check_invariants(self, value):
        assert len(value) == self.size

    def left_is_better(self, left, right):
        return left < right

    def incorporate_int(self, i):
        # Re-encode the integer at our fixed byte width and try it.
        return self.incorporate(int_to_bytes(i, self.size))

    @property
    def current_int(self):
        return int_from_bytes(self.current)

    def minimize_as_integer(self):
        def accept_int(candidate):
            return candidate == self.current_int or self.incorporate_int(candidate)

        Integer.shrink(self.current_int, accept_int, random=self.random)

    def partial_sort(self):
        Ordering.shrink(self.current, self.consider, random=self.random)

    def short_circuit(self):
        """This is just an assemblage of other shrinkers, so we rely on their
        short circuiting."""
        return False

    def run_step(self):
        self.minimize_as_integer()
        self.partial_sort()

View File

@@ -0,0 +1,99 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
from hypothesis.internal.conjecture.junkdrawer import find_integer
from hypothesis.internal.conjecture.shrinking.common import Shrinker
def identity(v):
    """Default key function: returns its argument unchanged."""
    return v
class Ordering(Shrinker):
    """A shrinker that tries to make a sequence more sorted.

    Will not change the length or the contents, only tries to reorder
    the elements of the sequence.
    """

    def setup(self, key=identity):
        # Elements are compared via ``key``, like the builtin ``sorted``.
        self.key = key

    def make_immutable(self, value):
        return tuple(value)

    def short_circuit(self):
        # If we can flat out sort the target then there's nothing more to do.
        return self.consider(sorted(self.current, key=self.key))

    def left_is_better(self, left, right):
        return tuple(map(self.key, left)) < tuple(map(self.key, right))

    def check_invariants(self, value):
        # Reordering only: length and multiset of elements must be preserved.
        assert len(value) == len(self.current)
        assert sorted(value) == sorted(self.current)

    def run_step(self):
        self.sort_regions()
        self.sort_regions_with_gaps()

    def sort_regions(self):
        """Guarantees that for each i we have tried to swap index i with
        index i + 1.

        This uses an adaptive algorithm that works by sorting contiguous
        regions starting from each element.
        """
        i = 0
        while i + 1 < len(self.current):
            prefix = list(self.current[:i])
            # Adaptively grow the sorted region [i, i + k); k is at least 1
            # because sorting an empty/singleton slice leaves the sequence
            # unchanged and so is always accepted.
            k = find_integer(
                lambda k: i + k <= len(self.current)
                and self.consider(
                    prefix
                    + sorted(self.current[i : i + k], key=self.key)
                    + list(self.current[i + k :])
                )
            )
            i += k

    def sort_regions_with_gaps(self):
        """Guarantees that for each i we have tried to swap index i with
        index i + 2.

        This uses an adaptive algorithm that works by sorting contiguous
        regions centered on each element, where that element is treated as
        fixed and the elements around it are sorted.
        """
        for i in range(1, len(self.current) - 1):
            if self.current[i - 1] <= self.current[i] <= self.current[i + 1]:
                # The `continue` line is optimised out of the bytecode on
                # CPython >= 3.7 (https://bugs.python.org/issue2506) and on
                # PyPy, and so coverage cannot tell that it has been taken.
                continue  # pragma: no cover

            def can_sort(a, b):
                # Sort self.current[a:b] while keeping the element at index
                # ``i`` fixed in place (it stays at position ``i``).
                if a < 0 or b > len(self.current):
                    return False
                assert a <= i < b
                split = i - a
                values = sorted(self.current[a:i] + self.current[i + 1 : b])
                return self.consider(
                    list(self.current[:a])
                    + values[:split]
                    + [self.current[i]]
                    + values[split:]
                    + list(self.current[b:])
                )

            left = i
            right = i + 1
            # Adaptively extend the sorted window to the right, then left.
            right += find_integer(lambda k: can_sort(left, right + k))
            find_integer(lambda k: can_sort(left - k, right))

View File

@@ -0,0 +1,338 @@
# This file is part of Hypothesis, which may be found at
# https://github.com/HypothesisWorks/hypothesis/
#
# Copyright the Hypothesis Authors.
# Individual contributors are listed in AUTHORS.rst and the git log.
#
# This Source Code Form is subject to the terms of the Mozilla Public License,
# v. 2.0. If a copy of the MPL was not distributed with this file, You can
# obtain one at https://mozilla.org/MPL/2.0/.
import enum
import hashlib
import heapq
import sys
from collections import OrderedDict, abc
from functools import lru_cache
from typing import TYPE_CHECKING, List, Optional, Sequence, Tuple, Type, TypeVar, Union
from hypothesis.errors import InvalidArgument
from hypothesis.internal.compat import int_from_bytes
from hypothesis.internal.floats import next_up
if TYPE_CHECKING:
from hypothesis.internal.conjecture.data import ConjectureData
# Labels are confined to 64 bits.
LABEL_MASK = 2**64 - 1


def calc_label_from_name(name: str) -> int:
    """Derive a stable 64-bit label from a human-readable name."""
    digest = hashlib.sha384(name.encode()).digest()
    return int_from_bytes(digest[:8])
def calc_label_from_cls(cls: type) -> int:
    """Derive a label for a class from its qualified name."""
    qualname = cls.__qualname__
    return calc_label_from_name(qualname)
def combine_labels(*labels: int) -> int:
    """Fold several labels into a single 64-bit label, order-sensitively."""
    combined = 0
    for piece in labels:
        # Shift (within the mask) then mix in the next label.
        combined = ((combined << 1) & LABEL_MASK) ^ piece
    return combined
# Labels for the example regions opened by Sampler.sample and many.more.
SAMPLE_IN_SAMPLER_LABEL = calc_label_from_name("a sample() in Sampler")
ONE_FROM_MANY_LABEL = calc_label_from_name("one more from many()")

# Element type for the generic sequence-accepting helpers below.
T = TypeVar("T")
def check_sample(
    values: Union[Type[enum.Enum], Sequence[T]], strategy_name: str
) -> Sequence[T]:
    """Validate that *values* can be sampled with stable iteration order.

    Raises InvalidArgument for multi-dimensional numpy arrays and for
    unordered collections. Returns ``values`` unchanged if it is a
    ``range``, and otherwise materialized as a tuple.
    """
    if "numpy" in sys.modules and isinstance(values, sys.modules["numpy"].ndarray):
        if values.ndim != 1:
            raise InvalidArgument(
                "Only one-dimensional arrays are supported for sampling, "
                f"and the given value has {values.ndim} dimensions (shape "
                f"{values.shape}). This array would give samples of array slices "
                "instead of elements! Use np.ravel(values) to convert "
                "to a one-dimensional array, or tuple(values) if you "
                "want to sample slices."
            )
    elif not isinstance(values, (OrderedDict, abc.Sequence, enum.EnumMeta)):
        raise InvalidArgument(
            f"Cannot sample from {values!r}, not an ordered collection. "
            f"Hypothesis goes to some length to ensure that the {strategy_name} "
            "strategy has stable results between runs. To replay a saved "
            "example, the sampled values must have the same iteration order "
            "on every run - ruling out sets, dicts, etc due to hash "
            "randomization. Most cases can simply use `sorted(values)`, but "
            "mixed types or special values such as math.nan require careful "
            "handling - and note that when simplifying an example, "
            "Hypothesis treats earlier values as simpler."
        )
    # Ranges are already immutable and cheap to index; everything else is
    # materialized for a stable iteration order.
    return values if isinstance(values, range) else tuple(values)
def choice(
    data: "ConjectureData", values: Sequence[T], *, forced: Optional[T] = None
) -> T:
    """Draw one element of ``values`` via ``data``, optionally forcing the
    result to equal ``forced``."""
    if forced is None:
        forced_i = None
    else:
        # Force the index of the requested element.
        forced_i = values.index(forced)
    index = data.draw_integer(0, len(values) - 1, forced=forced_i)
    return values[index]
class Sampler:
    """Sampler based on Vose's algorithm for the alias method. See
    http://www.keithschwarz.com/darts-dice-coins/ for a good explanation.

    The general idea is that we store a table of triples (base, alternate, p).
    We then pick a triple uniformly at random, and choose its alternate
    value with probability p and else choose its base value. The triples are
    chosen so that the resulting mixture has the right distribution.

    We maintain the following invariants to try to produce good shrinks:

    1. The table is in lexicographic (base, alternate) order, so that choosing
       an earlier value in the list always lowers (or at least leaves
       unchanged) the value.
    2. base[i] < alternate[i], so that shrinking the draw always results in
       shrinking the chosen element.
    """

    table: List[Tuple[int, int, float]]  # (base_idx, alt_idx, alt_chance)

    def __init__(self, weights: Sequence[float]):
        # Standard Vose alias-table construction: every entry ends up with at
        # most two outcomes (its own index and one alternate), with the
        # alternate chosen with probability alternate_chance.
        n = len(weights)
        table: "list[list[int | float | None]]" = [[i, None, None] for i in range(n)]
        total = sum(weights)
        # Use the weights' own numeric type for 0/1 so that e.g. Fraction
        # weights keep exact arithmetic throughout.
        num_type = type(total)
        zero = num_type(0)  # type: ignore
        one = num_type(1)  # type: ignore

        # Indices whose scaled probability is below / above 1, kept as heaps
        # so pairing is deterministic.
        small: "List[int]" = []
        large: "List[int]" = []

        probabilities = [w / total for w in weights]
        scaled_probabilities: "List[float]" = []

        for i, alternate_chance in enumerate(probabilities):
            scaled = alternate_chance * n
            scaled_probabilities.append(scaled)
            if scaled == 1:
                # Exactly average weight: never uses an alternate.
                table[i][2] = zero
            elif scaled < 1:
                small.append(i)
            else:
                large.append(i)
        heapq.heapify(small)
        heapq.heapify(large)

        while small and large:
            # Pair an under-full entry with an over-full one: ``lo`` donates
            # its slack to ``hi`` by taking ``hi`` as its alternate.
            lo = heapq.heappop(small)
            hi = heapq.heappop(large)

            assert lo != hi
            assert scaled_probabilities[hi] > one
            assert table[lo][1] is None
            table[lo][1] = hi
            table[lo][2] = one - scaled_probabilities[lo]
            scaled_probabilities[hi] = (
                scaled_probabilities[hi] + scaled_probabilities[lo]
            ) - one

            # Re-classify ``hi`` with its reduced remaining probability.
            if scaled_probabilities[hi] < 1:
                heapq.heappush(small, hi)
            elif scaled_probabilities[hi] == 1:
                table[hi][2] = zero
            else:
                heapq.heappush(large, hi)
        # Any leftovers are (up to rounding) exactly full: no alternate.
        while large:
            table[large.pop()][2] = zero
        while small:
            table[small.pop()][2] = zero

        self.table: "List[Tuple[int, int, float]]" = []
        for base, alternate, alternate_chance in table:  # type: ignore
            assert isinstance(base, int)
            assert isinstance(alternate, int) or alternate is None
            # Normalise triples so that base < alternate (invariant 2),
            # flipping the chance when we swap the pair.
            if alternate is None:
                self.table.append((base, base, alternate_chance))
            elif alternate < base:
                self.table.append((alternate, base, one - alternate_chance))
            else:
                self.table.append((base, alternate, alternate_chance))
        # Invariant 1: lexicographic order so earlier draws are smaller.
        self.table.sort()

    def sample(self, data: "ConjectureData", forced: Optional[int] = None) -> int:
        """Draw an index distributed according to the weights, recording the
        draw as a single example region in ``data``. If ``forced`` is given,
        the draws are forced so that the result equals it."""
        data.start_example(SAMPLE_IN_SAMPLER_LABEL)
        # When forcing, pick some table row that can produce the forced
        # index (either as its base or its alternate).
        forced_choice = (  # pragma: no branch # https://github.com/nedbat/coveragepy/issues/1617
            None
            if forced is None
            else next((b, a, a_c) for (b, a, a_c) in self.table if forced in (b, a))
        )
        base, alternate, alternate_chance = choice(
            data, self.table, forced=forced_choice
        )
        use_alternate = data.draw_boolean(
            alternate_chance, forced=None if forced is None else forced == alternate
        )
        data.stop_example()
        if use_alternate:
            assert forced is None or alternate == forced, (forced, alternate)
            return alternate
        else:
            assert forced is None or base == forced, (forced, base)
            return base
# Bit-widths at which we draw integers, and the (unnormalised) weights with
# which each width is chosen - smaller widths are strongly preferred.
INT_SIZES = (8, 16, 32, 64, 128)
INT_SIZES_SAMPLER = Sampler((4.0, 8.0, 1.0, 1.0, 0.5))
class many:
    """Utility class for collections. Bundles up the logic we use for "should I
    keep drawing more values?" and handles starting and stopping examples in
    the right place.

    Intended usage is something like:

        elements = many(data, ...)
        while elements.more():
            add_stuff_to_result()
    """

    def __init__(
        self,
        data: "ConjectureData",
        min_size: int,
        max_size: Union[int, float],
        average_size: Union[int, float],
        *,
        forced: Optional[int] = None,
    ) -> None:
        assert 0 <= min_size <= average_size <= max_size
        assert forced is None or min_size <= forced <= max_size
        self.min_size = min_size
        self.max_size = max_size
        self.data = data
        # If not None, the exact number of elements we must end up drawing.
        self.forced_size = forced
        # Continuation probability chosen so the expected size (beyond the
        # mandatory min_size elements) matches average_size.
        self.p_continue = _calc_p_continue(average_size - min_size, max_size - min_size)
        self.count = 0
        self.rejections = 0
        self.drawn = False
        self.force_stop = False
        self.rejected = False

    def more(self) -> bool:
        """Should I draw another element to add to the collection?"""
        if self.drawn:
            # Close the example region for the previous element, discarding
            # it if that element was rejected.
            self.data.stop_example(discard=self.rejected)

        self.drawn = True
        self.rejected = False

        self.data.start_example(ONE_FROM_MANY_LABEL)

        if self.min_size == self.max_size:
            # if we have to hit an exact size, draw unconditionally until that
            # point, and no further.
            should_continue = self.count < self.min_size
        else:
            forced_result = None
            if self.force_stop:
                # if our size is forced, we can't reject in a way that would
                # cause us to differ from the forced size.
                assert self.forced_size is None or self.count == self.forced_size
                forced_result = False
            elif self.count < self.min_size:
                # Below the minimum we must keep drawing.
                forced_result = True
            elif self.count >= self.max_size:
                # At the maximum we must stop.
                forced_result = False
            elif self.forced_size is not None:
                forced_result = self.count < self.forced_size
            should_continue = self.data.draw_boolean(
                self.p_continue, forced=forced_result
            )

        if should_continue:
            self.count += 1
            return True
        else:
            # No element was drawn for this region, so close it immediately.
            self.data.stop_example()
            return False

    def reject(self, why: Optional[str] = None) -> None:
        """Reject the last example (i.e. don't count it towards our budget of
        elements because it's not going to go in the final collection)."""
        assert self.count > 0
        self.count -= 1
        self.rejections += 1
        self.rejected = True
        # We set a minimum number of rejections before we give up to avoid
        # failing too fast when we reject the first draw.
        if self.rejections > max(3, 2 * self.count):
            if self.count < self.min_size:
                self.data.mark_invalid(why)
            else:
                self.force_stop = True
# Smallest positive float we can rely on; the ``or`` guards against platforms
# where subnormals flush to zero.
SMALLEST_POSITIVE_FLOAT: float = next_up(0.0) or sys.float_info.min


@lru_cache
def _calc_p_continue(desired_avg: float, max_size: int) -> float:
    """Return the p_continue which will generate the desired average size."""
    assert desired_avg <= max_size, (desired_avg, max_size)
    if desired_avg == max_size:
        return 1.0
    # Closed-form solution for the unbounded geometric distribution; a good
    # starting point that we then correct for the max_size truncation.
    p_continue = 1 - 1.0 / (1 + desired_avg)
    if p_continue == 0 or max_size == float("inf"):
        assert 0 <= p_continue < 1, p_continue
        return p_continue
    assert 0 < p_continue < 1, p_continue
    # For small max_size, the infinite-series p_continue is a poor approximation,
    # and while we can't solve the polynomial a few rounds of iteration quickly
    # gets us a good approximate solution in almost all cases (sometimes exact!).
    while _p_continue_to_avg(p_continue, max_size) > desired_avg:
        # This is impossible over the reals, but *can* happen with floats.
        p_continue -= 0.0001
        # If we've reached zero or gone negative, we want to break out of this loop,
        # and do so even if we're on a system with the unsafe denormals-are-zero flag.
        # We make that an explicit error in st.floats(), but here we'd prefer to
        # just get somewhat worse precision on collection lengths.
        if p_continue < SMALLEST_POSITIVE_FLOAT:
            p_continue = SMALLEST_POSITIVE_FLOAT
            break
    # Let's binary-search our way to a better estimate! We tried fancier options
    # like gradient descent, but this is numerically stable and works better.
    hi = 1.0
    while desired_avg - _p_continue_to_avg(p_continue, max_size) > 0.01:
        assert 0 < p_continue < hi, (p_continue, hi)
        mid = (p_continue + hi) / 2
        if _p_continue_to_avg(mid, max_size) <= desired_avg:
            p_continue = mid
        else:
            hi = mid
    assert 0 < p_continue < 1, p_continue
    assert _p_continue_to_avg(p_continue, max_size) <= desired_avg
    return p_continue
def _p_continue_to_avg(p_continue: float, max_size: int) -> float:
"""Return the average_size generated by this p_continue and max_size."""
if p_continue >= 1:
return max_size
return (1.0 / (1 - p_continue) - 1) * (1 - p_continue**max_size)