Coverage for src/sequin/__init__.py: 100.000%
97 statements
« prev ^ index » next coverage.py v7.6.0, created at 2024-07-14 11:39 +0200
« prev ^ index » next coverage.py v7.6.0, created at 2024-07-14 11:39 +0200
1# SPDX-FileCopyrightText: 2024 Marco Ricci <m@the13thletter.info>
2#
3# SPDX-License-Identifier: MIT
5"""A Python reimplementation of James Coglan's "sequin" Node.js module.
7James Coglan's "sequin" Node.js module provides a pseudorandom number
8generator (using rejection sampling on a stream of input numbers) that
9attempts to minimize the amount of information it throws away:
10(non-degenerate) rejected samples are fed into a stream of higher-order
11numbers from which the next random number generation request will be
12served. The sequin module is used in Coglan's "vault" module (a
13deterministic, stateless password manager that recomputes passwords
14instead of storing them), and this reimplementation is used for
15a similar purpose.
17The main API is the [`Sequin`] [sequin.Sequin] class, which is
18thoroughly documented.
20"""
22from __future__ import annotations
24import collections
26from collections.abc import Iterator, Sequence
27from typing_extensions import assert_type
# Names exported by a star-import of this module.
__all__ = ('Sequin', 'SequinExhaustedError')
# Package metadata.
__author__ = 'Marco Ricci <m@the13thletter.info>'
__version__ = '0.1.0'
33class Sequin:
34 """Generate pseudorandom non-negative numbers in different ranges.
36 Given a (presumed high-quality) uniformly random sequence of input
37 bits, generate pseudorandom non-negative integers in a certain range
38 on each call of the `generate` method. (It is permissible to
39 specify a different range per call to `generate`; this is the main
40 use case.) We use a modified version of rejection sampling, where
41 rejected values are stored in "rejection queues" if possible, and
42 these rejection queues re-seed the next round of rejection sampling.
44 This is a Python reimplementation of James Coglan's [Node.js sequin
45 module][JS_SEQUIN], as introduced in [his blog post][BLOG_POST]. It
46 uses a [technique by Christian Lawson-Perfect][SEQUIN_TECHNIQUE].
47 I do not know why the original module is called "sequin"; I presume
48 it to be a pun on "sequence".
50 [JS_SEQUIN]: https://www.npmjs.com/package/sequin
51 [BLOG_POST]: https://blog.jcoglan.com/2012/07/16/designing-vaults-generator-algorithm/
52 [SEQUIN_TECHNIQUE]: https://checkmyworking.com/2012/06/converting-a-stream-of-binary-digits-to-a-stream-of-base-n-digits/
54 """
55 def __init__(
56 self,
57 sequence: str | bytes | bytearray | Sequence[int],
58 /, *, is_bitstring: bool = False
59 ):
60 """Initialize the Sequin.
62 Args:
63 sequence:
64 A sequence of bits, or things convertible to bits, to
65 seed the pseudorandom number generator. Byte and text
66 strings are converted to 8-bit integer sequences.
67 (Conversion will fail if the text string contains
68 non-ISO-8859-1 characters.) The numbers are then
69 converted to bits.
70 is_bitstring:
71 If true, treat the input as a bitstring. By default,
72 the input is treated as a string of 8-bit integers, from
73 which the individual bits must still be extracted.
75 Raises:
76 ValueError:
77 The sequence contains values outside the permissible
78 range.
80 """
81 def uint8_to_bits(value):
82 """Yield individual bits of an 8-bit number, MSB first."""
83 for i in (0x80, 0x40, 0x20, 0x10, 0x08, 0x04, 0x02, 0x01):
84 yield 1 if value | i == value else 0
85 if isinstance(sequence, str):
86 try:
87 sequence = tuple(sequence.encode('iso-8859-1'))
88 except UnicodeError as e:
89 raise ValueError('sequence item out of range') from e
90 else:
91 sequence = tuple(sequence)
92 assert_type(sequence, tuple[int, ...])
93 self.bases: dict[int, collections.deque[int]] = {}
94 def gen() -> Iterator[int]:
95 for num in sequence:
96 if num not in range(2 if is_bitstring else 256):
97 raise ValueError('sequence item out of range')
98 if is_bitstring:
99 yield num
100 else:
101 yield from uint8_to_bits(num)
102 self.bases[2] = collections.deque(gen())
104 def _all_or_nothing_shift(
105 self, count: int, /, *, base: int = 2
106 ) -> Sequence[int]:
107 """Shift and return items if and only if there are enough.
109 Args:
110 count: Number of items to shift/consume.
111 base: Use the base `base` sequence.
113 Returns:
114 If there are sufficient items in the sequence left, then
115 consume them from the sequence and return them. Otherwise,
116 consume nothing, and return nothing.
118 Notes:
119 We currently remove now-empty sequences from the registry of
120 sequences.
122 Examples:
123 >>> seq = Sequin([1, 0, 1, 0, 0, 1, 0, 0, 0, 1],
124 ... is_bitstring=True)
125 >>> seq.bases
126 {2: deque([1, 0, 1, 0, 0, 1, 0, 0, 0, 1])}
127 >>> seq._all_or_nothing_shift(3)
128 (1, 0, 1)
129 >>> seq._all_or_nothing_shift(3)
130 (0, 0, 1)
131 >>> seq.bases[2]
132 deque([0, 0, 0, 1])
133 >>> seq._all_or_nothing_shift(5)
134 ()
135 >>> seq.bases[2]
136 deque([0, 0, 0, 1])
137 >>> seq._all_or_nothing_shift(4)
138 (0, 0, 0, 1)
139 >>> 2 in seq.bases # now-empty sequences are removed
140 False
142 """
143 try:
144 seq = self.bases[base]
145 except KeyError:
146 return ()
147 stash: collections.deque[int] = collections.deque()
148 try:
149 for i in range(count):
150 stash.append(seq.popleft())
151 except IndexError:
152 seq.extendleft(reversed(stash))
153 return ()
154 # Clean up queues.
155 if not seq:
156 del self.bases[base]
157 return tuple(stash)
159 @staticmethod
160 def _big_endian_number(
161 digits: Sequence[int], /, *, base: int = 2
162 ) -> int:
163 """Evaluate the given integer sequence as a big endian number.
165 Args:
166 digits: A sequence of integers to evaluate.
167 base: The number base to evaluate those integers in.
169 Raises:
170 ValueError: `base` is an invalid base.
171 ValueError: Not all integers are valid base `base` digits.
173 Examples:
174 >>> Sequin._big_endian_number([1, 2, 3, 4, 5, 6, 7, 8], base=10)
175 12345678
176 >>> Sequin._big_endian_number([1, 2, 3, 4, 5, 6, 7, 8], base=100)
177 102030405060708
178 >>> Sequin._big_endian_number([0, 0, 0, 0, 1, 4, 9, 7], base=10)
179 1497
180 >>> Sequin._big_endian_number([1, 0, 0, 1, 0, 0, 0, 0], base=2)
181 144
182 >>> Sequin._big_endian_number([1, 7, 5, 5], base=8) == 0o1755
183 True
185 """
186 if base < 2:
187 raise ValueError(f'invalid base: {base!r}')
188 ret = 0
189 allowed_range = range(base)
190 n = len(digits)
191 for i in range(n):
192 i2 = (n - 1) - i
193 x = digits[i]
194 if not isinstance(x, int):
195 raise TypeError(f'not an integer: {x!r}')
196 if x not in allowed_range:
197 raise ValueError(f'invalid base {base!r} digit: {x!r}')
198 ret += (base ** i2) * x
199 return ret
201 def generate(self, n: int, /) -> int:
202 """Generate a base `n` non-negative integer.
204 We attempt to generate a value using rejection sampling. If the
205 generated sample is outside the desired range (i.e., is
206 rejected), then attempt to reuse the sample by seeding
207 a "higher-order" input sequence of uniformly random numbers (for
208 a different base).
210 Args:
211 n:
212 Generate numbers in the range 0, ..., `n` - 1.
213 (Inclusive.) Must be larger than 0.
215 Returns:
216 A pseudorandom number in the range 0, ..., `n` - 1.
218 Raises:
219 ValueError:
220 The range is empty.
221 SequinExhaustedError:
222 The sequin is exhausted.
224 Examples:
225 >>> seq = Sequin([1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1],
226 ... is_bitstring=True)
227 >>> seq2 = Sequin([1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1],
228 ... is_bitstring=True)
229 >>> seq.generate(5)
230 3
231 >>> seq.generate(5)
232 3
233 >>> seq.generate(5)
234 1
235 >>> seq.generate(5) # doctest: +IGNORE_EXCEPTION_DETAIL
236 Traceback (most recent call last):
237 ...
238 SequinExhaustedError: Sequin is exhausted
240 Using `n = 1` does not actually consume input bits:
242 >>> seq2.generate(1)
243 0
245 But it still won't work on exhausted sequins:
247 >>> seq.generate(1) # doctest: +IGNORE_EXCEPTION_DETAIL
248 Traceback (most recent call last):
249 ...
250 SequinExhaustedError: Sequin is exhausted
252 """
253 if 2 not in self.bases:
254 raise SequinExhaustedError('Sequin is exhausted')
255 value = self._generate_inner(n, base=2)
256 if value == n:
257 raise SequinExhaustedError('Sequin is exhausted')
258 return value
260 def _generate_inner(
261 self, n: int, /, *, base: int = 2
262 ) -> int:
263 """Recursive call to generate a base `n` non-negative integer.
265 We first determine the correct exponent `k` to generate base `n`
266 numbers from a stream of base `base` numbers, then attempt to
267 take `k` numbers from the base `base` sequence (or bail if not
268 possible). If the resulting number `v` is out of range for
269 base `n`, then push `v - n` onto the rejection queue for
270 base `r` = `base` ** `k` - `n`, and attempt to generate the
271 requested base `n` integer from the sequence of base `r` numbers
272 next. (This recursion is not attempted if `r` = 1.) Otherwise,
273 return the number.
275 Args:
276 n:
277 Generate numbers in the range 0, ..., `n` - 1.
278 (Inclusive.) Must be larger than 0.
279 base:
280 Use the base `base` sequence as a source for
281 pseudorandom numbers.
283 Returns:
284 A pseudorandom number in the range 0, ..., `n` - 1 if
285 possible, or `n` if the stream is exhausted.
287 Raises:
288 ValueError:
289 The range is empty.
291 Examples:
292 >>> seq = Sequin([1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1],
293 ... is_bitstring=True)
294 >>> seq2 = Sequin([1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1],
295 ... is_bitstring=True)
296 >>> seq._generate_inner(5)
297 3
298 >>> seq._generate_inner(5)
299 3
300 >>> seq._generate_inner(5)
301 1
302 >>> seq._generate_inner(5) # error condition: sequin exhausted
303 5
305 Using `n = 1` does not actually consume input bits, and
306 always works, regardless of sequin exhaustion:
308 >>> seq2._generate_inner(1)
309 0
310 >>> seq._generate_inner(1)
311 0
313 Using an unsuitable range will raise:
315 >>> seq2._generate_inner(0) # doctest: +IGNORE_EXCEPTION_DETAIL
316 Traceback (most recent call last):
317 ...
318 ValueError: invalid target range
320 """
321 if n < 1:
322 raise ValueError('invalid target range')
323 if base < 2:
324 raise ValueError(f'invalid base: {base!r}')
325 # p = base ** k, where k is the smallest integer such that
326 # p >= n. We determine p and k inductively.
327 p = 1
328 k = 0
329 while p < n:
330 p *= base
331 k += 1
332 # The remainder r of p and n is used as the base for rejection
333 # queue.
334 r = p - n
335 # The generated number v is initialized to n because of the
336 # while loop below.
337 v = n
338 while v > n - 1:
339 list_slice = self._all_or_nothing_shift(k, base=base)
340 if not list_slice:
341 if n != 1:
342 return n
343 else:
344 v = 0
345 v = self._big_endian_number(list_slice, base=base)
346 if v > n - 1:
347 # If r is 0, then p == n, so v < n, or rather
348 # v <= n - 1.
349 assert r > 0
350 if r == 1:
351 continue
352 self._stash(v - n, base=r)
353 v = self._generate_inner(n, base=r)
354 return v
356 def _stash(self, value: int, /, *, base: int = 2) -> None:
357 """Stash `value` on the base `base` sequence."""
358 if base not in self.bases:
359 self.bases[base] = collections.deque()
360 self.bases[base].append(value)
class SequinExhaustedError(Exception):
    """Signal that a sequin has run out of input.

    Raised when a number is requested from a sequin that can no longer
    generate any values.

    """