Coverage for src/derivepassphrase/__init__.py: 100.000%

134 statements  

« prev     ^ index     » next       coverage.py v7.6.0, created at 2024-07-14 11:39 +0200

1# SPDX-FileCopyrightText: 2024 Marco Ricci <m@the13thletter.info> 

2# 

3# SPDX-License-Identifier: MIT 

4 

5"""Work-alike of vault(1) – a deterministic, stateless password manager 

6 

7""" 

8 

9from __future__ import annotations 

10 

11import base64 

12import collections 

13import hashlib 

14import math 

15import unicodedata 

16 

17from typing_extensions import assert_type 

18 

19import sequin 

20import ssh_agent_client 

21 

22__author__ = "Marco Ricci <m@the13thletter.info>" 

23__version__ = "0.1.0" 

24 

class AmbiguousByteRepresentationError(ValueError):
    """Signal that an object has no single, unique byte representation.

    Raised for text strings whose NFC- and NFD-normalized forms
    differ, i.e. where the choice of normalization would change the
    resulting bytes.

    """

    pass

27 

28class Vault: 

29 """A work-alike of James Coglan's vault. 

30 

31 Store settings for generating (actually: deriving) passphrases for 

32 named services, with various constraints, given only a master 

33 passphrase. Also, actually generate the passphrase. The derivation 

34 is deterministic and non-secret; only the master passphrase need be 

35 kept secret. The implementation is compatible with [vault][]. 

36 

37 [James Coglan explains the passphrase derivation algorithm in great 

38 detail][ALGORITHM] in his blog post on said topic: A principally 

39 infinite bit stream is obtained by running a key-derivation function 

40 on the master passphrase and the service name, then this bit stream 

41 is fed into a [Sequin][sequin.Sequin] to generate random numbers in 

42 the correct range, and finally these random numbers select 

43 passphrase characters until the desired length is reached. 

44 

45 [vault]: https://getvau.lt 

46 [ALGORITHM]: https://blog.jcoglan.com/2012/07/16/designing-vaults-generator-algorithm/ 

47 

48 """ 

49 _UUID = b'e87eb0f4-34cb-46b9-93ad-766c5ab063e7' 

50 """A tag used by vault in the bit stream generation.""" 

51 _CHARSETS: collections.OrderedDict[str, bytes] 

52 """ 

53 Known character sets from which to draw passphrase characters. 

54 Relies on a certain, fixed order for their definition and their 

55 contents. 

56 

57 """ 

58 _CHARSETS = collections.OrderedDict([ 

59 ('lower', b'abcdefghijklmnopqrstuvwxyz'), 

60 ('upper', b'ABCDEFGHIJKLMNOPQRSTUVWXYZ'), 

61 ('alpha', b''), # Placeholder. 

62 ('number', b'0123456789'), 

63 ('alphanum', b''), # Placeholder. 

64 ('space', b' '), 

65 ('dash', b'-_'), 

66 ('symbol', b'!"#$%&\'()*+,./:;<=>?@[\\]^{|}~-_'), 

67 ('all', b''), # Placeholder. 

68 ]) 

69 _CHARSETS['alpha'] = _CHARSETS['lower'] + _CHARSETS['upper'] 

70 _CHARSETS['alphanum'] = _CHARSETS['alpha'] + _CHARSETS['number'] 

71 _CHARSETS['all'] = (_CHARSETS['alphanum'] + _CHARSETS['space'] 

72 + _CHARSETS['symbol']) 

73 

74 def __init__( 

75 self, *, phrase: bytes | bytearray | str = b'', 

76 length: int = 20, repeat: int = 0, lower: int | None = None, 

77 upper: int | None = None, number: int | None = None, 

78 space: int | None = None, dash: int | None = None, 

79 symbol: int | None = None, 

80 ) -> None: 

81 """Initialize the Vault object. 

82 

83 Args: 

84 phrase: 

85 The master passphrase from which to derive the service 

86 passphrases. If a text string, then the byte 

87 representation must be unique. 

88 length: 

89 Desired passphrase length. 

90 repeat: 

91 The maximum number of immediate character repetitions 

92 allowed in the passphrase. Disabled if set to 0. 

93 lower: 

94 Optional constraint on ASCII lowercase characters. If 

95 positive, include this many lowercase characters 

96 somewhere in the passphrase. If 0, avoid lowercase 

97 characters altogether. 

98 upper: 

99 Same as `lower`, but for ASCII uppercase characters. 

100 number: 

101 Same as `lower`, but for ASCII digits. 

102 space: 

103 Same as `lower`, but for the space character. 

104 dash: 

105 Same as `lower`, but for the hyphen-minus and underscore 

106 characters. 

107 symbol: 

108 Same as `lower`, but for all other hitherto unlisted 

109 ASCII printable characters (except backquote). 

110 

111 Raises: 

112 AmbiguousByteRepresentationError: 

113 The phrase is a text string with differing NFC- and 

114 NFD-normalized UTF-8 byte representations. 

115 

116 """ 

117 self._phrase = self._get_binary_string(phrase) 

118 self._length = length 

119 self._repeat = repeat 

120 self._allowed = bytearray(self._CHARSETS['all']) 

121 self._required: list[bytes] = [] 

122 def subtract_or_require( 

123 count: int | None, characters: bytes | bytearray 

124 ) -> None: 

125 if not isinstance(count, int): 

126 return 

127 elif count <= 0: 

128 self._allowed = self._subtract(characters, self._allowed) 

129 else: 

130 for _ in range(count): 

131 self._required.append(characters) 

132 subtract_or_require(lower, self._CHARSETS['lower']) 

133 subtract_or_require(upper, self._CHARSETS['upper']) 

134 subtract_or_require(number, self._CHARSETS['number']) 

135 subtract_or_require(space, self._CHARSETS['space']) 

136 subtract_or_require(dash, self._CHARSETS['dash']) 

137 subtract_or_require(symbol, self._CHARSETS['symbol']) 

138 if len(self._required) > self._length: 

139 raise ValueError('requested passphrase length too short') 

140 if not self._allowed: 

141 raise ValueError('no allowed characters left') 

142 for _ in range(len(self._required), self._length): 

143 self._required.append(bytes(self._allowed)) 

144 

145 def _entropy(self) -> float: 

146 """Estimate the passphrase entropy, given the current settings. 

147 

148 The entropy is the base 2 logarithm of the amount of 

149 possibilities. We operate directly on the logarithms, and use 

150 sorting and [`math.fsum`][] to keep high accuracy. 

151 

152 Note: 

153 We actually overestimate the entropy here because of poor 

154 handling of character repetitions. In the extreme, assuming 

155 that only one character were allowed, then because there is 

156 only one possible string of each given length, the entropy 

157 of that string `s` is always be zero. However, we calculate 

158 the entropy as `math.log2(math.factorial(len(s)))`, i.e. we 

159 assume the characters at the respective string position are 

160 distinguishable from each other. 

161 

162 Returns: 

163 A valid (and somewhat close) upper bound to the entropy. 

164 

165 """ 

166 factors: list[int] = [] 

167 if not self._required or any(not x for x in self._required): 

168 return float('-inf') 

169 for i, charset in enumerate(self._required): 

170 factors.append(i + 1) 

171 factors.append(len(charset)) 

172 factors.sort() 

173 return math.fsum(math.log2(f) for f in factors) 

174 

175 def _estimate_sufficient_hash_length( 

176 self, safety_factor: float = 2.0, 

177 ) -> int: 

178 """Estimate the sufficient hash length, given the current settings. 

179 

180 Using the entropy (via `_entropy`) and a safety factor, give an 

181 initial estimate of the length to use for `create_hash` such 

182 that using a `Sequin` with this hash will not exhaust it during 

183 passphrase generation. 

184 

185 Args: 

186 safety_factor: The safety factor. Must be at least 1. 

187 

188 Returns: 

189 The estimated sufficient hash length. 

190 

191 Warning: 

192 This is a heuristic, not an exact computation; it may 

193 underestimate the true necessary hash length. It is 

194 intended as a starting point for searching for a sufficient 

195 hash length, usually by doubling the hash length each time 

196 it does not yet prove so. 

197 

198 """ 

199 try: 

200 safety_factor = float(safety_factor) 

201 except TypeError as e: 

202 raise TypeError(f'invalid safety factor: not a float: ' 

203 f'{safety_factor!r}') from e 

204 if not math.isfinite(safety_factor) or safety_factor < 1.0: 

205 raise ValueError(f'invalid safety factor {safety_factor!r}') 

206 # Ensure the bound is strictly positive. 

207 entropy_bound = max(1, self._entropy()) 

208 return int(math.ceil(safety_factor * entropy_bound / 8)) 

209 

210 @staticmethod 

211 def _get_binary_string(s: bytes | bytearray | str, /) -> bytes: 

212 """Convert the input string to a read-only, binary string. 

213 

214 If it is a text string, then test for an unambiguous UTF-8 

215 representation, otherwise abort. (That is, check whether the 

216 NFC and NFD forms of the string coincide.) 

217 

218 Args: 

219 s: The string to (check and) convert. 

220 

221 Returns: 

222 A read-only, binary copy of the string. 

223 

224 Raises: 

225 AmbiguousByteRepresentationError: 

226 The text string has differing NFC- and NFD-normalized 

227 UTF-8 byte representations. 

228 

229 """ 

230 if isinstance(s, str): 

231 norm = unicodedata.normalize 

232 if norm('NFC', s) != norm('NFD', s): 

233 raise AmbiguousByteRepresentationError( 

234 'text string has ambiguous byte representation') 

235 return s.encode('UTF-8') 

236 return bytes(s) 

237 

238 @classmethod 

239 def create_hash( 

240 cls, phrase: bytes | bytearray | str, 

241 service: bytes | bytearray, *, length: int = 32, 

242 ) -> bytes: 

243 r"""Create a pseudorandom byte stream from phrase and service. 

244 

245 Create a pseudorandom byte stream from `phrase` and `service` by 

246 feeding them into the key-derivation function PBKDF2 

247 (8 iterations, using SHA-1). 

248 

249 Args: 

250 phrase: 

251 A master passphrase, or sometimes an SSH signature. 

252 Used as the key for PBKDF2, the underlying cryptographic 

253 primitive. 

254 

255 If a text string, then the byte representation must be 

256 unique. 

257 service: 

258 A vault service name. Will be suffixed with 

259 `Vault._UUID`, and then used as the salt value for 

260 PBKDF2. 

261 length: 

262 The length of the byte stream to generate. 

263 

264 Returns: 

265 A pseudorandom byte string of length `length`. 

266 

267 Raises: 

268 AmbiguousByteRepresentationError: 

269 The phrase is a text string with differing NFC- and 

270 NFD-normalized UTF-8 byte representations. 

271 

272 Note: 

273 Shorter values returned from this method (with the same key 

274 and message) are prefixes of longer values returned from 

275 this method. (This property is inherited from the 

276 underlying PBKDF2 function.) It is thus safe (if slow) to 

277 call this method with the same input with ever-increasing 

278 target lengths. 

279 

280 Examples: 

281 >>> # See also Vault.phrase_from_key examples. 

282 >>> phrase = bytes.fromhex(''' 

283 ... 00 00 00 0b 73 73 68 2d 65 64 32 35 35 31 39 

284 ... 00 00 00 40 

285 ... f0 98 19 80 6c 1a 97 d5 26 03 6e cc e3 65 8f 86 

286 ... 66 07 13 19 13 09 21 33 33 f9 e4 36 53 1d af fd 

287 ... 0d 08 1f ec f8 73 9b 8c 5f 55 39 16 7c 53 54 2c 

288 ... 1e 52 bb 30 ed 7f 89 e2 2f 69 51 55 d8 9e a6 02 

289 ... ''') 

290 >>> Vault.create_hash(phrase, b'some_service', length=4) 

291 b'M\xb1<S' 

292 >>> Vault.create_hash(phrase, b'some_service', length=16) 

293 b'M\xb1<S\x827E\xd1M\xaf\xf8~\xc8n\x10\xcc' 

294 >>> Vault.create_hash(phrase, b'NOSUCHSERVICE', length=16) 

295 b'\x1c\xc3\x9c\xd9\xb6\x1a\x99CS\x07\xc41\xf4\x85#s' 

296 

297 """ 

298 phrase = cls._get_binary_string(phrase) 

299 assert not isinstance(phrase, str) 

300 salt = bytes(service) + cls._UUID 

301 return hashlib.pbkdf2_hmac(hash_name='sha1', password=phrase, 

302 salt=salt, iterations=8, dklen=length) 

303 

304 def generate( 

305 self, service_name: str | bytes | bytearray, /, *, 

306 phrase: bytes | bytearray | str = b'', 

307 ) -> bytes: 

308 r"""Generate a service passphrase. 

309 

310 Args: 

311 service_name: 

312 The service name. 

313 phrase: 

314 If given, override the passphrase given during 

315 construction. 

316 

317 If a text string, then the byte representation must be 

318 unique. 

319 

320 Returns: 

321 The service passphrase. 

322 

323 Raises: 

324 AmbiguousByteRepresentationError: 

325 The phrase is a text string with differing NFC- and 

326 NFD-normalized UTF-8 byte representations. 

327 

328 Examples: 

329 >>> phrase = b'She cells C shells bye the sea shoars' 

330 >>> # Using default options in constructor. 

331 >>> Vault(phrase=phrase).generate(b'google') 

332 b': 4TVH#5:aZl8LueOT\\{' 

333 >>> # Also possible: 

334 >>> Vault().generate(b'google', phrase=phrase) 

335 b': 4TVH#5:aZl8LueOT\\{' 

336 

337 """ 

338 hash_length = self._estimate_sufficient_hash_length() 

339 assert hash_length >= 1 

340 # Ensure the phrase is a bytes object. Needed later for safe 

341 # concatenation. 

342 if isinstance(service_name, str): 

343 service_name = service_name.encode('utf-8') 

344 elif not isinstance(service_name, bytes): 

345 service_name = bytes(service_name) 

346 assert_type(service_name, bytes) 

347 if not phrase: 

348 phrase = self._phrase 

349 phrase = self._get_binary_string(phrase) 

350 # Repeat the passphrase generation with ever-increasing hash 

351 # lengths, until the passphrase can be formed without exhausting 

352 # the sequin. See the guarantee in the create_hash method for 

353 # why this works. 

354 while True: 

355 try: 

356 required = self._required[:] 

357 seq = sequin.Sequin(self.create_hash( 

358 phrase=phrase, service=service_name, length=hash_length)) 

359 result = bytearray() 

360 while len(result) < self._length: 

361 pos = seq.generate(len(required)) 

362 charset = required.pop(pos) 

363 # Determine if an unlucky choice right now might 

364 # violate the restriction on repeated characters. 

365 # That is, check if the current partial passphrase 

366 # ends with r - 1 copies of the same character 

367 # (where r is the repeat limit that must not be 

368 # reached), and if so, remove this same character 

369 # from the current character's allowed set. 

370 if self._repeat and result: 

371 bad_suffix = bytes(result[-1:]) * (self._repeat - 1) 

372 if result.endswith(bad_suffix): 

373 charset = self._subtract(bytes(result[-1:]), 

374 charset) 

375 pos = seq.generate(len(charset)) 

376 result.extend(charset[pos:pos+1]) 

377 except sequin.SequinExhaustedError: 

378 hash_length *= 2 

379 else: 

380 return bytes(result) 

381 

382 @staticmethod 

383 def _is_suitable_ssh_key(key: bytes | bytearray, /) -> bool: 

384 """Check whether the key is suitable for passphrase derivation. 

385 

386 Currently, this only checks whether signatures with this key 

387 type are deterministic. 

388 

389 Args: 

390 key: SSH public key to check. 

391 

392 Returns: 

393 True if and only if the key is suitable for use in deriving 

394 a passphrase deterministically. 

395 

396 """ 

397 deterministic_signature_types = { 

398 'ssh-ed25519': 

399 lambda k: k.startswith(b'\x00\x00\x00\x0bssh-ed25519'), 

400 'ssh-ed448': 

401 lambda k: k.startswith(b'\x00\x00\x00\x09ssh-ed448'), 

402 'ssh-rsa': 

403 lambda k: k.startswith(b'\x00\x00\x00\x07ssh-rsa'), 

404 } 

405 return any(v(key) for v in deterministic_signature_types.values()) 

406 

407 @classmethod 

408 def phrase_from_key( 

409 cls, key: bytes | bytearray, / 

410 ) -> bytes: 

411 """Obtain the master passphrase from a configured SSH key. 

412 

413 vault allows the usage of certain SSH keys to derive a master 

414 passphrase, by signing the vault UUID with the SSH key. The key 

415 type must ensure that signatures are deterministic. 

416 

417 Args: 

418 key: The (public) SSH key to use for signing. 

419 

420 Returns: 

421 The signature of the vault UUID under this key, unframed but 

422 encoded in base64. 

423 

424 Raises: 

425 ValueError: 

426 The SSH key is principally unsuitable for this use case. 

427 Usually this means that the signature is not 

428 deterministic. 

429 

430 Examples: 

431 >>> import base64 

432 >>> # Actual Ed25519 test public key. 

433 >>> public_key = bytes.fromhex(''' 

434 ... 00 00 00 0b 73 73 68 2d 65 64 32 35 35 31 39 

435 ... 00 00 00 20 

436 ... 81 78 81 68 26 d6 02 48 5f 0f ff 32 48 6f e4 c1 

437 ... 30 89 dc 1c 6a 45 06 09 e9 09 0f fb c2 12 69 76 

438 ... ''') 

439 >>> expected_sig_raw = bytes.fromhex(''' 

440 ... 00 00 00 0b 73 73 68 2d 65 64 32 35 35 31 39 

441 ... 00 00 00 40 

442 ... f0 98 19 80 6c 1a 97 d5 26 03 6e cc e3 65 8f 86 

443 ... 66 07 13 19 13 09 21 33 33 f9 e4 36 53 1d af fd 

444 ... 0d 08 1f ec f8 73 9b 8c 5f 55 39 16 7c 53 54 2c 

445 ... 1e 52 bb 30 ed 7f 89 e2 2f 69 51 55 d8 9e a6 02 

446 ... ''') 

447 >>> # Raw Ed25519 signatures are 64 bytes long. 

448 >>> signature_blob = expected_sig_raw[-64:] 

449 >>> phrase = base64.standard_b64encode(signature_blob) 

450 >>> Vault.phrase_from_key(phrase) == expected # doctest:+SKIP 

451 True 

452 

453 """ 

454 if not cls._is_suitable_ssh_key(key): 

455 raise ValueError( 

456 'unsuitable SSH key: bad key, or signature not deterministic') 

457 with ssh_agent_client.SSHAgentClient() as client: 

458 raw_sig = client.sign(key, cls._UUID) 

459 keytype, trailer = client.unstring_prefix(raw_sig) 

460 signature_blob = client.unstring(trailer) 

461 return bytes(base64.standard_b64encode(signature_blob)) 

462 

463 @staticmethod 

464 def _subtract( 

465 charset: bytes | bytearray, allowed: bytes | bytearray, 

466 ) -> bytearray: 

467 """Remove the characters in charset from allowed. 

468 

469 This preserves the relative order of characters in `allowed`. 

470 

471 Args: 

472 charset: 

473 Characters to remove. Must not contain duplicate 

474 characters. 

475 allowed: 

476 Character set to remove the other characters from. Must 

477 not contain duplicate characters. 

478 

479 Returns: 

480 The pruned "allowed" character set. 

481 

482 Raises: 

483 ValueError: 

484 `allowed` or `charset` contained duplicate characters. 

485 

486 """ 

487 allowed = (allowed if isinstance(allowed, bytearray) 

488 else bytearray(allowed)) 

489 assert_type(allowed, bytearray) 

490 if len(frozenset(allowed)) != len(allowed): 

491 raise ValueError('duplicate characters in set') 

492 if len(frozenset(charset)) != len(charset): 

493 raise ValueError('duplicate characters in set') 

494 for c in charset: 

495 try: 

496 pos = allowed.index(c) 

497 except ValueError: 

498 pass 

499 else: 

500 allowed[pos:pos+1] = [] 

501 return allowed