Hide keyboard shortcuts

Hot-keys on this page

r m x p   toggle line displays

j k   next/prev highlighted chunk

0   (zero) top of page

1   (one) first highlighted chunk

1""" 

A module for reading dvi files output by TeX. Several limitations make
this not (currently) useful as a general-purpose dvi preprocessor, but
it is currently used by the pdf backend for processing usetex text.

Interface::

  with Dvi(filename, 72) as dvi:
      # iterate over pages:
      for page in dvi:
          w, h, d = page.width, page.height, page.descent
          for x, y, font, glyph, width in page.text:
              fontname = font.texname
              pointsize = font.size
              ...
          for x, y, height, width in page.boxes:
              ...

18""" 

19 

20from collections import namedtuple 

21import enum 

22from functools import lru_cache, partial, wraps 

23import logging 

24import os 

25import re 

26import struct 

27import textwrap 

28 

29import numpy as np 

30 

31from matplotlib import cbook, rcParams 

32 

_log = logging.getLogger(__name__)

# Files related to dvi processing are located by external programs, need
# extra parsing, and are requested many times during a single rendering,
# which is why they are cached using lru_cache().

# Dvi is a bytecode format documented in
# http://mirrors.ctan.org/systems/knuth/dist/texware/dvitype.web
# http://texdoc.net/texmf-dist/doc/generic/knuth/texware/dvitype.pdf
#
# A dvi file is made of a preamble, any number of pages, a postamble and
# a finale.  Which opcodes are legal depends on the position in the file,
# so the Dvi object keeps track of a parser state:
#
#   pre:       expecting the preamble
#   outer:     between pages (followed by a page or the postamble,
#              also e.g. font definitions are allowed)
#   inpage:    processing a page
#   post_post: state after the postamble (our current implementation
#              just stops reading)
#   finale:    the finale (unimplemented in our current implementation)

_dvistate = enum.Enum(
    'DviState', ['pre', 'outer', 'inpage', 'post_post', 'finale'])

# Marks on a page are either text or boxes; a page additionally carries
# its dimensions.
Page = namedtuple('Page', ['text', 'boxes', 'height', 'width', 'descent'])
Text = namedtuple('Text', ['x', 'y', 'font', 'glyph', 'width'])
Box = namedtuple('Box', ['x', 'y', 'height', 'width'])

61 

62 

63# Opcode argument parsing 

64# 

65# Each of the following functions takes a Dvi object and delta, 

66# which is the difference between the opcode and the minimum opcode 

67# with the same meaning. Dvi opcodes often encode the number of 

68# argument bytes in this delta. 

69 

def _arg_raw(dvi, delta):
    """Pass *delta* through unchanged; nothing is read from *dvi*."""
    return delta

73 

74 

def _arg(bytes, signed, dvi, _):
    """
    Fetch a fixed-width integer argument from *dvi*.

    *bytes* is the number of bytes to read and *signed* selects whether
    they are interpreted as a signed integer.  The final positional
    argument (the opcode delta) is ignored.
    """
    value = dvi._arg(bytes, signed)
    return value

79 

80 

def _arg_slen(dvi, delta):
    """
    Signed argument whose length is *delta* bytes.

    When *delta* is zero nothing is read and None is returned; otherwise
    *delta* bytes are read from *dvi* and interpreted as a signed integer.
    """
    if delta != 0:
        return dvi._arg(delta, True)
    return None

89 

90 

def _arg_slen1(dvi, delta):
    """
    Signed argument whose length is *delta*+1 bytes.

    Reads *delta*+1 bytes from *dvi* and interprets them as signed.
    """
    nbytes = delta + 1
    return dvi._arg(nbytes, True)

96 

97 

def _arg_ulen1(dvi, delta):
    """
    Unsigned argument whose length is *delta*+1 bytes.

    Reads *delta*+1 bytes from *dvi* and interprets them as unsigned.
    """
    nbytes = delta + 1
    return dvi._arg(nbytes, False)

103 

104 

def _arg_olen1(dvi, delta):
    """
    Optionally signed argument whose length is *delta*+1 bytes.

    Reads *delta*+1 bytes from *dvi*; the value is treated as unsigned
    for 0<=*delta*<3 and as signed when *delta*==3 (i.e. four bytes).
    """
    is_signed = (delta == 3)
    nbytes = delta + 1
    return dvi._arg(nbytes, is_signed)

111 

112 

# Argument-specification name -> reader function; every reader is called
# as ``reader(dvi, delta)``.
_arg_mapping = {
    'raw': _arg_raw,
    'u1': partial(_arg, 1, False),
    'u4': partial(_arg, 4, False),
    's4': partial(_arg, 4, True),
    'slen': _arg_slen,
    'olen1': _arg_olen1,
    'slen1': _arg_slen1,
    'ulen1': _arg_ulen1,
}

121 

122 

def _dispatch(table, min, max=None, state=None, args=('raw',)):
    """
    Decorator for dispatch by opcode.

    Sets the entries of *table* from *min* to *max* to the decorated
    method, wrapped so that it first checks that the Dvi state matches
    *state* (if not None) and then reads its arguments from the file
    according to *args*.

    Parameters
    ----------
    table : list
        The dispatch table to be filled in, indexed by opcode.
    min : int
        Minimum opcode for calling this function.
    max : int, optional
        Maximum opcode for calling this function, None if only *min* is
        allowed.
    state : optional
        State of the Dvi object in which these opcodes are allowed; None
        disables the check.
    args : sequence of str
        Argument specifications, each one of:

        ``'raw'``: opcode minus minimum
        ``'u1'``: read one unsigned byte
        ``'u4'``: read four bytes, treat as an unsigned number
        ``'s4'``: read four bytes, treat as a signed number
        ``'slen'``: read (opcode - minimum) bytes, treat as signed
        ``'slen1'``: read (opcode - minimum + 1) bytes, treat as signed
        ``'ulen1'``: read (opcode - minimum + 1) bytes, treat as unsigned
        ``'olen1'``: read (opcode - minimum + 1) bytes, treat as unsigned
        if under four bytes, signed if four bytes

    Raises
    ------
    ValueError
        At decoration time, if any opcode in the range already has a
        handler in *table*; when the wrapped handler is called, if the
        state precondition fails.
    """
    # *min* and *max* shadow the builtins, but callers pass them by
    # keyword, so the names are part of the interface.
    last = min if max is None else max

    def decorate(method):
        get_args = [_arg_mapping[x] for x in args]

        @wraps(method)
        def wrapper(self, byte):
            if state is not None and self.state != state:
                raise ValueError("state precondition failed")
            return method(self, *[f(self, byte-min) for f in get_args])

        opcodes = range(min, last + 1)
        # Validate the whole range before touching the table so that a
        # conflicting registration leaves the table unmodified.  The
        # check applies to single opcodes as well as to ranges.
        for opcode in opcodes:
            if table[opcode] is not None:
                raise ValueError(
                    "opcode %d already has a handler" % opcode)
        for opcode in opcodes:
            table[opcode] = wrapper
        return wrapper
    return decorate

170 

171 

class Dvi:
    """
    A reader for a dvi ("device-independent") file, as produced by TeX.
    The current implementation can only iterate through pages in order,
    and does not even attempt to verify the postamble.

    This class can be used as a context manager to close the underlying
    file upon exit. Pages can be read via iteration. Here is an overly
    simple way to extract text without trying to detect whitespace::

        >>> with matplotlib.dviread.Dvi('input.dvi', 72) as dvi:
        ...     for page in dvi:
        ...         print(''.join(chr(t.glyph) for t in page.text))
    """
    # Dispatch table: opcode (0-255) -> handler, filled in by the
    # @_dispatch decorators on the methods below.
    _dtable = [None] * 256
    _dispatch = partial(_dispatch, _dtable)

    def __init__(self, filename, dpi):
        """
        Read the data from the file named *filename* and convert
        TeX's internal units to units of *dpi* per inch.
        *dpi* only sets the units and does not limit the resolution.
        Use None to return TeX's internal units.
        """
        _log.debug('Dvi: %s', filename)
        self.file = open(filename, 'rb')
        try:
            self.dpi = dpi
            self.fonts = {}
            self.state = _dvistate.pre
            self.baseline = self._get_baseline(filename)
        except BaseException:
            # Do not leak the file handle if initialization fails after
            # the file has been opened (e.g. a malformed baseline file).
            self.file.close()
            raise

    def _get_baseline(self, filename):
        """
        Return the depth stored in the ``.baseline`` file next to
        *filename* as a float, or None.

        The file is only consulted when ``rcParams['text.latex.preview']``
        is true and the file exists; it must contain exactly three
        whitespace-separated fields (height, depth, width).
        """
        if rcParams['text.latex.preview']:
            base, ext = os.path.splitext(filename)
            baseline_filename = base + ".baseline"
            if os.path.exists(baseline_filename):
                with open(baseline_filename, 'rb') as fd:
                    _height, depth, _width = fd.read().split()
                return float(depth)
        return None

    def __enter__(self):
        """
        Context manager enter method, does nothing.
        """
        return self

    def __exit__(self, etype, evalue, etrace):
        """
        Context manager exit method, closes the underlying file if it is open.
        """
        self.close()

    def __iter__(self):
        """
        Iterate through the pages of the file.

        Yields
        ------
        Page
            Details of all the text and box objects on the page.
            The Page tuple contains lists of Text and Box tuples and
            the page dimensions, and the Text and Box tuples contain
            coordinates transformed into a standard Cartesian
            coordinate system at the dpi value given when initializing.
            The coordinates are floating point numbers, but otherwise
            precision is not lost and coordinate values are not clipped to
            integers.
        """
        while self._read():
            yield self._output()

    def close(self):
        """
        Close the underlying file if it is open.
        """
        if not self.file.closed:
            self.file.close()

    def _output(self):
        """
        Output the text and boxes belonging to the most recent page.

        Returns a Page whose coordinates are raw dvi units when
        ``self.dpi`` is None and dpi units otherwise.
        """
        minx, miny, maxx, maxy = np.inf, np.inf, -np.inf, -np.inf
        maxy_pure = -np.inf
        for elt in self.text + self.boxes:
            if isinstance(elt, Box):
                x, y, h, w = elt
                e = 0           # zero depth
            else:               # glyph
                x, y, font, g, w = elt
                h, e = font._height_depth_of(g)
            minx = min(minx, x)
            miny = min(miny, y - h)
            maxx = max(maxx, x + w)
            maxy = max(maxy, y + e)
            maxy_pure = max(maxy_pure, y)
        if self._baseline_v is not None:
            maxy_pure = self._baseline_v  # This should normally be the case.
            self._baseline_v = None

        if not self.text and not self.boxes:  # Avoid infs/nans from inf+/-inf.
            return Page(text=[], boxes=[], width=0, height=0, descent=0)

        if self.dpi is None:
            # special case for ease of debugging: output raw dvi coordinates
            return Page(text=self.text, boxes=self.boxes,
                        width=maxx-minx, height=maxy_pure-miny,
                        descent=maxy-maxy_pure)

        # convert from TeX's "scaled points" to dpi units
        d = self.dpi / (72.27 * 2**16)
        if self.baseline is None:
            descent = (maxy - maxy_pure) * d
        else:
            descent = self.baseline

        text = [Text((x-minx)*d, (maxy-y)*d - descent, f, g, w*d)
                for (x, y, f, g, w) in self.text]
        boxes = [Box((x-minx)*d, (maxy-y)*d - descent, h*d, w*d)
                 for (x, y, h, w) in self.boxes]

        return Page(text=text, boxes=boxes, width=(maxx-minx)*d,
                    height=(maxy_pure-miny)*d, descent=descent)

    def _read(self):
        """
        Read one page from the file. Return True if successful,
        False if there were no more pages.
        """
        # Pages appear to start with the sequence
        #   bop (begin of page)
        #   xxx comment
        #   down
        #   push
        #     down, down
        #     push
        #       down (possibly multiple)
        #       push  <=  here, v is the baseline position.
        #         etc.
        # (dviasm is useful to explore this structure.)
        self._baseline_v = None
        while True:
            byte = self.file.read(1)[0]
            self._dtable[byte](self, byte)
            # The stack attribute only exists while inside a page, hence
            # the getattr with a default.
            if (self._baseline_v is None
                    and len(getattr(self, "stack", [])) == 3):
                self._baseline_v = self.v
            if byte == 140:                         # end of page
                return True
            if self.state is _dvistate.post_post:   # end of file
                self.close()
                return False

    def _arg(self, nbytes, signed=False):
        """
        Read and return an integer argument *nbytes* long.
        Signedness is determined by the *signed* keyword.

        The bytes are interpreted in big-endian order.

        Raises
        ------
        ValueError
            If fewer than *nbytes* bytes could be read from the file.
        """
        data = self.file.read(nbytes)
        if len(data) != nbytes:
            # int.from_bytes would silently decode a short read into a
            # wrong value, so reject truncated files explicitly.
            raise ValueError(
                "unexpected end of file: wanted %d bytes, got %d"
                % (nbytes, len(data)))
        return int.from_bytes(data, 'big', signed=signed)

    @_dispatch(min=0, max=127, state=_dvistate.inpage)
    def _set_char_immediate(self, char):
        """Typeset *char* (encoded in the opcode itself) and advance h."""
        self._put_char_real(char)
        self.h += self.fonts[self.f]._width_of(char)

    @_dispatch(min=128, max=131, state=_dvistate.inpage, args=('olen1',))
    def _set_char(self, char):
        """Typeset *char* (read from the file) and advance h."""
        self._put_char_real(char)
        self.h += self.fonts[self.f]._width_of(char)

    @_dispatch(132, state=_dvistate.inpage, args=('s4', 's4'))
    def _set_rule(self, a, b):
        """Place a rule of height *a* and width *b*, then advance h by *b*."""
        self._put_rule_real(a, b)
        self.h += b

    @_dispatch(min=133, max=136, state=_dvistate.inpage, args=('olen1',))
    def _put_char(self, char):
        """Typeset *char* without moving the current position."""
        self._put_char_real(char)

    def _put_char_real(self, char):
        """
        Record *char* in the current font at the current position.

        For a virtual font, the text and boxes of the character's packet
        are added instead, scaled by the font's scale and offset by the
        current position.
        """
        font = self.fonts[self.f]
        if font._vf is None:
            self.text.append(Text(self.h, self.v, font, char,
                                  font._width_of(char)))
        else:
            scale = font._scale
            for x, y, f, g, w in font._vf[char].text:
                newf = DviFont(scale=_mul2012(scale, f._scale),
                               tfm=f._tfm, texname=f.texname, vf=f._vf)
                self.text.append(Text(self.h + _mul2012(x, scale),
                                      self.v + _mul2012(y, scale),
                                      newf, g, newf._width_of(g)))
            self.boxes.extend([Box(self.h + _mul2012(x, scale),
                                   self.v + _mul2012(y, scale),
                                   _mul2012(a, scale), _mul2012(b, scale))
                               for x, y, a, b in font._vf[char].boxes])

    @_dispatch(137, state=_dvistate.inpage, args=('s4', 's4'))
    def _put_rule(self, a, b):
        """Place a rule of height *a* and width *b* without moving."""
        self._put_rule_real(a, b)

    def _put_rule_real(self, a, b):
        """Record a box at the current position if both sizes are positive."""
        if a > 0 and b > 0:
            self.boxes.append(Box(self.h, self.v, a, b))

    @_dispatch(138)
    def _nop(self, _):
        """Do nothing."""
        pass

    @_dispatch(139, state=_dvistate.outer, args=('s4',)*11)
    def _bop(self, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, p):
        """Begin a page: reset the position registers, stack and marks."""
        self.state = _dvistate.inpage
        self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0
        self.stack = []
        self.text = []          # list of Text objects
        self.boxes = []         # list of Box objects

    @_dispatch(140, state=_dvistate.inpage)
    def _eop(self, _):
        """End a page; text and boxes are kept for `_output`."""
        self.state = _dvistate.outer
        del self.h, self.v, self.w, self.x, self.y, self.z, self.stack

    @_dispatch(141, state=_dvistate.inpage)
    def _push(self, _):
        """Save the position registers on the stack."""
        self.stack.append((self.h, self.v, self.w, self.x, self.y, self.z))

    @_dispatch(142, state=_dvistate.inpage)
    def _pop(self, _):
        """Restore the position registers from the stack."""
        self.h, self.v, self.w, self.x, self.y, self.z = self.stack.pop()

    @_dispatch(min=143, max=146, state=_dvistate.inpage, args=('slen1',))
    def _right(self, b):
        """Move right by *b*."""
        self.h += b

    @_dispatch(min=147, max=151, state=_dvistate.inpage, args=('slen',))
    def _right_w(self, new_w):
        """Optionally set w to *new_w*, then move right by w."""
        if new_w is not None:
            self.w = new_w
        self.h += self.w

    @_dispatch(min=152, max=156, state=_dvistate.inpage, args=('slen',))
    def _right_x(self, new_x):
        """Optionally set x to *new_x*, then move right by x."""
        if new_x is not None:
            self.x = new_x
        self.h += self.x

    @_dispatch(min=157, max=160, state=_dvistate.inpage, args=('slen1',))
    def _down(self, a):
        """Move down by *a*."""
        self.v += a

    @_dispatch(min=161, max=165, state=_dvistate.inpage, args=('slen',))
    def _down_y(self, new_y):
        """Optionally set y to *new_y*, then move down by y."""
        if new_y is not None:
            self.y = new_y
        self.v += self.y

    @_dispatch(min=166, max=170, state=_dvistate.inpage, args=('slen',))
    def _down_z(self, new_z):
        """Optionally set z to *new_z*, then move down by z."""
        if new_z is not None:
            self.z = new_z
        self.v += self.z

    @_dispatch(min=171, max=234, state=_dvistate.inpage)
    def _fnt_num_immediate(self, k):
        """Select font *k* (encoded in the opcode itself)."""
        self.f = k

    @_dispatch(min=235, max=238, state=_dvistate.inpage, args=('olen1',))
    def _fnt_num(self, new_f):
        """Select font *new_f* (read from the file)."""
        self.f = new_f

    @_dispatch(min=239, max=242, args=('ulen1',))
    def _xxx(self, datalen):
        """Read a special of *datalen* bytes and log it; otherwise ignored."""
        special = self.file.read(datalen)
        _log.debug(
            'Dvi._xxx: encountered special: %s',
            ''.join([chr(ch) if 32 <= ch < 127 else '<%02x>' % ch
                     for ch in special]))

    @_dispatch(min=243, max=246, args=('olen1', 'u4', 'u4', 'u4', 'u1', 'u1'))
    def _fnt_def(self, k, c, s, d, a, l):
        """Define font number *k*; see `_fnt_def_real`."""
        self._fnt_def_real(k, c, s, d, a, l)

    def _fnt_def_real(self, k, c, s, d, a, l):
        """
        Read a font name of *a* + *l* bytes and register font number *k*.

        *c* is the checksum to compare against the tfm file (a zero on
        either side disables the comparison), *s* is the scale passed to
        `DviFont`, and the last *l* bytes of the name are used to look up
        the tfm and vf files.  *d* is not used.

        Raises
        ------
        FileNotFoundError
            If no tfm file is found for the font.
        ValueError
            If the checksums are both nonzero and differ.
        """
        n = self.file.read(a + l)
        fontname = n[-l:].decode('ascii')
        tfm = _tfmfile(fontname)
        if tfm is None:
            raise FileNotFoundError("missing font metrics file: %s" % fontname)
        if c != 0 and tfm.checksum != 0 and c != tfm.checksum:
            raise ValueError('tfm checksum mismatch: %s' % n)

        vf = _vffile(fontname)

        self.fonts[k] = DviFont(scale=s, tfm=tfm, texname=n, vf=vf)

    @_dispatch(247, state=_dvistate.pre, args=('u1', 'u4', 'u4', 'u4', 'u1'))
    def _pre(self, i, num, den, mag, k):
        """Process the preamble, accepting only the standard format/units."""
        self.file.read(k)  # comment in the dvi file
        if i != 2:
            raise ValueError("Unknown dvi format %d" % i)
        if num != 25400000 or den != 7227 * 2**16:
            raise ValueError("nonstandard units in dvi file")
            # meaning: TeX always uses those exact values, so it
            # should be enough for us to support those
            # (There are 72.27 pt to an inch so 7227 pt =
            # 7227 * 2**16 sp to 100 in. The numerator is multiplied
            # by 10^5 to get units of 10**-7 meters.)
        if mag != 1000:
            raise ValueError("nonstandard magnification in dvi file")
            # meaning: LaTeX seems to frown on setting \mag, so
            # I think we can assume this is constant
        self.state = _dvistate.outer

    @_dispatch(248, state=_dvistate.outer)
    def _post(self, _):
        """Enter the post_post state, which makes `_read` stop."""
        self.state = _dvistate.post_post
        # TODO: actually read the postamble and finale?
        # currently post_post just triggers closing the file

    @_dispatch(249)
    def _post_post(self, _):
        """Not implemented; reading stops before this opcode is reached."""
        raise NotImplementedError

    @_dispatch(min=250, max=255)
    def _malformed(self, offset):
        """Reject the undefined opcodes 250-255."""
        # Format the message; passing the number as a second argument to
        # ValueError would leave the "%d" placeholder unexpanded.
        raise ValueError("unknown command: byte %d" % (250 + offset))

508 

509 

class DviFont:
    """
    Encapsulation of a font that a DVI file can refer to.

    Instances carry the font's texname and size, compare equal when
    both of those match, and expose glyph widths in the same units as
    the AFM file.  The remaining, underscore-prefixed attributes are for
    internal use by dviread.py and do *not* take part in comparison.

    The size is in Adobe points (converted from TeX points).

    Parameters
    ----------
    scale : float
        Factor by which the font is scaled from its natural size.
    tfm : Tfm
        TeX font metrics for this font
    texname : bytes
       Name of the font as used internally by TeX and friends, as an
       ASCII bytestring.  This is usually very different from any external
       font names, and :class:`dviread.PsfontsMap` can be used to find
       the external name of the font.
    vf : Vf
       A TeX "virtual font" file, or None if this font is not virtual.

    Attributes
    ----------
    texname : bytes
    size : float
       Size of the font in Adobe points, converted from the slightly
       smaller TeX points.
    widths : list
       Widths of glyphs in glyph-space units, typically 1/1000ths of
       the point size.

    """
    __slots__ = ('texname', 'size', 'widths', '_scale', '_vf', '_tfm')

    def __init__(self, scale, tfm, texname, vf):
        cbook._check_isinstance(bytes, texname=texname)
        self.texname = texname
        self._scale, self._tfm, self._vf = scale, tfm, vf
        # TeX points (72.27 per inch, times 2**16) -> Adobe points (72/in).
        self.size = scale * (72.0 / (72.27 * 2**16))
        # One entry per code from 0 up to the largest code that has a
        # width; an empty metrics table yields an empty list.
        glyph_count = max(tfm.width, default=-1) + 1
        widths = []
        for code in range(glyph_count):
            widths.append((1000*tfm.width.get(code, 0)) >> 20)
        self.widths = widths

    def __eq__(self, other):
        if type(self) != type(other):
            return False
        return self.texname == other.texname and self.size == other.size

    def __ne__(self, other):
        equal = self.__eq__(other)
        return not equal

    def __repr__(self):
        return "<{}: {}>".format(type(self).__name__, self.texname)

    def _width_of(self, char):
        """Width of char in dvi units."""
        raw = self._tfm.width.get(char, None)
        if raw is None:
            _log.debug('No width for char %d in font %s.', char, self.texname)
            return 0
        return _mul2012(raw, self._scale)

    def _height_depth_of(self, char):
        """Height and depth of char in dvi units."""
        tables = (("height", self._tfm.height), ("depth", self._tfm.depth))
        dims = []
        for label, table in tables:
            raw = table.get(char, None)
            if raw is not None:
                dims.append(_mul2012(raw, self._scale))
                continue
            _log.debug('No %s for char %d in font %s',
                       label, char, self.texname)
            dims.append(0)
        return dims

593 

594 

class Vf(Dvi):
    r"""
    A virtual font (\*.vf file) containing subroutines for dvi files.

    Usage::

        vf = Vf(filename)
        glyph = vf[code]
        glyph.text, glyph.boxes, glyph.width

    Parameters
    ----------
    filename : str or path-like

    Notes
    -----
    The virtual font format is a derivative of dvi:
    http://mirrors.ctan.org/info/knuth/virtual-fonts
    This class reuses some of the machinery of `Dvi`
    but replaces the `_read` loop and dispatch mechanism.
    """

    def __init__(self, filename):
        Dvi.__init__(self, filename, 0)
        try:
            self._first_font = None
            self._chars = {}
            self._read()
        finally:
            # The whole file is parsed eagerly, so it never stays open.
            self.close()

    def __getitem__(self, code):
        """Return the Page recorded for the character packet *code*."""
        return self._chars[code]

    def _read(self):
        """
        Read the whole file: the preamble, the font definitions and all
        character packets, stopping at the postamble.

        Each packet's marks are stored in ``self._chars`` under the
        packet's character code.  Nothing is returned.
        """
        packet_char, packet_ends = None, None
        packet_len, packet_width = None, None
        while True:
            byte = self.file.read(1)[0]
            # If we are in a packet, execute the dvi instructions
            if self.state is _dvistate.inpage:
                byte_at = self.file.tell()-1
                if byte_at == packet_ends:
                    self._finalize_packet(packet_char, packet_width)
                    packet_len, packet_char, packet_width = None, None, None
                    # fall through to out-of-packet code
                elif byte_at > packet_ends:
                    raise ValueError("Packet length mismatch in vf file")
                else:
                    if byte in (139, 140) or byte >= 243:
                        raise ValueError(
                            "Inappropriate opcode %d in vf file" % byte)
                    Dvi._dtable[byte](self, byte)
                    continue

            # We are outside a packet
            if byte < 242:          # a short packet (length given by byte)
                packet_len = byte
                packet_char, packet_width = self._arg(1), self._arg(3)
                packet_ends = self._init_packet(byte)
                self.state = _dvistate.inpage
            elif byte == 242:       # a long packet
                packet_len, packet_char, packet_width = \
                    [self._arg(x) for x in (4, 4, 4)]
                # As for short packets, remember where the packet ends
                # and switch to executing its dvi instructions; otherwise
                # the packet body would be misread as vf opcodes.
                packet_ends = self._init_packet(packet_len)
                self.state = _dvistate.inpage
            elif 243 <= byte <= 246:
                k = self._arg(byte - 242, byte == 246)
                c, s, d, a, l = [self._arg(x) for x in (4, 4, 4, 1, 1)]
                self._fnt_def_real(k, c, s, d, a, l)
                # The first font defined is the initial font of every
                # packet (see _init_packet).
                if self._first_font is None:
                    self._first_font = k
            elif byte == 247:       # preamble
                i, k = self._arg(1), self._arg(1)
                x = self.file.read(k)
                cs, ds = self._arg(4), self._arg(4)
                self._pre(i, x, cs, ds)
            elif byte == 248:       # postamble (just some number of 248s)
                break
            else:
                raise ValueError("unknown vf opcode %d" % byte)

    def _init_packet(self, pl):
        """
        Reset the dvi registers and marks for a packet of *pl* bytes that
        starts at the current file position.

        Returns the file offset of the first byte after the packet.
        """
        if self.state != _dvistate.outer:
            raise ValueError("Misplaced packet in vf file")
        self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0
        self.stack, self.text, self.boxes = [], [], []
        self.f = self._first_font
        return self.file.tell() + pl

    def _finalize_packet(self, packet_char, packet_width):
        """Store the marks collected for *packet_char* and leave the packet."""
        self._chars[packet_char] = Page(
            text=self.text, boxes=self.boxes, width=packet_width,
            height=None, descent=None)
        self.state = _dvistate.outer

    def _pre(self, i, x, cs, ds):
        """
        Process the vf preamble.

        *i* is the format identifier (must be 202) and *x* the comment;
        *cs* (checksum) and *ds* (design size) are not used.
        """
        if self.state is not _dvistate.pre:
            raise ValueError("pre command in middle of vf file")
        if i != 202:
            raise ValueError("Unknown vf format %d" % i)
        if len(x):
            _log.debug('vf file comment: %s', x)
        self.state = _dvistate.outer
        # cs = checksum, ds = design size

703 

704 

def _fix2comp(num):
    """
    Reinterpret the unsigned 32-bit integer *num* as a two's complement
    signed value.
    """
    assert 0 <= num < 2**32
    sign_bit_set = num & 2**31
    return num - 2**32 if sign_bit_set else num

712 

713 

def _mul2012(num1, num2):
    """Multiply two numbers in 20.12 fixed point format."""
    # Kept as a dedicated function because the precedence of >> is easy
    # to get wrong when written inline.
    product = num1 * num2
    return product >> 20

718 

719 

class Tfm:
    """
    A TeX Font Metric file.

    Only the parts needed by the Dvi class are read: the checksum, the
    design size, and the width, height and depth of every character.

    Parameters
    ----------
    filename : str or path-like

    Attributes
    ----------
    checksum : int
       Used for verifying against the dvi file.
    design_size : int
       Design size of the font (unknown units)
    width, height, depth : dict
       Dimensions of each character, need to be scaled by the factor
       specified in the dvi file. These are dicts because indexing may
       not start from 0.
    """
    __slots__ = ('checksum', 'design_size', 'width', 'height', 'depth')

    def __init__(self, filename):
        _log.debug('opening tfm file %s', filename)
        with open(filename, 'rb') as stream:
            preamble = stream.read(24)
            lh, bc, ec, nw, nh, nd = struct.unpack('!6H', preamble[2:14])
            _log.debug('lh=%d, bc=%d, ec=%d, nw=%d, nh=%d, nd=%d',
                       lh, bc, ec, nw, nh, nd)
            header = stream.read(4*lh)
            self.checksum, self.design_size = struct.unpack(
                '!2I', header[:8])
            # there is also encoding information etc.
            char_info = stream.read(4*(ec-bc+1))
            # The width, height and depth tables follow one another and
            # hold 4-byte words.
            raw_tables = [stream.read(4*count) for count in (nw, nh, nd)]

        self.width, self.height, self.depth = {}, {}, {}
        width_table, height_table, depth_table = [
            struct.unpack('!%dI' % (len(raw) // 4), raw)
            for raw in raw_tables]
        for char in range(bc, ec+1):
            # Each character has a 4-byte info record: byte 0 indexes the
            # width table, byte 1 packs the height index (high nibble)
            # and the depth index (low nibble).
            offset = 4 * (char - bc)
            width_idx = char_info[offset]
            packed = char_info[offset+1]
            self.width[char] = _fix2comp(width_table[width_idx])
            self.height[char] = _fix2comp(height_table[packed >> 4])
            self.depth[char] = _fix2comp(depth_table[packed & 0xf])

770 

771 

# One entry of a psfonts.map style file; note that the generated class is
# named 'Font'.
PsFont = namedtuple(
    'Font', ['texname', 'psname', 'effects', 'encoding', 'filename'])

773 

774 

class PsfontsMap:
    """
    A psfonts.map formatted file, mapping TeX fonts to PS fonts.

    Usage::

     >>> map = PsfontsMap(find_tex_file('pdftex.map'))
     >>> entry = map[b'ptmbo8r']
     >>> entry.texname
     b'ptmbo8r'
     >>> entry.psname
     b'Times-Bold'
     >>> entry.encoding
     '/usr/local/texlive/2008/texmf-dist/fonts/enc/dvips/base/8r.enc'
     >>> entry.effects
     {'slant': 0.16700000000000001}
     >>> entry.filename

    Parameters
    ----------
    filename : str or path-like

    Notes
    -----
    For historical reasons, TeX knows many Type-1 fonts by different
    names than the outside world. (For one thing, the names have to
    fit in eight characters.) Also, TeX's native fonts are not Type-1
    but Metafont, which is nontrivial to convert to PostScript except
    as a bitmap. While high-quality conversions to Type-1 format exist
    and are shipped with modern TeX distributions, we need to know
    which Type-1 fonts are the counterparts of which native fonts. For
    these reasons a mapping is needed from internal font names to font
    file names.

    A texmf tree typically includes mapping files called e.g.
    :file:`psfonts.map`, :file:`pdftex.map`, or :file:`dvipdfm.map`.
    The file :file:`psfonts.map` is used by :program:`dvips`,
    :file:`pdftex.map` by :program:`pdfTeX`, and :file:`dvipdfm.map`
    by :program:`dvipdfm`. :file:`psfonts.map` might avoid embedding
    the 35 PostScript fonts (i.e., have no filename for them, as in
    the Times-Bold example above), while the pdf-related files perhaps
    only avoid the "Base 14" pdf fonts. But the user may have
    configured these files differently.
    """
    __slots__ = ('_font', '_filename')

    # Construction is memoized per (class, filename): asking for
    # `PsfontsMap(filename)` again with the same filename hands back the
    # object built the first time instead of re-parsing the file.
    @lru_cache()
    def __new__(cls, filename):
        instance = object.__new__(cls)
        instance._font = {}
        instance._filename = os.fsdecode(filename)
        with open(filename, 'rb') as stream:
            instance._parse(stream)
        return instance

    def __getitem__(self, texname):
        """
        Return the map entry for the TeX font name *texname* (bytes).

        File and encoding names that do not start with ``/`` are passed
        through `find_tex_file`.  An unknown name is logged at info level
        and the KeyError is re-raised.
        """
        assert isinstance(texname, bytes)
        try:
            entry = self._font[texname]
        except KeyError:
            template = (
                'A PostScript file for the font whose TeX name is "{0}" '
                'could not be found in the file "{1}". The dviread module '
                'can only handle fonts that have an associated PostScript '
                'font file. '
                'This problem can often be solved by installing '
                'a suitable PostScript font package in your (TeX) '
                'package manager.')
            _log.info(textwrap.fill(
                template.format(texname.decode('ascii'), self._filename),
                break_on_hyphens=False, break_long_words=False))
            raise
        font_file = entry.filename
        encoding_file = entry.encoding
        if font_file is not None and not font_file.startswith(b'/'):
            font_file = find_tex_file(font_file)
        if encoding_file is not None and not encoding_file.startswith(b'/'):
            encoding_file = find_tex_file(entry.encoding)
        return entry._replace(filename=font_file, encoding=encoding_file)

    def _parse(self, file):
        """
        Parse the font mapping file.

        The format is, AFAIK: texname fontname [effects and filenames]
        Effects are PostScript snippets like ".177 SlantFont",
        filenames begin with one or two less-than signs. A filename
        ending in enc is an encoding file, other filenames are font
        files. This can be overridden with a left bracket: <[foobar
        indicates an encoding file named foobar.

        There is some difference between <foo.pfb and <<bar.pfb in
        subsetting, but I have no example of << in my TeX installation.
        """
        # If the map file specifies multiple encodings for a font, we
        # follow pdfTeX in choosing the last one specified. Such
        # entries are probably mistakes but they have occurred.
        # http://tex.stackexchange.com/questions/10826/
        # http://article.gmane.org/gmane.comp.tex.pdftex/4914

        empty_re = re.compile(br'%|\s*$')
        word_re = re.compile(
            br'''(?x) (?:
                 "<\[ (?P<enc1>  [^"]+    )" | # quoted encoding marked by [
                 "<   (?P<enc2>  [^"]+.enc)" | # quoted encoding, ends in .enc
                 "<<? (?P<file1> [^"]+    )" | # quoted font file name
                 "    (?P<eff1>  [^"]+    )" | # quoted effects or font name
                 <\[  (?P<enc3>  \S+      )  | # encoding marked by [
                 <    (?P<enc4>  \S+  .enc)  | # encoding, ends in .enc
                 <<?  (?P<file2> \S+      )  | # font file name
                      (?P<eff2>  \S+      )    # effects or font name
                 )''')
        effects_re = re.compile(
            br'''(?x) (?P<slant> -?[0-9]*(?:\.[0-9]+)) \s* SlantFont
                    | (?P<extend>-?[0-9]*(?:\.[0-9]+)) \s* ExtendFont''')

        for raw_line in file:
            # Comment lines and blank lines are skipped; the test is made
            # on the line as read, before stripping.
            if empty_re.match(raw_line):
                continue
            line = raw_line.strip()
            effects, encoding, filename = b'', None, None
            tokens = word_re.finditer(line)

            # The named groups are mutually exclusive; they are tried in
            # an estimated order of likelihood.  The first two words are
            # the TeX name and the PostScript name, probably unquoted:
            head = next(tokens)
            texname = head.group('eff2') or head.group('eff1')
            head = next(tokens)
            psname = head.group('eff2') or head.group('eff1')

            for token in tokens:
                # Any effects are almost always quoted:
                effect_text = token.group('eff1') or token.group('eff2')
                if effect_text:
                    effects = effect_text
                    continue
                # Encoding files usually have the .enc suffix
                # and almost never need quoting:
                enc_name = (token.group('enc4') or token.group('enc3') or
                            token.group('enc2') or token.group('enc1'))
                if enc_name:
                    if encoding is not None:
                        _log.debug('Multiple encodings for %s = %s',
                                   texname, psname)
                    encoding = enc_name
                    continue
                # File names are probably unquoted:
                filename = token.group('file2') or token.group('file1')

            effects_dict = {}
            for found in effects_re.finditer(effects):
                slant = found.group('slant')
                if slant:
                    effects_dict['slant'] = float(slant)
                else:
                    effects_dict['extend'] = float(found.group('extend'))

            self._font[texname] = PsFont(
                texname=texname, psname=psname, effects=effects_dict,
                encoding=encoding, filename=filename)

939 

940 

class Encoding:
    r"""
    Parse a \*.enc file referenced from a psfonts.map style file.

    The format this class understands is a very limited subset of
    PostScript.

    Usage (subject to change)::

        for name in Encoding(filename):
            whatever(name)

    Parameters
    ----------
    filename : str or path-like
        Path of the encoding file; it is opened in binary mode.

    Attributes
    ----------
    encoding : list of bytes
        The glyph names found between the first ``[`` and the next ``]``,
        in file order and without their leading slash.
    """
    __slots__ = ('encoding',)

    def __init__(self, filename):
        with open(filename, 'rb') as file:
            _log.debug('Parsing TeX encoding %s', filename)
            self.encoding = self._parse(file)
            _log.debug('Result: %s', self.encoding)

    def __iter__(self):
        yield from self.encoding

    @staticmethod
    def _parse(file):
        """
        Extract the glyph names of the encoding array.

        Parameters
        ----------
        file : iterable of bytes
            The lines of the encoding file.

        Returns
        -------
        list of bytes
            One entry per ``/name`` token located between the first ``[``
            and the first ``]`` that follows it.

        Raises
        ------
        ValueError
            If no ``[`` is present, or no ``]`` follows it.
        """
        # Drop everything after a '%' (comment) and the surrounding
        # whitespace of each line.
        lines = (line.split(b'%', 1)[0].strip() for line in file)
        # Keep a separator between lines: joining with b'' would glue the
        # last name of a line to the first name of the next one.
        data = b'\n'.join(lines)
        beginning = data.find(b'[')
        if beginning < 0:
            raise ValueError("Cannot locate beginning of encoding in {}"
                             .format(file))
        data = data[beginning:]
        end = data.find(b']')
        if end < 0:
            raise ValueError("Cannot locate end of encoding in {}"
                             .format(file))
        data = data[:end]
        # A slash starts a new name, so it can never be part of one; this
        # also splits names written back to back ("/a/b").
        return re.findall(br'/([^\]\[{}<>\s/]+)', data)


# Note: _parse_enc below should ultimately replace the Encoding class.
# Encoding._parse used to join the lines of the file with b'', which left no
# whitespace between glyph names (only slashes), so that the final re.findall
# merged them into a single string. That did not appear to bother
# backend_pdf, which still needs to be investigated. Encoding yields bytes
# names read from a binary file; _parse_enc yields str names, which is what
# textpath/backend_svg need.

995def _parse_enc(path): 

996 r""" 

997 Parses a \*.enc file referenced from a psfonts.map style file. 

998 The format this class understands is a very limited subset of PostScript. 

999 

1000 Parameters 

1001 ---------- 

1002 path : os.PathLike 

1003 

1004 Returns 

1005 ------- 

1006 encoding : list 

1007 The nth entry of the list is the PostScript glyph name of the nth 

1008 glyph. 

1009 """ 

1010 with open(path, encoding="ascii") as file: 

1011 no_comments = "\n".join(line.split("%")[0].rstrip() for line in file) 

1012 array = re.search(r"(?s)\[(.*)\]", no_comments).group(1) 

1013 lines = [line for line in array.split() if line] 

1014 if all(line.startswith("/") for line in lines): 

1015 return [line[1:] for line in lines] 

1016 else: 

1017 raise ValueError( 

1018 "Failed to parse {} as Postscript encoding".format(path)) 

1019 

1020 

@lru_cache()
def find_tex_file(filename, format=None):
    """
    Locate a file in the texmf tree.

    The lookup is delegated to :program:`kpsewhich`, the command line
    interface to the kpathsea library [1]_. Most existing TeX distributions
    on Unix-like systems use kpathsea; it is also available as part of
    MikTeX, a popular distribution on Windows.

    *If the file is not found, an empty string is returned*.

    Parameters
    ----------
    filename : str or path-like
    format : str or bytes, optional
        Passed as the value of the `--format` option of :program:`kpsewhich`,
        e.g. 'tfm' or 'vf' to restrict the search to that type of files.
        The option is omitted when *format* is None.

    References
    ----------
    .. [1] `Kpathsea documentation <http://www.tug.org/kpathsea/>`_
        The library that :program:`kpsewhich` is part of.
    """

    # Both arguments are expected to be plain ascii; utf-8 with replacement
    # is used only as a precaution.
    if isinstance(filename, bytes):
        filename = filename.decode('utf-8', errors='replace')
    if isinstance(format, bytes):
        format = format.decode('utf-8', errors='replace')

    on_windows = os.name == 'nt'

    # On Windows only, kpathsea can use utf-8 for cmd args and output.
    # The `command_line_encoding` environment variable is set to force it
    # to always use utf-8 encoding. See Matplotlib issue #11848.
    run_options = (
        {'env': dict(os.environ, command_line_encoding='utf-8')}
        if on_windows else {})

    command = ['kpsewhich']
    if format is not None:
        command.append('--format=' + format)
    command.append(filename)

    try:
        output = cbook._check_and_log_subprocess(
            command, _log, **run_options)
    except RuntimeError:
        # Reported to the caller as "not found".
        return ''

    if on_windows:
        return output.decode('utf-8').rstrip('\r\n')
    return os.fsdecode(output).rstrip('\n')

1073 

1074 

@lru_cache()
def _fontfile(cls, suffix, texname):
    """
    Locate the file ``texname + suffix`` and load it with *cls*.

    Parameters
    ----------
    cls : callable
        Called with the path returned by `find_tex_file`.
    suffix : str
        File extension, including the leading dot.
    texname : str
        Name of the font, without extension.

    Returns
    -------
    object or None
        ``cls(path)``, or None when `find_tex_file` returns an empty string.
        Results are cached per ``(cls, suffix, texname)`` by `lru_cache`.
    """
    path = find_tex_file(texname + suffix)
    if not path:
        return None
    return cls(path)


# Single-argument loaders taking a texname: ".tfm" files are loaded as Tfm
# and ".vf" files as Vf, both going through the cache of _fontfile.
_tfmfile = partial(_fontfile, Tfm, ".tfm")
_vffile = partial(_fontfile, Vf, ".vf")

1083 

1084 

if __name__ == '__main__':
    from argparse import ArgumentParser
    import itertools

    # Command line: the dvi file to dump and an optional dpi, which is
    # converted to float and is None when omitted.
    arg_parser = ArgumentParser()
    arg_parser.add_argument("filename")
    arg_parser.add_argument("dpi", nargs="?", type=float, default=None)
    options = arg_parser.parse_args()

    with Dvi(options.filename, options.dpi) as dvi:
        # The font map is loaded here but not consulted by the dump below.
        fontmap = PsfontsMap(find_tex_file('pdftex.map'))
        for page in dvi:
            print('=== new page ===')
            # Consecutive text items that share a font are listed under a
            # single "font" header line.
            runs = itertools.groupby(page.text, lambda item: item.font)
            for font, run in runs:
                print('font', font.texname, 'scaled', font._scale / 2 ** 20)
                for item in run:
                    char = chr(item.glyph)
                    print(item.x, item.y, item.glyph,
                          char if char.isprintable() else ".",
                          item.width)
            # Fields are unpacked in the order given in the module
            # docstring (x, y, height, width) and printed in that order.
            for x, y, height, width in page.boxes:
                print(x, y, 'BOX', height, width)