Coverage for /home/martinb/.local/share/virtualenvs/camcops/lib/python3.6/site-packages/matplotlib/dviread.py : 23%

Hot-keys on this page
r m x p toggle line displays
j k next/prev highlighted chunk
0 (zero) top of page
1 (one) first highlighted chunk
1"""
2A module for reading dvi files output by TeX. Several limitations make
3this not (currently) useful as a general-purpose dvi preprocessor, but
4it is currently used by the pdf backend for processing usetex text.
6Interface::
8 with Dvi(filename, 72) as dvi:
9 # iterate over pages:
10 for page in dvi:
11 w, h, d = page.width, page.height, page.descent
12 for x, y, font, glyph, width in page.text:
13 fontname = font.texname
14 pointsize = font.size
15 ...
16 for x, y, height, width in page.boxes:
17 ...
18"""
20from collections import namedtuple
21import enum
22from functools import lru_cache, partial, wraps
23import logging
24import os
25import re
26import struct
27import textwrap
29import numpy as np
31from matplotlib import cbook, rcParams
33_log = logging.getLogger(__name__)
35# Many dvi related files are looked for by external processes, require
36# additional parsing, and are used many times per rendering, which is why they
37# are cached using lru_cache().
39# Dvi is a bytecode format documented in
40# http://mirrors.ctan.org/systems/knuth/dist/texware/dvitype.web
41# http://texdoc.net/texmf-dist/doc/generic/knuth/texware/dvitype.pdf
42#
43# The file consists of a preamble, some number of pages, a postamble,
44# and a finale. Different opcodes are allowed in different contexts,
45# so the Dvi object has a parser state:
46#
47# pre: expecting the preamble
48# outer: between pages (followed by a page or the postamble,
49# also e.g. font definitions are allowed)
50# inpage: processing a page
51# post_post: state after the postamble (our current implementation
52# just stops reading)
53# finale: the finale (unimplemented in our current implementation)
# Parser states of a Dvi reader, listed in the order they are entered.
_dvistate = enum.Enum(
    'DviState', ['pre', 'outer', 'inpage', 'post_post', 'finale'])

# A page consists of marks (text and boxes) and also has dimensions.
Page = namedtuple('Page', ['text', 'boxes', 'height', 'width', 'descent'])
Text = namedtuple('Text', ['x', 'y', 'font', 'glyph', 'width'])
Box = namedtuple('Box', ['x', 'y', 'height', 'width'])
63# Opcode argument parsing
64#
65# Each of the following functions takes a Dvi object and delta,
66# which is the difference between the opcode and the minimum opcode
67# with the same meaning. Dvi opcodes often encode the number of
68# argument bytes in this delta.
def _arg_raw(dvi, delta):
    """Use *delta* itself as the argument; nothing is read from *dvi*."""
    return delta
def _arg(bytes, signed, dvi, _):
    """
    Read a fixed-size integer argument from *dvi*.

    *bytes* is the number of bytes to read and *signed* selects a signed
    (true) or unsigned (false) interpretation.  The opcode delta (last
    parameter) is ignored.
    """
    nbytes = bytes
    return dvi._arg(nbytes, signed)
def _arg_slen(dvi, delta):
    """
    Signed argument of length *delta*.

    Return None when *delta* is zero (nothing is read); otherwise read
    *delta* bytes from *dvi* and return them as a signed integer.
    """
    return None if delta == 0 else dvi._arg(delta, True)
def _arg_slen1(dvi, delta):
    """
    Signed argument of length *delta* + 1.

    Read *delta* + 1 bytes from *dvi* and return them as a signed integer.
    """
    nbytes = delta + 1
    return dvi._arg(nbytes, True)
def _arg_ulen1(dvi, delta):
    """
    Unsigned argument of length *delta* + 1.

    Read *delta* + 1 bytes from *dvi* and return them as an unsigned integer.
    """
    nbytes = delta + 1
    return dvi._arg(nbytes, False)
def _arg_olen1(dvi, delta):
    """
    Optionally signed argument of length *delta* + 1.

    Read *delta* + 1 bytes from *dvi*.  The value is treated as signed only
    when *delta* == 3 (i.e. four bytes are read), unsigned otherwise.
    """
    is_signed = (delta == 3)
    return dvi._arg(delta + 1, is_signed)
# Maps the argument-specification names accepted by ``_dispatch`` to the
# reader functions above.  Each reader is called as ``reader(dvi, delta)``.
_arg_mapping = {
    'raw': _arg_raw,
    'u1': partial(_arg, 1, False),
    'u4': partial(_arg, 4, False),
    's4': partial(_arg, 4, True),
    'slen': _arg_slen,
    'olen1': _arg_olen1,
    'slen1': _arg_slen1,
    'ulen1': _arg_ulen1,
}
def _dispatch(table, min, max=None, state=None, args=('raw',)):
    """
    Decorator factory registering a method as an opcode handler.

    The decorated method is stored in *table* for every opcode from *min*
    to *max* (inclusive).  When called through the table, the wrapper first
    checks the Dvi state (if *state* is given), then decodes the arguments
    described by *args* and passes them to the method.

    *table*
        the dispatch table to be filled in

    *min*
        minimum opcode for calling this function

    *max*
        maximum opcode for calling this function, None if only *min* is
        allowed

    *state*
        state of the Dvi object in which these opcodes are allowed, or
        None to skip the check

    *args*
        sequence of argument specifications:

        ``'raw'``: opcode minus minimum
        ``'u1'``: read one unsigned byte
        ``'u4'``: read four bytes, treat as an unsigned number
        ``'s4'``: read four bytes, treat as a signed number
        ``'slen'``: read (opcode - minimum) bytes, treat as signed
        ``'slen1'``: read (opcode - minimum + 1) bytes, treat as signed
        ``'ulen1'``: read (opcode - minimum + 1) bytes, treat as unsigned
        ``'olen1'``: read (opcode - minimum + 1) bytes, treat as unsigned
                     if under four bytes, signed if four bytes
    """
    def decorate(method):
        # Resolve the spec names to reader callables once, at decoration time.
        readers = [_arg_mapping[spec] for spec in args]

        @wraps(method)
        def wrapper(self, byte):
            if state is not None and self.state != state:
                raise ValueError("state precondition failed")
            decoded = [read(self, byte - min) for read in readers]
            return method(self, *decoded)

        if max is None:
            table[min] = wrapper
            return wrapper
        for opcode in range(min, max + 1):
            # Each opcode of a range may be claimed by one handler only.
            assert table[opcode] is None
            table[opcode] = wrapper
        return wrapper
    return decorate
class Dvi:
    """
    A reader for a dvi ("device-independent") file, as produced by TeX.
    The current implementation can only iterate through pages in order,
    and does not even attempt to verify the postamble.

    This class can be used as a context manager to close the underlying
    file upon exit. Pages can be read via iteration. Here is an overly
    simple way to extract text without trying to detect whitespace::

        >>> with matplotlib.dviread.Dvi('input.dvi', 72) as dvi:
        ...     for page in dvi:
        ...         print(''.join(chr(t.glyph) for t in page.text))
    """
    # Dispatch table indexed by opcode byte; filled in by the ``_dispatch``
    # decorator applied to the handler methods below.
    _dtable = [None] * 256
    _dispatch = partial(_dispatch, _dtable)

    def __init__(self, filename, dpi):
        """
        Read the data from the file named *filename* and convert
        TeX's internal units to units of *dpi* per inch.
        *dpi* only sets the units and does not limit the resolution.
        Use None to return TeX's internal units.
        """
        _log.debug('Dvi: %s', filename)
        self.file = open(filename, 'rb')
        self.dpi = dpi
        self.fonts = {}
        self.state = _dvistate.pre
        try:
            self.baseline = self._get_baseline(filename)
        except Exception:
            # Do not leak the open file if the baseline file is unreadable
            # or malformed.
            self.file.close()
            raise

    def _get_baseline(self, filename):
        """
        Return the depth stored in the ``.baseline`` file next to *filename*.

        The file is only consulted when ``rcParams['text.latex.preview']``
        is set and the file exists; otherwise None is returned.  The file
        must contain exactly three whitespace-separated fields (height,
        depth, width), else a ValueError is raised by the unpacking.
        """
        if not rcParams['text.latex.preview']:
            return None
        base, _ext = os.path.splitext(filename)
        baseline_filename = base + ".baseline"
        if not os.path.exists(baseline_filename):
            return None
        with open(baseline_filename, 'rb') as fd:
            fields = fd.read().split()
        _height, depth, _width = fields
        return float(depth)

    def __enter__(self):
        """
        Context manager enter method, does nothing.
        """
        return self

    def __exit__(self, etype, evalue, etrace):
        """
        Context manager exit method, closes the underlying file if it is open.
        """
        self.close()

    def __iter__(self):
        """
        Iterate through the pages of the file.

        Yields
        ------
        Page
            Details of all the text and box objects on the page.
            The Page tuple contains lists of Text and Box tuples and
            the page dimensions, and the Text and Box tuples contain
            coordinates transformed into a standard Cartesian
            coordinate system at the dpi value given when initializing.
            The coordinates are floating point numbers, but otherwise
            precision is not lost and coordinate values are not clipped to
            integers.
        """
        while self._read():
            yield self._output()

    def close(self):
        """
        Close the underlying file if it is open.
        """
        if not self.file.closed:
            self.file.close()

    def _output(self):
        """
        Output the text and boxes belonging to the most recent page.
        page = dvi._output()
        """
        minx, miny, maxx, maxy = np.inf, np.inf, -np.inf, -np.inf
        maxy_pure = -np.inf
        for elt in self.text + self.boxes:
            if isinstance(elt, Box):
                x, y, h, w = elt
                e = 0  # zero depth
            else:  # glyph
                x, y, font, g, w = elt
                h, e = font._height_depth_of(g)
            minx = min(minx, x)
            miny = min(miny, y - h)
            maxx = max(maxx, x + w)
            maxy = max(maxy, y + e)
            maxy_pure = max(maxy_pure, y)
        if self._baseline_v is not None:
            maxy_pure = self._baseline_v  # This should normally be the case.
            self._baseline_v = None

        if not self.text and not self.boxes:  # Avoid infs/nans from inf+/-inf.
            return Page(text=[], boxes=[], width=0, height=0, descent=0)

        if self.dpi is None:
            # special case for ease of debugging: output raw dvi coordinates
            return Page(text=self.text, boxes=self.boxes,
                        width=maxx-minx, height=maxy_pure-miny,
                        descent=maxy-maxy_pure)

        # convert from TeX's "scaled points" to dpi units
        d = self.dpi / (72.27 * 2**16)
        if self.baseline is None:
            descent = (maxy - maxy_pure) * d
        else:
            descent = self.baseline

        text = [Text((x-minx)*d, (maxy-y)*d - descent, f, g, w*d)
                for (x, y, f, g, w) in self.text]
        boxes = [Box((x-minx)*d, (maxy-y)*d - descent, h*d, w*d)
                 for (x, y, h, w) in self.boxes]

        return Page(text=text, boxes=boxes, width=(maxx-minx)*d,
                    height=(maxy_pure-miny)*d, descent=descent)

    def _read(self):
        """
        Read one page from the file. Return True if successful,
        False if there were no more pages.
        """
        # Pages appear to start with the sequence
        #   bop (begin of page)
        #   xxx comment
        #   down
        #   push
        #     down, down
        #     push
        #       down (possibly multiple)
        #       push  <=  here, v is the baseline position.
        #         etc.
        # (dviasm is useful to explore this structure.)
        self._baseline_v = None
        while True:
            byte = self.file.read(1)[0]
            self._dtable[byte](self, byte)
            # ``stack`` only exists while inside a page, hence the getattr.
            if (self._baseline_v is None
                    and len(getattr(self, "stack", [])) == 3):
                self._baseline_v = self.v
            if byte == 140:                         # end of page
                return True
            if self.state is _dvistate.post_post:   # end of file
                self.close()
                return False

    def _arg(self, nbytes, signed=False):
        """
        Read and return an integer argument *nbytes* long.
        Signedness is determined by the *signed* keyword.
        """
        data = self.file.read(nbytes)
        value = data[0]
        if signed and value >= 0x80:
            value = value - 0x100
        for i in range(1, nbytes):
            value = 0x100*value + data[i]
        return value

    @_dispatch(min=0, max=127, state=_dvistate.inpage)
    def _set_char_immediate(self, char):
        """Typeset *char* (the opcode itself) and advance h by its width."""
        self._put_char_real(char)
        self.h += self.fonts[self.f]._width_of(char)

    @_dispatch(min=128, max=131, state=_dvistate.inpage, args=('olen1',))
    def _set_char(self, char):
        """Typeset *char* (read from the file) and advance h by its width."""
        self._put_char_real(char)
        self.h += self.fonts[self.f]._width_of(char)

    @_dispatch(132, state=_dvistate.inpage, args=('s4', 's4'))
    def _set_rule(self, a, b):
        """Record a rule of height *a* and width *b*, then advance h by *b*."""
        self._put_rule_real(a, b)
        self.h += b

    @_dispatch(min=133, max=136, state=_dvistate.inpage, args=('olen1',))
    def _put_char(self, char):
        """Typeset *char* without moving the current position."""
        self._put_char_real(char)

    def _put_char_real(self, char):
        """
        Append the marks for glyph *char* of the current font at (h, v).

        For an ordinary font a single Text is appended.  For a virtual font
        the text and boxes of the corresponding packet are appended, offset
        by the current position and scaled by the font's scale.
        """
        font = self.fonts[self.f]
        if font._vf is None:
            self.text.append(Text(self.h, self.v, font, char,
                                  font._width_of(char)))
        else:
            scale = font._scale
            for x, y, f, g, w in font._vf[char].text:
                newf = DviFont(scale=_mul2012(scale, f._scale),
                               tfm=f._tfm, texname=f.texname, vf=f._vf)
                self.text.append(Text(self.h + _mul2012(x, scale),
                                      self.v + _mul2012(y, scale),
                                      newf, g, newf._width_of(g)))
            self.boxes.extend([Box(self.h + _mul2012(x, scale),
                                   self.v + _mul2012(y, scale),
                                   _mul2012(a, scale), _mul2012(b, scale))
                               for x, y, a, b in font._vf[char].boxes])

    @_dispatch(137, state=_dvistate.inpage, args=('s4', 's4'))
    def _put_rule(self, a, b):
        """Record a rule of height *a* and width *b* without moving."""
        self._put_rule_real(a, b)

    def _put_rule_real(self, a, b):
        """Append a Box at (h, v); rules with a non-positive side are skipped."""
        if a > 0 and b > 0:
            self.boxes.append(Box(self.h, self.v, a, b))

    @_dispatch(138)
    def _nop(self, _):
        """Do nothing."""
        pass

    @_dispatch(139, state=_dvistate.outer, args=('s4',)*11)
    def _bop(self, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, p):
        """Begin a page: reset the position registers, stack and marks."""
        self.state = _dvistate.inpage
        self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0
        self.stack = []
        self.text = []          # list of Text objects
        self.boxes = []         # list of Box objects

    @_dispatch(140, state=_dvistate.inpage)
    def _eop(self, _):
        """End a page: drop the per-page registers, keep text and boxes."""
        self.state = _dvistate.outer
        del self.h, self.v, self.w, self.x, self.y, self.z, self.stack

    @_dispatch(141, state=_dvistate.inpage)
    def _push(self, _):
        """Save the six position registers on the stack."""
        self.stack.append((self.h, self.v, self.w, self.x, self.y, self.z))

    @_dispatch(142, state=_dvistate.inpage)
    def _pop(self, _):
        """Restore the six position registers from the stack."""
        self.h, self.v, self.w, self.x, self.y, self.z = self.stack.pop()

    @_dispatch(min=143, max=146, state=_dvistate.inpage, args=('slen1',))
    def _right(self, b):
        """Move right by *b*."""
        self.h += b

    @_dispatch(min=147, max=151, state=_dvistate.inpage, args=('slen',))
    def _right_w(self, new_w):
        """Optionally set w to *new_w*, then move right by w."""
        if new_w is not None:
            self.w = new_w
        self.h += self.w

    @_dispatch(min=152, max=156, state=_dvistate.inpage, args=('slen',))
    def _right_x(self, new_x):
        """Optionally set x to *new_x*, then move right by x."""
        if new_x is not None:
            self.x = new_x
        self.h += self.x

    @_dispatch(min=157, max=160, state=_dvistate.inpage, args=('slen1',))
    def _down(self, a):
        """Move down by *a*."""
        self.v += a

    @_dispatch(min=161, max=165, state=_dvistate.inpage, args=('slen',))
    def _down_y(self, new_y):
        """Optionally set y to *new_y*, then move down by y."""
        if new_y is not None:
            self.y = new_y
        self.v += self.y

    @_dispatch(min=166, max=170, state=_dvistate.inpage, args=('slen',))
    def _down_z(self, new_z):
        """Optionally set z to *new_z*, then move down by z."""
        if new_z is not None:
            self.z = new_z
        self.v += self.z

    @_dispatch(min=171, max=234, state=_dvistate.inpage)
    def _fnt_num_immediate(self, k):
        """Select font *k* (encoded in the opcode) as the current font."""
        self.f = k

    @_dispatch(min=235, max=238, state=_dvistate.inpage, args=('olen1',))
    def _fnt_num(self, new_f):
        """Select font *new_f* (read from the file) as the current font."""
        self.f = new_f

    @_dispatch(min=239, max=242, args=('ulen1',))
    def _xxx(self, datalen):
        """Read a *datalen*-byte special and log it; it is otherwise ignored."""
        special = self.file.read(datalen)
        _log.debug(
            'Dvi._xxx: encountered special: %s',
            ''.join([chr(ch) if 32 <= ch < 127 else '<%02x>' % ch
                     for ch in special]))

    @_dispatch(min=243, max=246, args=('olen1', 'u4', 'u4', 'u4', 'u1', 'u1'))
    def _fnt_def(self, k, c, s, d, a, l):
        """Define font number *k*; see `_fnt_def_real` for the parameters."""
        self._fnt_def_real(k, c, s, d, a, l)

    def _fnt_def_real(self, k, c, s, d, a, l):
        """
        Register font number *k* in ``self.fonts``.

        *a* + *l* bytes are read from the file; the last *l* bytes are the
        TeX font name.  *c* is compared to the tfm checksum (unless either
        is zero) and *s* is used as the font scale.  *d* is not used.

        Raises FileNotFoundError if no tfm file is found for the font and
        ValueError on a checksum mismatch.
        """
        n = self.file.read(a + l)
        fontname = n[-l:].decode('ascii')
        tfm = _tfmfile(fontname)
        if tfm is None:
            raise FileNotFoundError("missing font metrics file: %s" % fontname)
        if c != 0 and tfm.checksum != 0 and c != tfm.checksum:
            raise ValueError('tfm checksum mismatch: %s' % n)

        vf = _vffile(fontname)

        self.fonts[k] = DviFont(scale=s, tfm=tfm, texname=n, vf=vf)

    @_dispatch(247, state=_dvistate.pre, args=('u1', 'u4', 'u4', 'u4', 'u1'))
    def _pre(self, i, num, den, mag, k):
        """Validate the preamble and move to the ``outer`` state."""
        self.file.read(k)  # comment in the dvi file
        if i != 2:
            raise ValueError("Unknown dvi format %d" % i)
        if num != 25400000 or den != 7227 * 2**16:
            raise ValueError("nonstandard units in dvi file")
            # meaning: TeX always uses those exact values, so it
            # should be enough for us to support those
            # (There are 72.27 pt to an inch so 7227 pt =
            # 7227 * 2**16 sp to 100 in. The numerator is multiplied
            # by 10^5 to get units of 10**-7 meters.)
        if mag != 1000:
            raise ValueError("nonstandard magnification in dvi file")
            # meaning: LaTeX seems to frown on setting \mag, so
            # I think we can assume this is constant
        self.state = _dvistate.outer

    @_dispatch(248, state=_dvistate.outer)
    def _post(self, _):
        """Enter the ``post_post`` state, which makes `_read` stop."""
        self.state = _dvistate.post_post
        # TODO: actually read the postamble and finale?
        # currently post_post just triggers closing the file

    @_dispatch(249)
    def _post_post(self, _):
        """Not implemented; reading stops before this opcode is reached."""
        raise NotImplementedError

    @_dispatch(min=250, max=255)
    def _malformed(self, offset):
        """Reject opcodes 250-255, which have no handler here."""
        # Interpolate the opcode into the message; passing it as a second
        # exception argument would leave the "%d" unformatted.
        raise ValueError("unknown command: byte %d" % (250 + offset))
class DviFont:
    """
    A font referenced from a DVI file.

    Instances carry the font's texname and size, compare equal when both of
    those match, and know the glyph widths in the same units as the AFM file.
    The underscore-prefixed attributes are internal to dviread.py and are
    *not* used for comparison.

    The size is in Adobe points (converted from TeX points).

    Parameters
    ----------
    scale : float
        Factor by which the font is scaled from its natural size.
    tfm : Tfm
        TeX font metrics for this font
    texname : bytes
        Name of the font as used internally by TeX and friends, as an
        ASCII bytestring.  This is usually very different from any external
        font names, and :class:`dviread.PsfontsMap` can be used to find
        the external name of the font.
    vf : Vf
        A TeX "virtual font" file, or None if this font is not virtual.

    Attributes
    ----------
    texname : bytes
    size : float
        Size of the font in Adobe points, converted from the slightly
        smaller TeX points.
    widths : list
        Widths of glyphs in glyph-space units, typically 1/1000ths of
        the point size.

    """
    __slots__ = ('texname', 'size', 'widths', '_scale', '_vf', '_tfm')

    def __init__(self, scale, tfm, texname, vf):
        cbook._check_isinstance(bytes, texname=texname)
        self._scale = scale
        self._tfm = tfm
        self.texname = texname
        self._vf = vf
        self.size = scale * (72.0 / (72.27 * 2**16))
        try:
            # One entry per character code up to the highest one defined.
            count = max(tfm.width) + 1
        except ValueError:
            # max() of an empty mapping: the font defines no characters.
            count = 0
        self.widths = [
            (1000 * tfm.width.get(code, 0)) >> 20 for code in range(count)]

    def __eq__(self, other):
        if not (type(self) == type(other)):
            return False
        return self.texname == other.texname and self.size == other.size

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        cls_name = type(self).__name__
        return "<{}: {}>".format(cls_name, self.texname)

    def _width_of(self, char):
        """Width of char in dvi units."""
        raw = self._tfm.width.get(char, None)
        if raw is None:
            _log.debug('No width for char %d in font %s.', char, self.texname)
            return 0
        return _mul2012(raw, self._scale)

    def _height_depth_of(self, char):
        """Height and depth of char in dvi units."""
        tables = ((self._tfm.height, "height"), (self._tfm.depth, "depth"))
        dims = []
        for table, label in tables:
            raw = table.get(char, None)
            if raw is not None:
                dims.append(_mul2012(raw, self._scale))
                continue
            # Missing metrics are logged and treated as zero.
            _log.debug('No %s for char %d in font %s',
                       label, char, self.texname)
            dims.append(0)
        return dims
class Vf(Dvi):
    r"""
    A virtual font (\*.vf file) containing subroutines for dvi files.

    Usage::

        vf = Vf(filename)
        glyph = vf[code]
        glyph.text, glyph.boxes, glyph.width

    Parameters
    ----------
    filename : str or path-like

    Notes
    -----
    The virtual font format is a derivative of dvi:
    http://mirrors.ctan.org/info/knuth/virtual-fonts
    This class reuses some of the machinery of `Dvi`
    but replaces the `_read` loop and dispatch mechanism.
    """

    def __init__(self, filename):
        Dvi.__init__(self, filename, 0)
        try:
            self._first_font = None
            self._chars = {}
            self._read()
        finally:
            # The whole file is parsed eagerly, so it never stays open.
            self.close()

    def __getitem__(self, code):
        """Return the Page holding the marks of character *code*."""
        return self._chars[code]

    def _read(self):
        """
        Read the whole vf file, filling in ``self._chars`` and
        ``self.fonts``.

        Returns None once the postamble (opcode 248) is reached.  Raises
        ValueError for malformed input (unknown opcodes, opcodes that are
        not allowed inside a packet, or a packet length mismatch).
        """
        packet_char, packet_ends = None, None
        packet_len, packet_width = None, None
        while True:
            byte = self.file.read(1)[0]
            # If we are in a packet, execute the dvi instructions
            if self.state is _dvistate.inpage:
                byte_at = self.file.tell()-1
                if byte_at == packet_ends:
                    self._finalize_packet(packet_char, packet_width)
                    packet_len, packet_char, packet_width = None, None, None
                    # fall through to out-of-packet code
                elif byte_at > packet_ends:
                    raise ValueError("Packet length mismatch in vf file")
                else:
                    if byte in (139, 140) or byte >= 243:
                        raise ValueError(
                            "Inappropriate opcode %d in vf file" % byte)
                    Dvi._dtable[byte](self, byte)
                    continue

            # We are outside a packet
            if byte < 242:          # a short packet (length given by byte)
                packet_len = byte
                packet_char, packet_width = self._arg(1), self._arg(3)
                packet_ends = self._init_packet(byte)
                self.state = _dvistate.inpage
            elif byte == 242:       # a long packet
                packet_len, packet_char, packet_width = \
                    [self._arg(x) for x in (4, 4, 4)]
                # As for short packets, remember where the packet ends and
                # switch to in-packet mode; otherwise the packet's dvi bytes
                # would be misread as packet headers.
                packet_ends = self._init_packet(packet_len)
                self.state = _dvistate.inpage
            elif 243 <= byte <= 246:
                k = self._arg(byte - 242, byte == 246)
                c, s, d, a, l = [self._arg(x) for x in (4, 4, 4, 1, 1)]
                self._fnt_def_real(k, c, s, d, a, l)
                if self._first_font is None:
                    self._first_font = k
            elif byte == 247:       # preamble
                i, k = self._arg(1), self._arg(1)
                x = self.file.read(k)
                cs, ds = self._arg(4), self._arg(4)
                self._pre(i, x, cs, ds)
            elif byte == 248:       # postamble (just some number of 248s)
                break
            else:
                raise ValueError("unknown vf opcode %d" % byte)

    def _init_packet(self, pl):
        """
        Reset the registers and marks for a new packet of length *pl*.

        Returns the file offset at which the packet ends.  Raises
        ValueError if a packet starts while not in the ``outer`` state.
        """
        if self.state != _dvistate.outer:
            raise ValueError("Misplaced packet in vf file")
        self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0
        self.stack, self.text, self.boxes = [], [], []
        self.f = self._first_font
        return self.file.tell() + pl

    def _finalize_packet(self, packet_char, packet_width):
        """Store the marks collected for *packet_char* and leave the packet."""
        self._chars[packet_char] = Page(
            text=self.text, boxes=self.boxes, width=packet_width,
            height=None, descent=None)
        self.state = _dvistate.outer

    def _pre(self, i, x, cs, ds):
        """
        Validate the vf preamble and move to the ``outer`` state.

        *i* is the format identifier (must be 202), *x* the comment bytes,
        *cs* the checksum and *ds* the design size (both unused here).
        """
        if self.state is not _dvistate.pre:
            raise ValueError("pre command in middle of vf file")
        if i != 202:
            raise ValueError("Unknown vf format %d" % i)
        if len(x):
            _log.debug('vf file comment: %s', x)
        self.state = _dvistate.outer
        # cs = checksum, ds = design size
def _fix2comp(num):
    """
    Reinterpret an unsigned 32-bit value as a two's complement signed one.

    *num* must satisfy ``0 <= num < 2**32``; values with the top bit set
    are mapped to the corresponding negative number.
    """
    assert 0 <= num < 2**32
    sign_bit_set = num & 2**31
    return num - 2**32 if sign_bit_set else num
def _mul2012(num1, num2):
    """Multiply two fixed-point numbers, shifting the product right by 20."""
    # Kept as a separate function because >> has surprising precedence.
    product = num1 * num2
    return product >> 20
class Tfm:
    """
    A TeX Font Metric file.

    Only the parts of the format that the Dvi class needs are read.

    Parameters
    ----------
    filename : str or path-like

    Attributes
    ----------
    checksum : int
        Used for verifying against the dvi file.
    design_size : int
        Design size of the font (unknown units)
    width, height, depth : dict
        Dimensions of each character, need to be scaled by the factor
        specified in the dvi file. These are dicts because indexing may
        not start from 0.
    """
    __slots__ = ('checksum', 'design_size', 'width', 'height', 'depth')

    def __init__(self, filename):
        _log.debug('opening tfm file %s', filename)
        with open(filename, 'rb') as file:
            header1 = file.read(24)
            # Six big-endian 16-bit fields starting at byte 2 of the header.
            lh, bc, ec, nw, nh, nd = struct.unpack('!6H', header1[2:14])
            _log.debug('lh=%d, bc=%d, ec=%d, nw=%d, nh=%d, nd=%d',
                       lh, bc, ec, nw, nh, nd)
            header2 = file.read(4 * lh)
            # there is also encoding information etc.
            self.checksum, self.design_size = struct.unpack(
                '!2I', header2[:8])
            n_chars = ec - bc + 1
            char_info = file.read(4 * n_chars)
            raw_tables = [file.read(4 * count) for count in (nw, nh, nd)]

        self.width = {}
        self.height = {}
        self.depth = {}
        # Each table is a sequence of big-endian unsigned 32-bit words.
        widths, heights, depths = (
            struct.unpack('!%dI' % (len(raw) // 4), raw)
            for raw in raw_tables)
        for offset, char in zip(range(0, 4 * n_chars, 4), range(bc, ec + 1)):
            # First byte of the entry indexes the width table; the second
            # packs the height index (high nibble) and depth index (low).
            width_index = char_info[offset]
            packed = char_info[offset + 1]
            self.width[char] = _fix2comp(widths[width_index])
            self.height[char] = _fix2comp(heights[packed >> 4])
            self.depth[char] = _fix2comp(depths[packed & 0xf])
# One parsed entry of a psfonts.map style file.
PsFont = namedtuple(
    'Font', ['texname', 'psname', 'effects', 'encoding', 'filename'])
class PsfontsMap:
    """
    A psfonts.map formatted file, mapping TeX fonts to PS fonts.

    Usage::

     >>> map = PsfontsMap(find_tex_file('pdftex.map'))
     >>> entry = map[b'ptmbo8r']
     >>> entry.texname
     b'ptmbo8r'
     >>> entry.psname
     b'Times-Bold'
     >>> entry.encoding
     '/usr/local/texlive/2008/texmf-dist/fonts/enc/dvips/base/8r.enc'
     >>> entry.effects
     {'slant': 0.16700000000000001}
     >>> entry.filename

    Parameters
    ----------
    filename : str or path-like

    Notes
    -----
    For historical reasons, TeX knows many Type-1 fonts by different
    names than the outside world. (For one thing, the names have to
    fit in eight characters.) Also, TeX's native fonts are not Type-1
    but Metafont, which is nontrivial to convert to PostScript except
    as a bitmap. While high-quality conversions to Type-1 format exist
    and are shipped with modern TeX distributions, we need to know
    which Type-1 fonts are the counterparts of which native fonts. For
    these reasons a mapping is needed from internal font names to font
    file names.

    A texmf tree typically includes mapping files called e.g.
    :file:`psfonts.map`, :file:`pdftex.map`, or :file:`dvipdfm.map`.
    The file :file:`psfonts.map` is used by :program:`dvips`,
    :file:`pdftex.map` by :program:`pdfTeX`, and :file:`dvipdfm.map`
    by :program:`dvipdfm`. :file:`psfonts.map` might avoid embedding
    the 35 PostScript fonts (i.e., have no filename for them, as in
    the Times-Bold example above), while the pdf-related files perhaps
    only avoid the "Base 14" pdf fonts. But the user may have
    configured these files differently.
    """
    __slots__ = ('_font', '_filename')

    # The lru_cache turns this into a filename -> PsfontsMap cache: asking
    # for `PsfontsMap(filename)` again with the same filename hands back the
    # object built the first time.
    @lru_cache()
    def __new__(cls, filename):
        self = object.__new__(cls)
        self._font = {}
        self._filename = os.fsdecode(filename)
        with open(filename, 'rb') as file:
            self._parse(file)
        return self

    def __getitem__(self, texname):
        assert isinstance(texname, bytes)
        try:
            entry = self._font[texname]
        except KeyError:
            fmt = ('A PostScript file for the font whose TeX name is "{0}" '
                   'could not be found in the file "{1}". The dviread module '
                   'can only handle fonts that have an associated PostScript '
                   'font file. '
                   'This problem can often be solved by installing '
                   'a suitable PostScript font package in your (TeX) '
                   'package manager.')
            msg = textwrap.fill(
                fmt.format(texname.decode('ascii'), self._filename),
                break_on_hyphens=False, break_long_words=False)
            _log.info(msg)
            raise

        def resolve(name):
            # Names that are missing or already absolute are kept as they
            # are; anything else is looked up in the texmf tree.
            if name is None or name.startswith(b'/'):
                return name
            return find_tex_file(name)

        font_path = resolve(entry.filename)
        enc_path = resolve(entry.encoding)
        return entry._replace(filename=font_path, encoding=enc_path)

    def _parse(self, file):
        """
        Parse the font mapping file.

        The format is, AFAIK: texname fontname [effects and filenames]
        Effects are PostScript snippets like ".177 SlantFont",
        filenames begin with one or two less-than signs. A filename
        ending in enc is an encoding file, other filenames are font
        files. This can be overridden with a left bracket: <[foobar
        indicates an encoding file named foobar.

        There is some difference between <foo.pfb and <<bar.pfb in
        subsetting, but I have no example of << in my TeX installation.
        """
        # If the map file specifies multiple encodings for a font, we
        # follow pdfTeX in choosing the last one specified. Such
        # entries are probably mistakes but they have occurred.
        # http://tex.stackexchange.com/questions/10826/
        # http://article.gmane.org/gmane.comp.tex.pdftex/4914

        empty_re = re.compile(br'%|\s*$')
        word_re = re.compile(
            br'''(?x) (?:
                 "<\[ (?P<enc1>  [^"]+    )" | # quoted encoding marked by [
                 "<   (?P<enc2>  [^"]+.enc)" | # quoted encoding, ends in .enc
                 "<<? (?P<file1> [^"]+    )" | # quoted font file name
                 "    (?P<eff1>  [^"]+    )" | # quoted effects or font name
                 <\[  (?P<enc3>  \S+      )  | # encoding marked by [
                 <    (?P<enc4>  \S+  .enc)  | # encoding, ends in .enc
                 <<?  (?P<file2> \S+      )  | # font file name
                      (?P<eff2>  \S+      )    # effects or font name
            )''')
        effects_re = re.compile(
            br'''(?x) (?P<slant> -?[0-9]*(?:\.[0-9]+)) \s* SlantFont
                    | (?P<extend>-?[0-9]*(?:\.[0-9]+)) \s* ExtendFont''')

        for raw_line in file:
            # Skip comment lines and blank lines.
            if empty_re.match(raw_line):
                continue
            line = raw_line.strip()
            effects, encoding, filename = b'', None, None
            tokens = word_re.finditer(line)

            # The named groups are mutually exclusive and are
            # referenced below at an estimated order of probability of
            # occurrence based on looking at my copy of pdftex.map.
            # The font names are probably unquoted:
            first = next(tokens)
            texname = first.group('eff2') or first.group('eff1')
            second = next(tokens)
            psname = second.group('eff2') or second.group('eff1')

            for token in tokens:
                # Any effects are almost always quoted:
                eff = token.group('eff1') or token.group('eff2')
                if eff:
                    effects = eff
                    continue
                # Encoding files usually have the .enc suffix
                # and almost never need quoting:
                enc = (token.group('enc4') or token.group('enc3') or
                       token.group('enc2') or token.group('enc1'))
                if enc:
                    if encoding is not None:
                        _log.debug('Multiple encodings for %s = %s',
                                   texname, psname)
                    encoding = enc
                    continue
                # File names are probably unquoted:
                filename = token.group('file2') or token.group('file1')

            effects_dict = {}
            for found in effects_re.finditer(effects):
                # Exactly one of the two alternatives matched; lastgroup
                # names it ('slant' or 'extend').
                kind = found.lastgroup
                effects_dict[kind] = float(found.group(kind))

            self._font[texname] = PsFont(
                texname=texname, psname=psname, effects=effects_dict,
                encoding=encoding, filename=filename)
class Encoding:
    r"""
    Parses a \*.enc file referenced from a psfonts.map style file.
    The format this class understands is a very limited subset of
    PostScript.

    Usage (subject to change)::

      for name in Encoding(filename):
          whatever(name)

    Parameters
    ----------
    filename : str or path-like

    Attributes
    ----------
    encoding : list
        List of character names
    """
    __slots__ = ('encoding',)

    def __init__(self, filename):
        with open(filename, 'rb') as file:
            _log.debug('Parsing TeX encoding %s', filename)
            self.encoding = self._parse(file)
            _log.debug('Result: %s', self.encoding)

    def __iter__(self):
        yield from self.encoding

    @staticmethod
    def _parse(file):
        """
        Return the glyph names of the first ``[...]`` array in *file*.

        *file* is an iterable of bytes lines.  ``%`` comments are removed,
        and every ``/name`` between the first ``[`` and the following ``]``
        is returned (as bytes, without the slash).

        Raises ValueError if the opening or closing bracket is missing.
        """
        lines = (line.split(b'%', 1)[0].strip() for line in file)
        # Join with a space: joining with b'' would glue names that sit on
        # separate lines into one token ("/a/b/c").
        data = b' '.join(lines)
        beginning = data.find(b'[')
        if beginning < 0:
            raise ValueError("Cannot locate beginning of encoding in {}"
                             .format(file))
        data = data[beginning:]
        end = data.find(b']')
        if end < 0:
            raise ValueError("Cannot locate end of encoding in {}"
                             .format(file))
        data = data[:end]
        # "/" starts a new name, so it must also terminate the previous
        # one; this keeps adjacent names such as "/a/b" separate.
        return re.findall(br'/([^][{}<>\s/]+)', data)
989# Note: this function should ultimately replace the Encoding class, which
990# appears to be mostly broken: because it uses b''.join(), there is no
991# whitespace left between glyph names (only slashes) so the final re.findall
992# returns a single string with all glyph names. However this does not appear
993# to bother backend_pdf, so that needs to be investigated more. (The fixed
994# version below is necessary for textpath/backend_svg, though.)
def _parse_enc(path):
    r"""
    Parses a \*.enc file referenced from a psfonts.map style file.
    The format this function understands is a very limited subset of
    PostScript.

    Parameters
    ----------
    path : os.PathLike

    Returns
    -------
    encoding : list
        The nth entry of the list is the PostScript glyph name of the nth
        glyph.

    Raises
    ------
    ValueError
        If the file contains no ``[...]`` array, or if an entry of the
        array does not start with ``/``.
    """
    with open(path, encoding="ascii") as file:
        no_comments = "\n".join(line.split("%")[0].rstrip() for line in file)
    match = re.search(r"(?s)\[(.*)\]", no_comments)
    if match is None:
        # Report a missing array the same way as a malformed one, instead
        # of failing with AttributeError on ``None.group``.
        raise ValueError(
            "Failed to parse {} as Postscript encoding".format(path))
    names = match.group(1).split()
    if all(name.startswith("/") for name in names):
        return [name[1:] for name in names]
    raise ValueError(
        "Failed to parse {} as Postscript encoding".format(path))
@lru_cache()
def find_tex_file(filename, format=None):
    """
    Find a file in the texmf tree.

    Calls :program:`kpsewhich` which is an interface to the kpathsea
    library [1]_. Most existing TeX distributions on Unix-like systems use
    kpathsea. It is also available as part of MikTeX, a popular
    distribution on Windows.

    *If the file is not found, an empty string is returned*.

    Parameters
    ----------
    filename : str or path-like
    format : str or bytes
        Used as the value of the `--format` option to :program:`kpsewhich`.
        Could be e.g. 'tfm' or 'vf' to limit the search to that type of files.

    References
    ----------
    .. [1] `Kpathsea documentation <http://www.tug.org/kpathsea/>`_
        The library that :program:`kpsewhich` is part of.
    """

    # These are expected to be ascii, but utf-8 (with replacement of
    # undecodable bytes) is used out of caution.
    if isinstance(filename, bytes):
        filename = filename.decode('utf-8', errors='replace')
    if isinstance(format, bytes):
        format = format.decode('utf-8', errors='replace')

    # On Windows only, kpathsea can use utf-8 for cmd args and output.
    # The `command_line_encoding` environment variable is set to force it
    # to always use utf-8 encoding. See Matplotlib issue #11848.
    kwargs = (
        {'env': dict(os.environ, command_line_encoding='utf-8')}
        if os.name == 'nt' else {})

    cmd = ['kpsewhich']
    if format is not None:
        cmd.append('--format=' + format)
    cmd.append(filename)

    try:
        output = cbook._check_and_log_subprocess(cmd, _log, **kwargs)
    except RuntimeError:
        return ''
    if os.name == 'nt':
        return output.decode('utf-8').rstrip('\r\n')
    return os.fsdecode(output).rstrip('\n')
@lru_cache()
def _fontfile(cls, suffix, texname):
    """
    Locate ``texname + suffix`` with `find_tex_file` and load it with *cls*.

    Returns None when the file is not found.  Results are cached.
    """
    path = find_tex_file(texname + suffix)
    if not path:
        return None
    return cls(path)


# Cached loaders for font metrics (.tfm) and virtual fonts (.vf).
_tfmfile = partial(_fontfile, Tfm, ".tfm")
_vffile = partial(_fontfile, Vf, ".vf")
if __name__ == '__main__':
    # Debugging aid: dump the text and boxes of every page of a dvi file.
    from argparse import ArgumentParser
    import itertools

    parser = ArgumentParser()
    parser.add_argument("filename")
    parser.add_argument("dpi", nargs="?", type=float, default=None)
    args = parser.parse_args()
    with Dvi(args.filename, args.dpi) as dvi:
        fontmap = PsfontsMap(find_tex_file('pdftex.map'))
        for page in dvi:
            print('=== new page ===')
            # Consecutive glyphs set in the same font are listed together.
            runs = itertools.groupby(page.text, lambda text: text.font)
            for font, run in runs:
                print('font', font.texname, 'scaled', font._scale / 2 ** 20)
                for item in run:
                    as_char = chr(item.glyph)
                    shown = as_char if as_char.isprintable() else "."
                    print(item.x, item.y, item.glyph, shown, item.width)
            for box in page.boxes:
                print(box.x, box.y, 'BOX', box.height, box.width)