1108 lines
40 KiB
Python
1108 lines
40 KiB
Python
"""
|
|
A module for reading dvi files output by TeX. Several limitations make
|
|
this not (currently) useful as a general-purpose dvi preprocessor, but
|
|
it is currently used by the pdf backend for processing usetex text.
|
|
|
|
Interface::
|
|
|
|
with Dvi(filename, 72) as dvi:
|
|
# iterate over pages:
|
|
for page in dvi:
|
|
w, h, d = page.width, page.height, page.descent
|
|
for x, y, font, glyph, width in page.text:
|
|
fontname = font.texname
|
|
pointsize = font.size
|
|
...
|
|
for x, y, height, width in page.boxes:
|
|
...
|
|
"""
|
|
|
|
from collections import namedtuple
|
|
import enum
|
|
from functools import lru_cache, partial, wraps
|
|
import logging
|
|
import os
|
|
from pathlib import Path
|
|
import re
|
|
import struct
|
|
import subprocess
|
|
import sys
|
|
|
|
import numpy as np
|
|
|
|
from matplotlib import _api, cbook
|
|
|
|
_log = logging.getLogger(__name__)
|
|
|
|
# Many dvi related files are looked for by external processes, require
|
|
# additional parsing, and are used many times per rendering, which is why they
|
|
# are cached using lru_cache().
|
|
|
|
# Dvi is a bytecode format documented in
|
|
# https://ctan.org/pkg/dvitype
|
|
# https://texdoc.org/serve/dvitype.pdf/0
|
|
#
|
|
# The file consists of a preamble, some number of pages, a postamble,
|
|
# and a finale. Different opcodes are allowed in different contexts,
|
|
# so the Dvi object has a parser state:
|
|
#
|
|
# pre: expecting the preamble
|
|
# outer: between pages (followed by a page or the postamble,
|
|
# also e.g. font definitions are allowed)
|
|
# page: processing a page
|
|
# post_post: state after the postamble (our current implementation
|
|
# just stops reading)
|
|
# finale: the finale (unimplemented in our current implementation)
|
|
|
|
_dvistate = enum.Enum('DviState', 'pre outer inpage post_post finale')
|
|
|
|
# The marks on a page consist of text and boxes. A page also has dimensions.
|
|
Page = namedtuple('Page', 'text boxes height width descent')
|
|
Box = namedtuple('Box', 'x y height width')
|
|
|
|
|
|
# Also a namedtuple, for backcompat.
|
|
class Text(namedtuple('Text', 'x y font glyph width')):
|
|
"""
|
|
A glyph in the dvi file.
|
|
|
|
The *x* and *y* attributes directly position the glyph. The *font*,
|
|
*glyph*, and *width* attributes are kept public for back-compatibility,
|
|
but users wanting to draw the glyph themselves are encouraged to instead
|
|
load the font specified by `font_path` at `font_size`, warp it with the
|
|
effects specified by `font_effects`, and load the glyph specified by
|
|
`glyph_name_or_index`.
|
|
"""
|
|
|
|
def _get_pdftexmap_entry(self):
|
|
return PsfontsMap(find_tex_file("pdftex.map"))[self.font.texname]
|
|
|
|
@property
|
|
def font_path(self):
|
|
"""The `~pathlib.Path` to the font for this glyph."""
|
|
psfont = self._get_pdftexmap_entry()
|
|
if psfont.filename is None:
|
|
raise ValueError("No usable font file found for {} ({}); "
|
|
"the font may lack a Type-1 version"
|
|
.format(psfont.psname.decode("ascii"),
|
|
psfont.texname.decode("ascii")))
|
|
return Path(psfont.filename)
|
|
|
|
@property
|
|
def font_size(self):
|
|
"""The font size."""
|
|
return self.font.size
|
|
|
|
@property
|
|
def font_effects(self):
|
|
"""
|
|
The "font effects" dict for this glyph.
|
|
|
|
This dict contains the values for this glyph of SlantFont and
|
|
ExtendFont (if any), read off :file:`pdftex.map`.
|
|
"""
|
|
return self._get_pdftexmap_entry().effects
|
|
|
|
@property
|
|
def glyph_name_or_index(self):
|
|
"""
|
|
Either the glyph name or the native charmap glyph index.
|
|
|
|
If :file:`pdftex.map` specifies an encoding for this glyph's font, that
|
|
is a mapping of glyph indices to Adobe glyph names; use it to convert
|
|
dvi indices to glyph names. Callers can then convert glyph names to
|
|
glyph indices (with FT_Get_Name_Index/get_name_index), and load the
|
|
glyph using FT_Load_Glyph/load_glyph.
|
|
|
|
If :file:`pdftex.map` specifies no encoding, the indices directly map
|
|
to the font's "native" charmap; glyphs should directly load using
|
|
FT_Load_Char/load_char after selecting the native charmap.
|
|
"""
|
|
entry = self._get_pdftexmap_entry()
|
|
return (_parse_enc(entry.encoding)[self.glyph]
|
|
if entry.encoding is not None else self.glyph)
|
|
|
|
|
|
# Opcode argument parsing
|
|
#
|
|
# Each of the following functions takes a Dvi object and delta, which is the
|
|
# difference between the opcode and the minimum opcode with the same meaning.
|
|
# Dvi opcodes often encode the number of argument bytes in this delta.
|
|
_arg_mapping = dict(
|
|
# raw: Return delta as is.
|
|
raw=lambda dvi, delta: delta,
|
|
# u1: Read 1 byte as an unsigned number.
|
|
u1=lambda dvi, delta: dvi._arg(1, signed=False),
|
|
# u4: Read 4 bytes as an unsigned number.
|
|
u4=lambda dvi, delta: dvi._arg(4, signed=False),
|
|
# s4: Read 4 bytes as a signed number.
|
|
s4=lambda dvi, delta: dvi._arg(4, signed=True),
|
|
# slen: Read delta bytes as a signed number, or None if delta is None.
|
|
slen=lambda dvi, delta: dvi._arg(delta, signed=True) if delta else None,
|
|
# slen1: Read (delta + 1) bytes as a signed number.
|
|
slen1=lambda dvi, delta: dvi._arg(delta + 1, signed=True),
|
|
# ulen1: Read (delta + 1) bytes as an unsigned number.
|
|
ulen1=lambda dvi, delta: dvi._arg(delta + 1, signed=False),
|
|
# olen1: Read (delta + 1) bytes as an unsigned number if less than 4 bytes,
|
|
# as a signed number if 4 bytes.
|
|
olen1=lambda dvi, delta: dvi._arg(delta + 1, signed=(delta == 3)),
|
|
)
|
|
|
|
|
|
def _dispatch(table, min, max=None, state=None, args=('raw',)):
|
|
"""
|
|
Decorator for dispatch by opcode. Sets the values in *table*
|
|
from *min* to *max* to this method, adds a check that the Dvi state
|
|
matches *state* if not None, reads arguments from the file according
|
|
to *args*.
|
|
|
|
Parameters
|
|
----------
|
|
table : dict[int, callable]
|
|
The dispatch table to be filled in.
|
|
|
|
min, max : int
|
|
Range of opcodes that calls the registered function; *max* defaults to
|
|
*min*.
|
|
|
|
state : _dvistate, optional
|
|
State of the Dvi object in which these opcodes are allowed.
|
|
|
|
args : list[str], default: ['raw']
|
|
Sequence of argument specifications:
|
|
|
|
- 'raw': opcode minus minimum
|
|
- 'u1': read one unsigned byte
|
|
- 'u4': read four bytes, treat as an unsigned number
|
|
- 's4': read four bytes, treat as a signed number
|
|
- 'slen': read (opcode - minimum) bytes, treat as signed
|
|
- 'slen1': read (opcode - minimum + 1) bytes, treat as signed
|
|
- 'ulen1': read (opcode - minimum + 1) bytes, treat as unsigned
|
|
- 'olen1': read (opcode - minimum + 1) bytes, treat as unsigned
|
|
if under four bytes, signed if four bytes
|
|
"""
|
|
def decorate(method):
|
|
get_args = [_arg_mapping[x] for x in args]
|
|
|
|
@wraps(method)
|
|
def wrapper(self, byte):
|
|
if state is not None and self.state != state:
|
|
raise ValueError("state precondition failed")
|
|
return method(self, *[f(self, byte-min) for f in get_args])
|
|
if max is None:
|
|
table[min] = wrapper
|
|
else:
|
|
for i in range(min, max+1):
|
|
assert table[i] is None
|
|
table[i] = wrapper
|
|
return wrapper
|
|
return decorate
|
|
|
|
|
|
class Dvi:
|
|
"""
|
|
A reader for a dvi ("device-independent") file, as produced by TeX.
|
|
|
|
The current implementation can only iterate through pages in order,
|
|
and does not even attempt to verify the postamble.
|
|
|
|
This class can be used as a context manager to close the underlying
|
|
file upon exit. Pages can be read via iteration. Here is an overly
|
|
simple way to extract text without trying to detect whitespace::
|
|
|
|
>>> with matplotlib.dviread.Dvi('input.dvi', 72) as dvi:
|
|
... for page in dvi:
|
|
... print(''.join(chr(t.glyph) for t in page.text))
|
|
"""
|
|
# dispatch table
|
|
_dtable = [None] * 256
|
|
_dispatch = partial(_dispatch, _dtable)
|
|
|
|
def __init__(self, filename, dpi):
|
|
"""
|
|
Read the data from the file named *filename* and convert
|
|
TeX's internal units to units of *dpi* per inch.
|
|
*dpi* only sets the units and does not limit the resolution.
|
|
Use None to return TeX's internal units.
|
|
"""
|
|
_log.debug('Dvi: %s', filename)
|
|
self.file = open(filename, 'rb')
|
|
self.dpi = dpi
|
|
self.fonts = {}
|
|
self.state = _dvistate.pre
|
|
|
|
def __enter__(self):
|
|
"""Context manager enter method, does nothing."""
|
|
return self
|
|
|
|
def __exit__(self, etype, evalue, etrace):
|
|
"""
|
|
Context manager exit method, closes the underlying file if it is open.
|
|
"""
|
|
self.close()
|
|
|
|
def __iter__(self):
|
|
"""
|
|
Iterate through the pages of the file.
|
|
|
|
Yields
|
|
------
|
|
Page
|
|
Details of all the text and box objects on the page.
|
|
The Page tuple contains lists of Text and Box tuples and
|
|
the page dimensions, and the Text and Box tuples contain
|
|
coordinates transformed into a standard Cartesian
|
|
coordinate system at the dpi value given when initializing.
|
|
The coordinates are floating point numbers, but otherwise
|
|
precision is not lost and coordinate values are not clipped to
|
|
integers.
|
|
"""
|
|
while self._read():
|
|
yield self._output()
|
|
|
|
def close(self):
|
|
"""Close the underlying file if it is open."""
|
|
if not self.file.closed:
|
|
self.file.close()
|
|
|
|
def _output(self):
|
|
"""
|
|
Output the text and boxes belonging to the most recent page.
|
|
page = dvi._output()
|
|
"""
|
|
minx, miny, maxx, maxy = np.inf, np.inf, -np.inf, -np.inf
|
|
maxy_pure = -np.inf
|
|
for elt in self.text + self.boxes:
|
|
if isinstance(elt, Box):
|
|
x, y, h, w = elt
|
|
e = 0 # zero depth
|
|
else: # glyph
|
|
x, y, font, g, w = elt
|
|
h, e = font._height_depth_of(g)
|
|
minx = min(minx, x)
|
|
miny = min(miny, y - h)
|
|
maxx = max(maxx, x + w)
|
|
maxy = max(maxy, y + e)
|
|
maxy_pure = max(maxy_pure, y)
|
|
if self._baseline_v is not None:
|
|
maxy_pure = self._baseline_v # This should normally be the case.
|
|
self._baseline_v = None
|
|
|
|
if not self.text and not self.boxes: # Avoid infs/nans from inf+/-inf.
|
|
return Page(text=[], boxes=[], width=0, height=0, descent=0)
|
|
|
|
if self.dpi is None:
|
|
# special case for ease of debugging: output raw dvi coordinates
|
|
return Page(text=self.text, boxes=self.boxes,
|
|
width=maxx-minx, height=maxy_pure-miny,
|
|
descent=maxy-maxy_pure)
|
|
|
|
# convert from TeX's "scaled points" to dpi units
|
|
d = self.dpi / (72.27 * 2**16)
|
|
descent = (maxy - maxy_pure) * d
|
|
|
|
text = [Text((x-minx)*d, (maxy-y)*d - descent, f, g, w*d)
|
|
for (x, y, f, g, w) in self.text]
|
|
boxes = [Box((x-minx)*d, (maxy-y)*d - descent, h*d, w*d)
|
|
for (x, y, h, w) in self.boxes]
|
|
|
|
return Page(text=text, boxes=boxes, width=(maxx-minx)*d,
|
|
height=(maxy_pure-miny)*d, descent=descent)
|
|
|
|
def _read(self):
|
|
"""
|
|
Read one page from the file. Return True if successful,
|
|
False if there were no more pages.
|
|
"""
|
|
# Pages appear to start with the sequence
|
|
# bop (begin of page)
|
|
# xxx comment
|
|
# <push, ..., pop> # if using chemformula
|
|
# down
|
|
# push
|
|
# down
|
|
# <push, push, xxx, right, xxx, pop, pop> # if using xcolor
|
|
# down
|
|
# push
|
|
# down (possibly multiple)
|
|
# push <= here, v is the baseline position.
|
|
# etc.
|
|
# (dviasm is useful to explore this structure.)
|
|
# Thus, we use the vertical position at the first time the stack depth
|
|
# reaches 3, while at least three "downs" have been executed (excluding
|
|
# those popped out (corresponding to the chemformula preamble)), as the
|
|
# baseline (the "down" count is necessary to handle xcolor).
|
|
down_stack = [0]
|
|
self._baseline_v = None
|
|
while True:
|
|
byte = self.file.read(1)[0]
|
|
self._dtable[byte](self, byte)
|
|
name = self._dtable[byte].__name__
|
|
if name == "_push":
|
|
down_stack.append(down_stack[-1])
|
|
elif name == "_pop":
|
|
down_stack.pop()
|
|
elif name == "_down":
|
|
down_stack[-1] += 1
|
|
if (self._baseline_v is None
|
|
and len(getattr(self, "stack", [])) == 3
|
|
and down_stack[-1] >= 4):
|
|
self._baseline_v = self.v
|
|
if byte == 140: # end of page
|
|
return True
|
|
if self.state is _dvistate.post_post: # end of file
|
|
self.close()
|
|
return False
|
|
|
|
def _arg(self, nbytes, signed=False):
|
|
"""
|
|
Read and return a big-endian integer *nbytes* long.
|
|
Signedness is determined by the *signed* keyword.
|
|
"""
|
|
return int.from_bytes(self.file.read(nbytes), "big", signed=signed)
|
|
|
|
@_dispatch(min=0, max=127, state=_dvistate.inpage)
|
|
def _set_char_immediate(self, char):
|
|
self._put_char_real(char)
|
|
self.h += self.fonts[self.f]._width_of(char)
|
|
|
|
@_dispatch(min=128, max=131, state=_dvistate.inpage, args=('olen1',))
|
|
def _set_char(self, char):
|
|
self._put_char_real(char)
|
|
self.h += self.fonts[self.f]._width_of(char)
|
|
|
|
@_dispatch(132, state=_dvistate.inpage, args=('s4', 's4'))
|
|
def _set_rule(self, a, b):
|
|
self._put_rule_real(a, b)
|
|
self.h += b
|
|
|
|
@_dispatch(min=133, max=136, state=_dvistate.inpage, args=('olen1',))
|
|
def _put_char(self, char):
|
|
self._put_char_real(char)
|
|
|
|
def _put_char_real(self, char):
|
|
font = self.fonts[self.f]
|
|
if font._vf is None:
|
|
self.text.append(Text(self.h, self.v, font, char,
|
|
font._width_of(char)))
|
|
else:
|
|
scale = font._scale
|
|
for x, y, f, g, w in font._vf[char].text:
|
|
newf = DviFont(scale=_mul2012(scale, f._scale),
|
|
tfm=f._tfm, texname=f.texname, vf=f._vf)
|
|
self.text.append(Text(self.h + _mul2012(x, scale),
|
|
self.v + _mul2012(y, scale),
|
|
newf, g, newf._width_of(g)))
|
|
self.boxes.extend([Box(self.h + _mul2012(x, scale),
|
|
self.v + _mul2012(y, scale),
|
|
_mul2012(a, scale), _mul2012(b, scale))
|
|
for x, y, a, b in font._vf[char].boxes])
|
|
|
|
@_dispatch(137, state=_dvistate.inpage, args=('s4', 's4'))
|
|
def _put_rule(self, a, b):
|
|
self._put_rule_real(a, b)
|
|
|
|
def _put_rule_real(self, a, b):
|
|
if a > 0 and b > 0:
|
|
self.boxes.append(Box(self.h, self.v, a, b))
|
|
|
|
@_dispatch(138)
|
|
def _nop(self, _):
|
|
pass
|
|
|
|
@_dispatch(139, state=_dvistate.outer, args=('s4',)*11)
|
|
def _bop(self, c0, c1, c2, c3, c4, c5, c6, c7, c8, c9, p):
|
|
self.state = _dvistate.inpage
|
|
self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0
|
|
self.stack = []
|
|
self.text = [] # list of Text objects
|
|
self.boxes = [] # list of Box objects
|
|
|
|
@_dispatch(140, state=_dvistate.inpage)
|
|
def _eop(self, _):
|
|
self.state = _dvistate.outer
|
|
del self.h, self.v, self.w, self.x, self.y, self.z, self.stack
|
|
|
|
@_dispatch(141, state=_dvistate.inpage)
|
|
def _push(self, _):
|
|
self.stack.append((self.h, self.v, self.w, self.x, self.y, self.z))
|
|
|
|
@_dispatch(142, state=_dvistate.inpage)
|
|
def _pop(self, _):
|
|
self.h, self.v, self.w, self.x, self.y, self.z = self.stack.pop()
|
|
|
|
@_dispatch(min=143, max=146, state=_dvistate.inpage, args=('slen1',))
|
|
def _right(self, b):
|
|
self.h += b
|
|
|
|
@_dispatch(min=147, max=151, state=_dvistate.inpage, args=('slen',))
|
|
def _right_w(self, new_w):
|
|
if new_w is not None:
|
|
self.w = new_w
|
|
self.h += self.w
|
|
|
|
@_dispatch(min=152, max=156, state=_dvistate.inpage, args=('slen',))
|
|
def _right_x(self, new_x):
|
|
if new_x is not None:
|
|
self.x = new_x
|
|
self.h += self.x
|
|
|
|
@_dispatch(min=157, max=160, state=_dvistate.inpage, args=('slen1',))
|
|
def _down(self, a):
|
|
self.v += a
|
|
|
|
@_dispatch(min=161, max=165, state=_dvistate.inpage, args=('slen',))
|
|
def _down_y(self, new_y):
|
|
if new_y is not None:
|
|
self.y = new_y
|
|
self.v += self.y
|
|
|
|
@_dispatch(min=166, max=170, state=_dvistate.inpage, args=('slen',))
|
|
def _down_z(self, new_z):
|
|
if new_z is not None:
|
|
self.z = new_z
|
|
self.v += self.z
|
|
|
|
@_dispatch(min=171, max=234, state=_dvistate.inpage)
|
|
def _fnt_num_immediate(self, k):
|
|
self.f = k
|
|
|
|
@_dispatch(min=235, max=238, state=_dvistate.inpage, args=('olen1',))
|
|
def _fnt_num(self, new_f):
|
|
self.f = new_f
|
|
|
|
@_dispatch(min=239, max=242, args=('ulen1',))
|
|
def _xxx(self, datalen):
|
|
special = self.file.read(datalen)
|
|
_log.debug(
|
|
'Dvi._xxx: encountered special: %s',
|
|
''.join([chr(ch) if 32 <= ch < 127 else '<%02x>' % ch
|
|
for ch in special]))
|
|
|
|
@_dispatch(min=243, max=246, args=('olen1', 'u4', 'u4', 'u4', 'u1', 'u1'))
|
|
def _fnt_def(self, k, c, s, d, a, l):
|
|
self._fnt_def_real(k, c, s, d, a, l)
|
|
|
|
def _fnt_def_real(self, k, c, s, d, a, l):
|
|
n = self.file.read(a + l)
|
|
fontname = n[-l:].decode('ascii')
|
|
tfm = _tfmfile(fontname)
|
|
if c != 0 and tfm.checksum != 0 and c != tfm.checksum:
|
|
raise ValueError('tfm checksum mismatch: %s' % n)
|
|
try:
|
|
vf = _vffile(fontname)
|
|
except FileNotFoundError:
|
|
vf = None
|
|
self.fonts[k] = DviFont(scale=s, tfm=tfm, texname=n, vf=vf)
|
|
|
|
@_dispatch(247, state=_dvistate.pre, args=('u1', 'u4', 'u4', 'u4', 'u1'))
|
|
def _pre(self, i, num, den, mag, k):
|
|
self.file.read(k) # comment in the dvi file
|
|
if i != 2:
|
|
raise ValueError("Unknown dvi format %d" % i)
|
|
if num != 25400000 or den != 7227 * 2**16:
|
|
raise ValueError("Nonstandard units in dvi file")
|
|
# meaning: TeX always uses those exact values, so it
|
|
# should be enough for us to support those
|
|
# (There are 72.27 pt to an inch so 7227 pt =
|
|
# 7227 * 2**16 sp to 100 in. The numerator is multiplied
|
|
# by 10^5 to get units of 10**-7 meters.)
|
|
if mag != 1000:
|
|
raise ValueError("Nonstandard magnification in dvi file")
|
|
# meaning: LaTeX seems to frown on setting \mag, so
|
|
# I think we can assume this is constant
|
|
self.state = _dvistate.outer
|
|
|
|
@_dispatch(248, state=_dvistate.outer)
|
|
def _post(self, _):
|
|
self.state = _dvistate.post_post
|
|
# TODO: actually read the postamble and finale?
|
|
# currently post_post just triggers closing the file
|
|
|
|
@_dispatch(249)
|
|
def _post_post(self, _):
|
|
raise NotImplementedError
|
|
|
|
@_dispatch(min=250, max=255)
|
|
def _malformed(self, offset):
|
|
raise ValueError(f"unknown command: byte {250 + offset}")
|
|
|
|
|
|
class DviFont:
|
|
"""
|
|
Encapsulation of a font that a DVI file can refer to.
|
|
|
|
This class holds a font's texname and size, supports comparison,
|
|
and knows the widths of glyphs in the same units as the AFM file.
|
|
There are also internal attributes (for use by dviread.py) that
|
|
are *not* used for comparison.
|
|
|
|
The size is in Adobe points (converted from TeX points).
|
|
|
|
Parameters
|
|
----------
|
|
scale : float
|
|
Factor by which the font is scaled from its natural size.
|
|
tfm : Tfm
|
|
TeX font metrics for this font
|
|
texname : bytes
|
|
Name of the font as used internally by TeX and friends, as an ASCII
|
|
bytestring. This is usually very different from any external font
|
|
names; `PsfontsMap` can be used to find the external name of the font.
|
|
vf : Vf
|
|
A TeX "virtual font" file, or None if this font is not virtual.
|
|
|
|
Attributes
|
|
----------
|
|
texname : bytes
|
|
size : float
|
|
Size of the font in Adobe points, converted from the slightly
|
|
smaller TeX points.
|
|
widths : list
|
|
Widths of glyphs in glyph-space units, typically 1/1000ths of
|
|
the point size.
|
|
|
|
"""
|
|
__slots__ = ('texname', 'size', 'widths', '_scale', '_vf', '_tfm')
|
|
|
|
def __init__(self, scale, tfm, texname, vf):
|
|
_api.check_isinstance(bytes, texname=texname)
|
|
self._scale = scale
|
|
self._tfm = tfm
|
|
self.texname = texname
|
|
self._vf = vf
|
|
self.size = scale * (72.0 / (72.27 * 2**16))
|
|
try:
|
|
nchars = max(tfm.width) + 1
|
|
except ValueError:
|
|
nchars = 0
|
|
self.widths = [(1000*tfm.width.get(char, 0)) >> 20
|
|
for char in range(nchars)]
|
|
|
|
def __eq__(self, other):
|
|
return (type(self) is type(other)
|
|
and self.texname == other.texname and self.size == other.size)
|
|
|
|
def __ne__(self, other):
|
|
return not self.__eq__(other)
|
|
|
|
def __repr__(self):
|
|
return f"<{type(self).__name__}: {self.texname}>"
|
|
|
|
def _width_of(self, char):
|
|
"""Width of char in dvi units."""
|
|
width = self._tfm.width.get(char, None)
|
|
if width is not None:
|
|
return _mul2012(width, self._scale)
|
|
_log.debug('No width for char %d in font %s.', char, self.texname)
|
|
return 0
|
|
|
|
def _height_depth_of(self, char):
|
|
"""Height and depth of char in dvi units."""
|
|
result = []
|
|
for metric, name in ((self._tfm.height, "height"),
|
|
(self._tfm.depth, "depth")):
|
|
value = metric.get(char, None)
|
|
if value is None:
|
|
_log.debug('No %s for char %d in font %s',
|
|
name, char, self.texname)
|
|
result.append(0)
|
|
else:
|
|
result.append(_mul2012(value, self._scale))
|
|
# cmsyXX (symbols font) glyph 0 ("minus") has a nonzero descent
|
|
# so that TeX aligns equations properly
|
|
# (https://tex.stackexchange.com/q/526103/)
|
|
# but we actually care about the rasterization depth to align
|
|
# the dvipng-generated images.
|
|
if re.match(br'^cmsy\d+$', self.texname) and char == 0:
|
|
result[-1] = 0
|
|
return result
|
|
|
|
|
|
class Vf(Dvi):
|
|
r"""
|
|
A virtual font (\*.vf file) containing subroutines for dvi files.
|
|
|
|
Parameters
|
|
----------
|
|
filename : str or path-like
|
|
|
|
Notes
|
|
-----
|
|
The virtual font format is a derivative of dvi:
|
|
http://mirrors.ctan.org/info/knuth/virtual-fonts
|
|
This class reuses some of the machinery of `Dvi`
|
|
but replaces the `_read` loop and dispatch mechanism.
|
|
|
|
Examples
|
|
--------
|
|
::
|
|
|
|
vf = Vf(filename)
|
|
glyph = vf[code]
|
|
glyph.text, glyph.boxes, glyph.width
|
|
"""
|
|
|
|
def __init__(self, filename):
|
|
super().__init__(filename, 0)
|
|
try:
|
|
self._first_font = None
|
|
self._chars = {}
|
|
self._read()
|
|
finally:
|
|
self.close()
|
|
|
|
def __getitem__(self, code):
|
|
return self._chars[code]
|
|
|
|
def _read(self):
|
|
"""
|
|
Read one page from the file. Return True if successful,
|
|
False if there were no more pages.
|
|
"""
|
|
packet_char, packet_ends = None, None
|
|
packet_len, packet_width = None, None
|
|
while True:
|
|
byte = self.file.read(1)[0]
|
|
# If we are in a packet, execute the dvi instructions
|
|
if self.state is _dvistate.inpage:
|
|
byte_at = self.file.tell()-1
|
|
if byte_at == packet_ends:
|
|
self._finalize_packet(packet_char, packet_width)
|
|
packet_len, packet_char, packet_width = None, None, None
|
|
# fall through to out-of-packet code
|
|
elif byte_at > packet_ends:
|
|
raise ValueError("Packet length mismatch in vf file")
|
|
else:
|
|
if byte in (139, 140) or byte >= 243:
|
|
raise ValueError(
|
|
"Inappropriate opcode %d in vf file" % byte)
|
|
Dvi._dtable[byte](self, byte)
|
|
continue
|
|
|
|
# We are outside a packet
|
|
if byte < 242: # a short packet (length given by byte)
|
|
packet_len = byte
|
|
packet_char, packet_width = self._arg(1), self._arg(3)
|
|
packet_ends = self._init_packet(byte)
|
|
self.state = _dvistate.inpage
|
|
elif byte == 242: # a long packet
|
|
packet_len, packet_char, packet_width = \
|
|
[self._arg(x) for x in (4, 4, 4)]
|
|
self._init_packet(packet_len)
|
|
elif 243 <= byte <= 246:
|
|
k = self._arg(byte - 242, byte == 246)
|
|
c, s, d, a, l = [self._arg(x) for x in (4, 4, 4, 1, 1)]
|
|
self._fnt_def_real(k, c, s, d, a, l)
|
|
if self._first_font is None:
|
|
self._first_font = k
|
|
elif byte == 247: # preamble
|
|
i, k = self._arg(1), self._arg(1)
|
|
x = self.file.read(k)
|
|
cs, ds = self._arg(4), self._arg(4)
|
|
self._pre(i, x, cs, ds)
|
|
elif byte == 248: # postamble (just some number of 248s)
|
|
break
|
|
else:
|
|
raise ValueError("Unknown vf opcode %d" % byte)
|
|
|
|
def _init_packet(self, pl):
|
|
if self.state != _dvistate.outer:
|
|
raise ValueError("Misplaced packet in vf file")
|
|
self.h, self.v, self.w, self.x, self.y, self.z = 0, 0, 0, 0, 0, 0
|
|
self.stack, self.text, self.boxes = [], [], []
|
|
self.f = self._first_font
|
|
return self.file.tell() + pl
|
|
|
|
def _finalize_packet(self, packet_char, packet_width):
|
|
self._chars[packet_char] = Page(
|
|
text=self.text, boxes=self.boxes, width=packet_width,
|
|
height=None, descent=None)
|
|
self.state = _dvistate.outer
|
|
|
|
def _pre(self, i, x, cs, ds):
|
|
if self.state is not _dvistate.pre:
|
|
raise ValueError("pre command in middle of vf file")
|
|
if i != 202:
|
|
raise ValueError("Unknown vf format %d" % i)
|
|
if len(x):
|
|
_log.debug('vf file comment: %s', x)
|
|
self.state = _dvistate.outer
|
|
# cs = checksum, ds = design size
|
|
|
|
|
|
def _mul2012(num1, num2):
|
|
"""Multiply two numbers in 20.12 fixed point format."""
|
|
# Separated into a function because >> has surprising precedence
|
|
return (num1*num2) >> 20
|
|
|
|
|
|
class Tfm:
|
|
"""
|
|
A TeX Font Metric file.
|
|
|
|
This implementation covers only the bare minimum needed by the Dvi class.
|
|
|
|
Parameters
|
|
----------
|
|
filename : str or path-like
|
|
|
|
Attributes
|
|
----------
|
|
checksum : int
|
|
Used for verifying against the dvi file.
|
|
design_size : int
|
|
Design size of the font (unknown units)
|
|
width, height, depth : dict
|
|
Dimensions of each character, need to be scaled by the factor
|
|
specified in the dvi file. These are dicts because indexing may
|
|
not start from 0.
|
|
"""
|
|
__slots__ = ('checksum', 'design_size', 'width', 'height', 'depth')
|
|
|
|
def __init__(self, filename):
|
|
_log.debug('opening tfm file %s', filename)
|
|
with open(filename, 'rb') as file:
|
|
header1 = file.read(24)
|
|
lh, bc, ec, nw, nh, nd = struct.unpack('!6H', header1[2:14])
|
|
_log.debug('lh=%d, bc=%d, ec=%d, nw=%d, nh=%d, nd=%d',
|
|
lh, bc, ec, nw, nh, nd)
|
|
header2 = file.read(4*lh)
|
|
self.checksum, self.design_size = struct.unpack('!2I', header2[:8])
|
|
# there is also encoding information etc.
|
|
char_info = file.read(4*(ec-bc+1))
|
|
widths = struct.unpack(f'!{nw}i', file.read(4*nw))
|
|
heights = struct.unpack(f'!{nh}i', file.read(4*nh))
|
|
depths = struct.unpack(f'!{nd}i', file.read(4*nd))
|
|
self.width, self.height, self.depth = {}, {}, {}
|
|
for idx, char in enumerate(range(bc, ec+1)):
|
|
byte0 = char_info[4*idx]
|
|
byte1 = char_info[4*idx+1]
|
|
self.width[char] = widths[byte0]
|
|
self.height[char] = heights[byte1 >> 4]
|
|
self.depth[char] = depths[byte1 & 0xf]
|
|
|
|
|
|
PsFont = namedtuple('PsFont', 'texname psname effects encoding filename')
|
|
|
|
|
|
class PsfontsMap:
|
|
"""
|
|
A psfonts.map formatted file, mapping TeX fonts to PS fonts.
|
|
|
|
Parameters
|
|
----------
|
|
filename : str or path-like
|
|
|
|
Notes
|
|
-----
|
|
For historical reasons, TeX knows many Type-1 fonts by different
|
|
names than the outside world. (For one thing, the names have to
|
|
fit in eight characters.) Also, TeX's native fonts are not Type-1
|
|
but Metafont, which is nontrivial to convert to PostScript except
|
|
as a bitmap. While high-quality conversions to Type-1 format exist
|
|
and are shipped with modern TeX distributions, we need to know
|
|
which Type-1 fonts are the counterparts of which native fonts. For
|
|
these reasons a mapping is needed from internal font names to font
|
|
file names.
|
|
|
|
A texmf tree typically includes mapping files called e.g.
|
|
:file:`psfonts.map`, :file:`pdftex.map`, or :file:`dvipdfm.map`.
|
|
The file :file:`psfonts.map` is used by :program:`dvips`,
|
|
:file:`pdftex.map` by :program:`pdfTeX`, and :file:`dvipdfm.map`
|
|
by :program:`dvipdfm`. :file:`psfonts.map` might avoid embedding
|
|
the 35 PostScript fonts (i.e., have no filename for them, as in
|
|
the Times-Bold example above), while the pdf-related files perhaps
|
|
only avoid the "Base 14" pdf fonts. But the user may have
|
|
configured these files differently.
|
|
|
|
Examples
|
|
--------
|
|
>>> map = PsfontsMap(find_tex_file('pdftex.map'))
|
|
>>> entry = map[b'ptmbo8r']
|
|
>>> entry.texname
|
|
b'ptmbo8r'
|
|
>>> entry.psname
|
|
b'Times-Bold'
|
|
>>> entry.encoding
|
|
'/usr/local/texlive/2008/texmf-dist/fonts/enc/dvips/base/8r.enc'
|
|
>>> entry.effects
|
|
{'slant': 0.16700000000000001}
|
|
>>> entry.filename
|
|
"""
|
|
__slots__ = ('_filename', '_unparsed', '_parsed')
|
|
|
|
# Create a filename -> PsfontsMap cache, so that calling
|
|
# `PsfontsMap(filename)` with the same filename a second time immediately
|
|
# returns the same object.
|
|
@lru_cache
|
|
def __new__(cls, filename):
|
|
self = object.__new__(cls)
|
|
self._filename = os.fsdecode(filename)
|
|
# Some TeX distributions have enormous pdftex.map files which would
|
|
# take hundreds of milliseconds to parse, but it is easy enough to just
|
|
# store the unparsed lines (keyed by the first word, which is the
|
|
# texname) and parse them on-demand.
|
|
with open(filename, 'rb') as file:
|
|
self._unparsed = {}
|
|
for line in file:
|
|
tfmname = line.split(b' ', 1)[0]
|
|
self._unparsed.setdefault(tfmname, []).append(line)
|
|
self._parsed = {}
|
|
return self
|
|
|
|
def __getitem__(self, texname):
|
|
assert isinstance(texname, bytes)
|
|
if texname in self._unparsed:
|
|
for line in self._unparsed.pop(texname):
|
|
if self._parse_and_cache_line(line):
|
|
break
|
|
try:
|
|
return self._parsed[texname]
|
|
except KeyError:
|
|
raise LookupError(
|
|
f"An associated PostScript font (required by Matplotlib) "
|
|
f"could not be found for TeX font {texname.decode('ascii')!r} "
|
|
f"in {self._filename!r}; this problem can often be solved by "
|
|
f"installing a suitable PostScript font package in your TeX "
|
|
f"package manager") from None
|
|
|
|
def _parse_and_cache_line(self, line):
|
|
"""
|
|
Parse a line in the font mapping file.
|
|
|
|
The format is (partially) documented at
|
|
http://mirrors.ctan.org/systems/doc/pdftex/manual/pdftex-a.pdf
|
|
https://tug.org/texinfohtml/dvips.html#psfonts_002emap
|
|
Each line can have the following fields:
|
|
|
|
- tfmname (first, only required field),
|
|
- psname (defaults to tfmname, must come immediately after tfmname if
|
|
present),
|
|
- fontflags (integer, must come immediately after psname if present,
|
|
ignored by us),
|
|
- special (SlantFont and ExtendFont, only field that is double-quoted),
|
|
- fontfile, encodingfile (optional, prefixed by <, <<, or <[; << always
|
|
precedes a font, <[ always precedes an encoding, < can precede either
|
|
but then an encoding file must have extension .enc; < and << also
|
|
request different font subsetting behaviors but we ignore that; < can
|
|
be separated from the filename by whitespace).
|
|
|
|
special, fontfile, and encodingfile can appear in any order.
|
|
"""
|
|
# If the map file specifies multiple encodings for a font, we
|
|
# follow pdfTeX in choosing the last one specified. Such
|
|
# entries are probably mistakes but they have occurred.
|
|
# https://tex.stackexchange.com/q/10826/
|
|
|
|
if not line or line.startswith((b" ", b"%", b"*", b";", b"#")):
|
|
return
|
|
tfmname = basename = special = encodingfile = fontfile = None
|
|
is_subsetted = is_t1 = is_truetype = False
|
|
matches = re.finditer(br'"([^"]*)(?:"|$)|(\S+)', line)
|
|
for match in matches:
|
|
quoted, unquoted = match.groups()
|
|
if unquoted:
|
|
if unquoted.startswith(b"<<"): # font
|
|
fontfile = unquoted[2:]
|
|
elif unquoted.startswith(b"<["): # encoding
|
|
encodingfile = unquoted[2:]
|
|
elif unquoted.startswith(b"<"): # font or encoding
|
|
word = (
|
|
# <foo => foo
|
|
unquoted[1:]
|
|
# < by itself => read the next word
|
|
or next(filter(None, next(matches).groups())))
|
|
if word.endswith(b".enc"):
|
|
encodingfile = word
|
|
else:
|
|
fontfile = word
|
|
is_subsetted = True
|
|
elif tfmname is None:
|
|
tfmname = unquoted
|
|
elif basename is None:
|
|
basename = unquoted
|
|
elif quoted:
|
|
special = quoted
|
|
effects = {}
|
|
if special:
|
|
words = reversed(special.split())
|
|
for word in words:
|
|
if word == b"SlantFont":
|
|
effects["slant"] = float(next(words))
|
|
elif word == b"ExtendFont":
|
|
effects["extend"] = float(next(words))
|
|
|
|
# Verify some properties of the line that would cause it to be ignored
|
|
# otherwise.
|
|
if fontfile is not None:
|
|
if fontfile.endswith((b".ttf", b".ttc")):
|
|
is_truetype = True
|
|
elif not fontfile.endswith(b".otf"):
|
|
is_t1 = True
|
|
elif basename is not None:
|
|
is_t1 = True
|
|
if is_truetype and is_subsetted and encodingfile is None:
|
|
return
|
|
if not is_t1 and ("slant" in effects or "extend" in effects):
|
|
return
|
|
if abs(effects.get("slant", 0)) > 1:
|
|
return
|
|
if abs(effects.get("extend", 0)) > 2:
|
|
return
|
|
|
|
if basename is None:
|
|
basename = tfmname
|
|
if encodingfile is not None:
|
|
encodingfile = find_tex_file(encodingfile)
|
|
if fontfile is not None:
|
|
fontfile = find_tex_file(fontfile)
|
|
self._parsed[tfmname] = PsFont(
|
|
texname=tfmname, psname=basename, effects=effects,
|
|
encoding=encodingfile, filename=fontfile)
|
|
return True
|
|
|
|
|
|
def _parse_enc(path):
|
|
r"""
|
|
Parse a \*.enc file referenced from a psfonts.map style file.
|
|
|
|
The format supported by this function is a tiny subset of PostScript.
|
|
|
|
Parameters
|
|
----------
|
|
path : `os.PathLike`
|
|
|
|
Returns
|
|
-------
|
|
list
|
|
The nth entry of the list is the PostScript glyph name of the nth
|
|
glyph.
|
|
"""
|
|
no_comments = re.sub("%.*", "", Path(path).read_text(encoding="ascii"))
|
|
array = re.search(r"(?s)\[(.*)\]", no_comments).group(1)
|
|
lines = [line for line in array.split() if line]
|
|
if all(line.startswith("/") for line in lines):
|
|
return [line[1:] for line in lines]
|
|
else:
|
|
raise ValueError(f"Failed to parse {path} as Postscript encoding")
|
|
|
|
|
|
class _LuatexKpsewhich:
|
|
@lru_cache # A singleton.
|
|
def __new__(cls):
|
|
self = object.__new__(cls)
|
|
self._proc = self._new_proc()
|
|
return self
|
|
|
|
def _new_proc(self):
|
|
return subprocess.Popen(
|
|
["luatex", "--luaonly",
|
|
str(cbook._get_data_path("kpsewhich.lua"))],
|
|
stdin=subprocess.PIPE, stdout=subprocess.PIPE)
|
|
|
|
def search(self, filename):
|
|
if self._proc.poll() is not None: # Dead, restart it.
|
|
self._proc = self._new_proc()
|
|
self._proc.stdin.write(os.fsencode(filename) + b"\n")
|
|
self._proc.stdin.flush()
|
|
out = self._proc.stdout.readline().rstrip()
|
|
return None if out == b"nil" else os.fsdecode(out)
|
|
|
|
|
|
@lru_cache
|
|
def find_tex_file(filename):
|
|
"""
|
|
Find a file in the texmf tree using kpathsea_.
|
|
|
|
The kpathsea library, provided by most existing TeX distributions, both
|
|
on Unix-like systems and on Windows (MikTeX), is invoked via a long-lived
|
|
luatex process if luatex is installed, or via kpsewhich otherwise.
|
|
|
|
.. _kpathsea: https://www.tug.org/kpathsea/
|
|
|
|
Parameters
|
|
----------
|
|
filename : str or path-like
|
|
|
|
Raises
|
|
------
|
|
FileNotFoundError
|
|
If the file is not found.
|
|
"""
|
|
|
|
# we expect these to always be ascii encoded, but use utf-8
|
|
# out of caution
|
|
if isinstance(filename, bytes):
|
|
filename = filename.decode('utf-8', errors='replace')
|
|
|
|
try:
|
|
lk = _LuatexKpsewhich()
|
|
except FileNotFoundError:
|
|
lk = None # Fallback to directly calling kpsewhich, as below.
|
|
|
|
if lk:
|
|
path = lk.search(filename)
|
|
else:
|
|
if sys.platform == 'win32':
|
|
# On Windows only, kpathsea can use utf-8 for cmd args and output.
|
|
# The `command_line_encoding` environment variable is set to force
|
|
# it to always use utf-8 encoding. See Matplotlib issue #11848.
|
|
kwargs = {'env': {**os.environ, 'command_line_encoding': 'utf-8'},
|
|
'encoding': 'utf-8'}
|
|
else: # On POSIX, run through the equivalent of os.fsdecode().
|
|
kwargs = {'encoding': sys.getfilesystemencoding(),
|
|
'errors': 'surrogateescape'}
|
|
|
|
try:
|
|
path = (cbook._check_and_log_subprocess(['kpsewhich', filename],
|
|
_log, **kwargs)
|
|
.rstrip('\n'))
|
|
except (FileNotFoundError, RuntimeError):
|
|
path = None
|
|
|
|
if path:
|
|
return path
|
|
else:
|
|
raise FileNotFoundError(
|
|
f"Matplotlib's TeX implementation searched for a file named "
|
|
f"{filename!r} in your texmf tree, but could not find it")
|
|
|
|
|
|
@lru_cache
|
|
def _fontfile(cls, suffix, texname):
|
|
return cls(find_tex_file(texname + suffix))
|
|
|
|
|
|
_tfmfile = partial(_fontfile, Tfm, ".tfm")
|
|
_vffile = partial(_fontfile, Vf, ".vf")
|
|
|
|
|
|
if __name__ == '__main__':
|
|
from argparse import ArgumentParser
|
|
import itertools
|
|
|
|
parser = ArgumentParser()
|
|
parser.add_argument("filename")
|
|
parser.add_argument("dpi", nargs="?", type=float, default=None)
|
|
args = parser.parse_args()
|
|
with Dvi(args.filename, args.dpi) as dvi:
|
|
fontmap = PsfontsMap(find_tex_file('pdftex.map'))
|
|
for page in dvi:
|
|
print(f"=== new page === "
|
|
f"(w: {page.width}, h: {page.height}, d: {page.descent})")
|
|
for font, group in itertools.groupby(
|
|
page.text, lambda text: text.font):
|
|
print(f"font: {font.texname.decode('latin-1')!r}\t"
|
|
f"scale: {font._scale / 2 ** 20}")
|
|
print("x", "y", "glyph", "chr", "w", "(glyphs)", sep="\t")
|
|
for text in group:
|
|
print(text.x, text.y, text.glyph,
|
|
chr(text.glyph) if chr(text.glyph).isprintable()
|
|
else ".",
|
|
text.width, sep="\t")
|
|
if page.boxes:
|
|
print("x", "y", "h", "w", "", "(boxes)", sep="\t")
|
|
for box in page.boxes:
|
|
print(box.x, box.y, box.height, box.width, sep="\t")
|