1219 lines
38 KiB
Python
1219 lines
38 KiB
Python
import contextlib
|
|
import logging
|
|
import struct
|
|
from collections.abc import Iterable, Iterator, Mapping
|
|
from io import BytesIO
|
|
from typing import (
|
|
TYPE_CHECKING,
|
|
Any,
|
|
BinaryIO,
|
|
cast,
|
|
)
|
|
|
|
from pdfminer import settings
|
|
from pdfminer.casting import safe_float, safe_rect_list
|
|
from pdfminer.cmapdb import (
|
|
CMap,
|
|
CMapBase,
|
|
CMapDB,
|
|
CMapParser,
|
|
FileUnicodeMap,
|
|
IdentityUnicodeMap,
|
|
UnicodeMap,
|
|
)
|
|
from pdfminer.encodingdb import EncodingDB, name2unicode
|
|
from pdfminer.fontmetrics import FONT_METRICS
|
|
from pdfminer.pdfexceptions import PDFException, PDFKeyError, PDFValueError
|
|
from pdfminer.pdftypes import (
|
|
PDFStream,
|
|
dict_value,
|
|
int_value,
|
|
list_value,
|
|
num_value,
|
|
resolve1,
|
|
resolve_all,
|
|
stream_value,
|
|
)
|
|
from pdfminer.psexceptions import PSEOF
|
|
from pdfminer.psparser import (
|
|
KWD,
|
|
LIT,
|
|
PSKeyword,
|
|
PSLiteral,
|
|
PSStackParser,
|
|
literal_name,
|
|
)
|
|
from pdfminer.utils import Matrix, Point, Rect, apply_matrix_norm, choplist, nunpack
|
|
|
|
if TYPE_CHECKING:
|
|
from pdfminer.pdfinterp import PDFResourceManager
|
|
|
|
log = logging.getLogger(__name__)
|
|
|
|
|
|
def get_widths(seq: Iterable[object]) -> dict[str | int, float]:
|
|
"""Build a mapping of character widths for horizontal writing."""
|
|
widths: dict[int, float] = {}
|
|
r: list[float] = []
|
|
for v in seq:
|
|
v = resolve1(v)
|
|
if isinstance(v, list):
|
|
if r:
|
|
char1 = r[-1]
|
|
for i, w in enumerate(v):
|
|
widths[cast(int, char1) + i] = w
|
|
r = []
|
|
elif isinstance(v, (int, float)): # == utils.isnumber(v)
|
|
r.append(v)
|
|
if len(r) == 3:
|
|
(char1, char2, w) = r
|
|
if isinstance(char1, int) and isinstance(char2, int):
|
|
for i in range(char1, char2 + 1):
|
|
widths[i] = w
|
|
else:
|
|
log.warning(
|
|
f"Skipping invalid font width specification for {char1} to "
|
|
f"{char2} because either of them is not an int"
|
|
)
|
|
r = []
|
|
else:
|
|
log.warning(
|
|
f"Skipping invalid font width specification for {v} "
|
|
f"because it is not a number or a list"
|
|
)
|
|
return cast(dict[str | int, float], widths)
|
|
|
|
|
|
def get_widths2(seq: Iterable[object]) -> dict[int, tuple[float, Point]]:
|
|
"""Build a mapping of character widths for vertical writing."""
|
|
widths: dict[int, tuple[float, Point]] = {}
|
|
r: list[float] = []
|
|
for v in seq:
|
|
if isinstance(v, list):
|
|
if r:
|
|
char1 = r[-1]
|
|
for i, (w, vx, vy) in enumerate(choplist(3, v)):
|
|
widths[cast(int, char1) + i] = (w, (vx, vy))
|
|
r = []
|
|
elif isinstance(v, (int, float)): # == utils.isnumber(v)
|
|
r.append(v)
|
|
if len(r) == 5:
|
|
(char1, char2, w, vx, vy) = r
|
|
for i in range(cast(int, char1), cast(int, char2) + 1):
|
|
widths[i] = (w, (vx, vy))
|
|
r = []
|
|
return widths
|
|
|
|
|
|
class FontMetricsDB:
|
|
@classmethod
|
|
def get_metrics(cls, fontname: str) -> tuple[dict[str, object], dict[str, int]]:
|
|
return FONT_METRICS[fontname]
|
|
|
|
|
|
# int here means that we're not extending PSStackParser with additional types.
|
|
class Type1FontHeaderParser(PSStackParser[int]):
|
|
KEYWORD_BEGIN = KWD(b"begin")
|
|
KEYWORD_END = KWD(b"end")
|
|
KEYWORD_DEF = KWD(b"def")
|
|
KEYWORD_PUT = KWD(b"put")
|
|
KEYWORD_DICT = KWD(b"dict")
|
|
KEYWORD_ARRAY = KWD(b"array")
|
|
KEYWORD_READONLY = KWD(b"readonly")
|
|
KEYWORD_FOR = KWD(b"for")
|
|
|
|
def __init__(self, data: BinaryIO) -> None:
|
|
PSStackParser.__init__(self, data)
|
|
self._cid2unicode: dict[int, str] = {}
|
|
|
|
def get_encoding(self) -> dict[int, str]:
|
|
"""Parse the font encoding.
|
|
|
|
The Type1 font encoding maps character codes to character names. These
|
|
character names could either be standard Adobe glyph names, or
|
|
character names associated with custom CharStrings for this font. A
|
|
CharString is a sequence of operations that describe how the character
|
|
should be drawn. Currently, this function returns '' (empty string)
|
|
for character names that are associated with a CharStrings.
|
|
|
|
Reference: Adobe Systems Incorporated, Adobe Type 1 Font Format
|
|
|
|
:returns mapping of character identifiers (cid's) to unicode characters
|
|
"""
|
|
while 1:
|
|
try:
|
|
(cid, name) = self.nextobject()
|
|
except PSEOF:
|
|
break
|
|
try:
|
|
self._cid2unicode[cid] = name2unicode(cast(str, name))
|
|
except KeyError as e:
|
|
log.debug(str(e))
|
|
return self._cid2unicode
|
|
|
|
def do_keyword(self, pos: int, token: PSKeyword) -> None:
|
|
if token is self.KEYWORD_PUT:
|
|
((_, key), (_, value)) = self.pop(2)
|
|
if isinstance(key, int) and isinstance(value, PSLiteral):
|
|
self.add_results((key, literal_name(value)))
|
|
|
|
|
|
NIBBLES = ("0", "1", "2", "3", "4", "5", "6", "7", "8", "9", ".", "e", "e-", None, "-")
|
|
|
|
# Mapping of cmap names. Original cmap name is kept if not in the mapping.
|
|
# (missing reference for why DLIdent is mapped to Identity)
|
|
IDENTITY_ENCODER = {
|
|
"DLIdent-H": "Identity-H",
|
|
"DLIdent-V": "Identity-V",
|
|
}
|
|
|
|
|
|
def getdict(data: bytes) -> dict[int, list[float | int]]:
|
|
d: dict[int, list[float | int]] = {}
|
|
fp = BytesIO(data)
|
|
stack: list[float | int] = []
|
|
while 1:
|
|
c = fp.read(1)
|
|
if not c:
|
|
break
|
|
b0 = ord(c)
|
|
if b0 <= 21:
|
|
d[b0] = stack
|
|
stack = []
|
|
continue
|
|
if b0 == 30:
|
|
s = ""
|
|
loop = True
|
|
while loop:
|
|
b = ord(fp.read(1))
|
|
for n in (b >> 4, b & 15):
|
|
if n == 15:
|
|
loop = False
|
|
else:
|
|
nibble = NIBBLES[n]
|
|
assert nibble is not None
|
|
s += nibble
|
|
value = float(s)
|
|
elif b0 >= 32 and b0 <= 246:
|
|
value = b0 - 139
|
|
else:
|
|
b1 = ord(fp.read(1))
|
|
if b0 >= 247 and b0 <= 250:
|
|
value = ((b0 - 247) << 8) + b1 + 108
|
|
elif b0 >= 251 and b0 <= 254:
|
|
value = -((b0 - 251) << 8) - b1 - 108
|
|
else:
|
|
b2 = ord(fp.read(1))
|
|
if b1 >= 128:
|
|
b1 -= 256
|
|
if b0 == 28:
|
|
value = b1 << 8 | b2
|
|
else:
|
|
value = b1 << 24 | b2 << 16 | struct.unpack(">H", fp.read(2))[0]
|
|
stack.append(value)
|
|
return d
|
|
|
|
|
|
class CFFFont:
|
|
STANDARD_STRINGS = (
|
|
".notdef",
|
|
"space",
|
|
"exclam",
|
|
"quotedbl",
|
|
"numbersign",
|
|
"dollar",
|
|
"percent",
|
|
"ampersand",
|
|
"quoteright",
|
|
"parenleft",
|
|
"parenright",
|
|
"asterisk",
|
|
"plus",
|
|
"comma",
|
|
"hyphen",
|
|
"period",
|
|
"slash",
|
|
"zero",
|
|
"one",
|
|
"two",
|
|
"three",
|
|
"four",
|
|
"five",
|
|
"six",
|
|
"seven",
|
|
"eight",
|
|
"nine",
|
|
"colon",
|
|
"semicolon",
|
|
"less",
|
|
"equal",
|
|
"greater",
|
|
"question",
|
|
"at",
|
|
"A",
|
|
"B",
|
|
"C",
|
|
"D",
|
|
"E",
|
|
"F",
|
|
"G",
|
|
"H",
|
|
"I",
|
|
"J",
|
|
"K",
|
|
"L",
|
|
"M",
|
|
"N",
|
|
"O",
|
|
"P",
|
|
"Q",
|
|
"R",
|
|
"S",
|
|
"T",
|
|
"U",
|
|
"V",
|
|
"W",
|
|
"X",
|
|
"Y",
|
|
"Z",
|
|
"bracketleft",
|
|
"backslash",
|
|
"bracketright",
|
|
"asciicircum",
|
|
"underscore",
|
|
"quoteleft",
|
|
"a",
|
|
"b",
|
|
"c",
|
|
"d",
|
|
"e",
|
|
"f",
|
|
"g",
|
|
"h",
|
|
"i",
|
|
"j",
|
|
"k",
|
|
"l",
|
|
"m",
|
|
"n",
|
|
"o",
|
|
"p",
|
|
"q",
|
|
"r",
|
|
"s",
|
|
"t",
|
|
"u",
|
|
"v",
|
|
"w",
|
|
"x",
|
|
"y",
|
|
"z",
|
|
"braceleft",
|
|
"bar",
|
|
"braceright",
|
|
"asciitilde",
|
|
"exclamdown",
|
|
"cent",
|
|
"sterling",
|
|
"fraction",
|
|
"yen",
|
|
"florin",
|
|
"section",
|
|
"currency",
|
|
"quotesingle",
|
|
"quotedblleft",
|
|
"guillemotleft",
|
|
"guilsinglleft",
|
|
"guilsinglright",
|
|
"fi",
|
|
"fl",
|
|
"endash",
|
|
"dagger",
|
|
"daggerdbl",
|
|
"periodcentered",
|
|
"paragraph",
|
|
"bullet",
|
|
"quotesinglbase",
|
|
"quotedblbase",
|
|
"quotedblright",
|
|
"guillemotright",
|
|
"ellipsis",
|
|
"perthousand",
|
|
"questiondown",
|
|
"grave",
|
|
"acute",
|
|
"circumflex",
|
|
"tilde",
|
|
"macron",
|
|
"breve",
|
|
"dotaccent",
|
|
"dieresis",
|
|
"ring",
|
|
"cedilla",
|
|
"hungarumlaut",
|
|
"ogonek",
|
|
"caron",
|
|
"emdash",
|
|
"AE",
|
|
"ordfeminine",
|
|
"Lslash",
|
|
"Oslash",
|
|
"OE",
|
|
"ordmasculine",
|
|
"ae",
|
|
"dotlessi",
|
|
"lslash",
|
|
"oslash",
|
|
"oe",
|
|
"germandbls",
|
|
"onesuperior",
|
|
"logicalnot",
|
|
"mu",
|
|
"trademark",
|
|
"Eth",
|
|
"onehalf",
|
|
"plusminus",
|
|
"Thorn",
|
|
"onequarter",
|
|
"divide",
|
|
"brokenbar",
|
|
"degree",
|
|
"thorn",
|
|
"threequarters",
|
|
"twosuperior",
|
|
"registered",
|
|
"minus",
|
|
"eth",
|
|
"multiply",
|
|
"threesuperior",
|
|
"copyright",
|
|
"Aacute",
|
|
"Acircumflex",
|
|
"Adieresis",
|
|
"Agrave",
|
|
"Aring",
|
|
"Atilde",
|
|
"Ccedilla",
|
|
"Eacute",
|
|
"Ecircumflex",
|
|
"Edieresis",
|
|
"Egrave",
|
|
"Iacute",
|
|
"Icircumflex",
|
|
"Idieresis",
|
|
"Igrave",
|
|
"Ntilde",
|
|
"Oacute",
|
|
"Ocircumflex",
|
|
"Odieresis",
|
|
"Ograve",
|
|
"Otilde",
|
|
"Scaron",
|
|
"Uacute",
|
|
"Ucircumflex",
|
|
"Udieresis",
|
|
"Ugrave",
|
|
"Yacute",
|
|
"Ydieresis",
|
|
"Zcaron",
|
|
"aacute",
|
|
"acircumflex",
|
|
"adieresis",
|
|
"agrave",
|
|
"aring",
|
|
"atilde",
|
|
"ccedilla",
|
|
"eacute",
|
|
"ecircumflex",
|
|
"edieresis",
|
|
"egrave",
|
|
"iacute",
|
|
"icircumflex",
|
|
"idieresis",
|
|
"igrave",
|
|
"ntilde",
|
|
"oacute",
|
|
"ocircumflex",
|
|
"odieresis",
|
|
"ograve",
|
|
"otilde",
|
|
"scaron",
|
|
"uacute",
|
|
"ucircumflex",
|
|
"udieresis",
|
|
"ugrave",
|
|
"yacute",
|
|
"ydieresis",
|
|
"zcaron",
|
|
"exclamsmall",
|
|
"Hungarumlautsmall",
|
|
"dollaroldstyle",
|
|
"dollarsuperior",
|
|
"ampersandsmall",
|
|
"Acutesmall",
|
|
"parenleftsuperior",
|
|
"parenrightsuperior",
|
|
"twodotenleader",
|
|
"onedotenleader",
|
|
"zerooldstyle",
|
|
"oneoldstyle",
|
|
"twooldstyle",
|
|
"threeoldstyle",
|
|
"fouroldstyle",
|
|
"fiveoldstyle",
|
|
"sixoldstyle",
|
|
"sevenoldstyle",
|
|
"eightoldstyle",
|
|
"nineoldstyle",
|
|
"commasuperior",
|
|
"threequartersemdash",
|
|
"periodsuperior",
|
|
"questionsmall",
|
|
"asuperior",
|
|
"bsuperior",
|
|
"centsuperior",
|
|
"dsuperior",
|
|
"esuperior",
|
|
"isuperior",
|
|
"lsuperior",
|
|
"msuperior",
|
|
"nsuperior",
|
|
"osuperior",
|
|
"rsuperior",
|
|
"ssuperior",
|
|
"tsuperior",
|
|
"ff",
|
|
"ffi",
|
|
"ffl",
|
|
"parenleftinferior",
|
|
"parenrightinferior",
|
|
"Circumflexsmall",
|
|
"hyphensuperior",
|
|
"Gravesmall",
|
|
"Asmall",
|
|
"Bsmall",
|
|
"Csmall",
|
|
"Dsmall",
|
|
"Esmall",
|
|
"Fsmall",
|
|
"Gsmall",
|
|
"Hsmall",
|
|
"Ismall",
|
|
"Jsmall",
|
|
"Ksmall",
|
|
"Lsmall",
|
|
"Msmall",
|
|
"Nsmall",
|
|
"Osmall",
|
|
"Psmall",
|
|
"Qsmall",
|
|
"Rsmall",
|
|
"Ssmall",
|
|
"Tsmall",
|
|
"Usmall",
|
|
"Vsmall",
|
|
"Wsmall",
|
|
"Xsmall",
|
|
"Ysmall",
|
|
"Zsmall",
|
|
"colonmonetary",
|
|
"onefitted",
|
|
"rupiah",
|
|
"Tildesmall",
|
|
"exclamdownsmall",
|
|
"centoldstyle",
|
|
"Lslashsmall",
|
|
"Scaronsmall",
|
|
"Zcaronsmall",
|
|
"Dieresissmall",
|
|
"Brevesmall",
|
|
"Caronsmall",
|
|
"Dotaccentsmall",
|
|
"Macronsmall",
|
|
"figuredash",
|
|
"hypheninferior",
|
|
"Ogoneksmall",
|
|
"Ringsmall",
|
|
"Cedillasmall",
|
|
"questiondownsmall",
|
|
"oneeighth",
|
|
"threeeighths",
|
|
"fiveeighths",
|
|
"seveneighths",
|
|
"onethird",
|
|
"twothirds",
|
|
"zerosuperior",
|
|
"foursuperior",
|
|
"fivesuperior",
|
|
"sixsuperior",
|
|
"sevensuperior",
|
|
"eightsuperior",
|
|
"ninesuperior",
|
|
"zeroinferior",
|
|
"oneinferior",
|
|
"twoinferior",
|
|
"threeinferior",
|
|
"fourinferior",
|
|
"fiveinferior",
|
|
"sixinferior",
|
|
"seveninferior",
|
|
"eightinferior",
|
|
"nineinferior",
|
|
"centinferior",
|
|
"dollarinferior",
|
|
"periodinferior",
|
|
"commainferior",
|
|
"Agravesmall",
|
|
"Aacutesmall",
|
|
"Acircumflexsmall",
|
|
"Atildesmall",
|
|
"Adieresissmall",
|
|
"Aringsmall",
|
|
"AEsmall",
|
|
"Ccedillasmall",
|
|
"Egravesmall",
|
|
"Eacutesmall",
|
|
"Ecircumflexsmall",
|
|
"Edieresissmall",
|
|
"Igravesmall",
|
|
"Iacutesmall",
|
|
"Icircumflexsmall",
|
|
"Idieresissmall",
|
|
"Ethsmall",
|
|
"Ntildesmall",
|
|
"Ogravesmall",
|
|
"Oacutesmall",
|
|
"Ocircumflexsmall",
|
|
"Otildesmall",
|
|
"Odieresissmall",
|
|
"OEsmall",
|
|
"Oslashsmall",
|
|
"Ugravesmall",
|
|
"Uacutesmall",
|
|
"Ucircumflexsmall",
|
|
"Udieresissmall",
|
|
"Yacutesmall",
|
|
"Thornsmall",
|
|
"Ydieresissmall",
|
|
"001.000",
|
|
"001.001",
|
|
"001.002",
|
|
"001.003",
|
|
"Black",
|
|
"Bold",
|
|
"Book",
|
|
"Light",
|
|
"Medium",
|
|
"Regular",
|
|
"Roman",
|
|
"Semibold",
|
|
)
|
|
|
|
class INDEX:
|
|
def __init__(self, fp: BinaryIO) -> None:
|
|
self.fp = fp
|
|
self.offsets: list[int] = []
|
|
(count, offsize) = struct.unpack(">HB", self.fp.read(3))
|
|
for _i in range(count + 1):
|
|
self.offsets.append(nunpack(self.fp.read(offsize)))
|
|
self.base = self.fp.tell() - 1
|
|
self.fp.seek(self.base + self.offsets[-1])
|
|
|
|
def __repr__(self) -> str:
|
|
return f"<INDEX: size={len(self)}>"
|
|
|
|
def __len__(self) -> int:
|
|
return len(self.offsets) - 1
|
|
|
|
def __getitem__(self, i: int) -> bytes:
|
|
self.fp.seek(self.base + self.offsets[i])
|
|
return self.fp.read(self.offsets[i + 1] - self.offsets[i])
|
|
|
|
def __iter__(self) -> Iterator[bytes]:
|
|
return iter(self[i] for i in range(len(self)))
|
|
|
|
def __init__(self, name: str, fp: BinaryIO) -> None:
|
|
self.name = name
|
|
self.fp = fp
|
|
# Header
|
|
(_major, _minor, hdrsize, _offsize) = struct.unpack("BBBB", self.fp.read(4))
|
|
self.fp.read(hdrsize - 4)
|
|
# Name INDEX
|
|
self.name_index = self.INDEX(self.fp)
|
|
# Top DICT INDEX
|
|
self.dict_index = self.INDEX(self.fp)
|
|
# String INDEX
|
|
self.string_index = self.INDEX(self.fp)
|
|
# Global Subr INDEX
|
|
self.subr_index = self.INDEX(self.fp)
|
|
# Top DICT DATA
|
|
self.top_dict = getdict(self.dict_index[0])
|
|
(charset_pos,) = self.top_dict.get(15, [0])
|
|
(encoding_pos,) = self.top_dict.get(16, [0])
|
|
(charstring_pos,) = self.top_dict.get(17, [0])
|
|
# CharStrings
|
|
self.fp.seek(cast(int, charstring_pos))
|
|
self.charstring = self.INDEX(self.fp)
|
|
self.nglyphs = len(self.charstring)
|
|
# Encodings
|
|
self.code2gid = {}
|
|
self.gid2code = {}
|
|
self.fp.seek(cast(int, encoding_pos))
|
|
format = self.fp.read(1)
|
|
if format == b"\x00":
|
|
# Format 0
|
|
(n,) = struct.unpack("B", self.fp.read(1))
|
|
for code, gid in enumerate(struct.unpack("B" * n, self.fp.read(n))):
|
|
self.code2gid[code] = gid
|
|
self.gid2code[gid] = code
|
|
elif format == b"\x01":
|
|
# Format 1
|
|
(n,) = struct.unpack("B", self.fp.read(1))
|
|
code = 0
|
|
for _i in range(n):
|
|
(first, nleft) = struct.unpack("BB", self.fp.read(2))
|
|
for gid in range(first, first + nleft + 1):
|
|
self.code2gid[code] = gid
|
|
self.gid2code[gid] = code
|
|
code += 1
|
|
else:
|
|
raise PDFValueError(f"unsupported encoding format: {format!r}")
|
|
# Charsets
|
|
self.name2gid = {}
|
|
self.gid2name = {}
|
|
self.fp.seek(cast(int, charset_pos))
|
|
format = self.fp.read(1)
|
|
if format == b"\x00":
|
|
# Format 0
|
|
n = self.nglyphs - 1
|
|
for gid, sid in enumerate(
|
|
cast(
|
|
tuple[int, ...], struct.unpack(">" + "H" * n, self.fp.read(2 * n))
|
|
),
|
|
):
|
|
gid += 1
|
|
sidname = self.getstr(sid)
|
|
self.name2gid[sidname] = gid
|
|
self.gid2name[gid] = sidname
|
|
elif format == b"\x01":
|
|
# Format 1
|
|
(n,) = struct.unpack("B", self.fp.read(1))
|
|
sid = 0
|
|
for _i in range(n):
|
|
(first, nleft) = struct.unpack("BB", self.fp.read(2))
|
|
for gid in range(first, first + nleft + 1):
|
|
sidname = self.getstr(sid)
|
|
self.name2gid[sidname] = gid
|
|
self.gid2name[gid] = sidname
|
|
sid += 1
|
|
elif format == b"\x02":
|
|
# Format 2
|
|
raise AssertionError(str(("Unhandled", format)))
|
|
else:
|
|
raise PDFValueError(f"unsupported charset format: {format!r}")
|
|
|
|
def getstr(self, sid: int) -> str | bytes:
|
|
# This returns str for one of the STANDARD_STRINGS but bytes otherwise,
|
|
# and appears to be a needless source of type complexity.
|
|
if sid < len(self.STANDARD_STRINGS):
|
|
return self.STANDARD_STRINGS[sid]
|
|
return self.string_index[sid - len(self.STANDARD_STRINGS)]
|
|
|
|
|
|
class TrueTypeFont:
|
|
class CMapNotFound(PDFException):
|
|
pass
|
|
|
|
def __init__(self, name: str, fp: BinaryIO) -> None:
|
|
self.name = name
|
|
self.fp = fp
|
|
self.tables: dict[bytes, tuple[int, int]] = {}
|
|
self.fonttype = fp.read(4)
|
|
try:
|
|
(ntables, _1, _2, _3) = cast(
|
|
tuple[int, int, int, int],
|
|
struct.unpack(">HHHH", fp.read(8)),
|
|
)
|
|
for _ in range(ntables):
|
|
(name_bytes, _tsum, offset, length) = cast(
|
|
tuple[bytes, int, int, int],
|
|
struct.unpack(">4sLLL", fp.read(16)),
|
|
)
|
|
self.tables[name_bytes] = (offset, length)
|
|
except struct.error:
|
|
# Do not fail if there are not enough bytes to read. Even for
|
|
# corrupted PDFs we would like to get as much information as
|
|
# possible, so continue.
|
|
pass
|
|
|
|
def create_unicode_map(self) -> FileUnicodeMap:
|
|
if b"cmap" not in self.tables:
|
|
raise TrueTypeFont.CMapNotFound
|
|
(base_offset, _length) = self.tables[b"cmap"]
|
|
fp = self.fp
|
|
fp.seek(base_offset)
|
|
(_version, nsubtables) = cast(tuple[int, int], struct.unpack(">HH", fp.read(4)))
|
|
subtables: list[tuple[int, int, int]] = []
|
|
for _i in range(nsubtables):
|
|
subtables.append(
|
|
cast(tuple[int, int, int], struct.unpack(">HHL", fp.read(8))),
|
|
)
|
|
char2gid: dict[int, int] = {}
|
|
# Only supports subtable type 0, 2 and 4.
|
|
for platform_id, encoding_id, st_offset in subtables:
|
|
# Skip non-Unicode cmaps.
|
|
# https://docs.microsoft.com/en-us/typography/opentype/spec/cmap
|
|
if not (platform_id == 0 or (platform_id == 3 and encoding_id in [1, 10])):
|
|
continue
|
|
fp.seek(base_offset + st_offset)
|
|
(fmttype, _fmtlen, _fmtlang) = cast(
|
|
tuple[int, int, int],
|
|
struct.unpack(">HHH", fp.read(6)),
|
|
)
|
|
if fmttype == 0:
|
|
char2gid.update(
|
|
enumerate(
|
|
cast(tuple[int, ...], struct.unpack(">256B", fp.read(256))),
|
|
),
|
|
)
|
|
elif fmttype == 2:
|
|
subheaderkeys = cast(
|
|
tuple[int, ...],
|
|
struct.unpack(">256H", fp.read(512)),
|
|
)
|
|
firstbytes = [0] * 8192
|
|
for i, k in enumerate(subheaderkeys):
|
|
firstbytes[k // 8] = i
|
|
nhdrs = max(subheaderkeys) // 8 + 1
|
|
hdrs: list[tuple[int, int, int, int, int]] = []
|
|
for i in range(nhdrs):
|
|
(firstcode, entcount, delta, offset) = cast(
|
|
tuple[int, int, int, int],
|
|
struct.unpack(">HHhH", fp.read(8)),
|
|
)
|
|
hdrs.append((i, firstcode, entcount, delta, fp.tell() - 2 + offset))
|
|
for i, firstcode, entcount, delta, pos in hdrs:
|
|
if not entcount:
|
|
continue
|
|
first = firstcode + (firstbytes[i] << 8)
|
|
fp.seek(pos)
|
|
for c in range(entcount):
|
|
gid = cast(tuple[int], struct.unpack(">H", fp.read(2)))[0]
|
|
if gid:
|
|
gid += delta
|
|
char2gid[first + c] = gid
|
|
elif fmttype == 4:
|
|
(segcount, _1, _2, _3) = cast(
|
|
tuple[int, int, int, int],
|
|
struct.unpack(">HHHH", fp.read(8)),
|
|
)
|
|
segcount //= 2
|
|
ecs = cast(
|
|
tuple[int, ...],
|
|
struct.unpack(f">{segcount}H", fp.read(2 * segcount)),
|
|
)
|
|
fp.read(2)
|
|
scs = cast(
|
|
tuple[int, ...],
|
|
struct.unpack(f">{segcount}H", fp.read(2 * segcount)),
|
|
)
|
|
idds = cast(
|
|
tuple[int, ...],
|
|
struct.unpack(f">{segcount}h", fp.read(2 * segcount)),
|
|
)
|
|
pos = fp.tell()
|
|
idrs = cast(
|
|
tuple[int, ...],
|
|
struct.unpack(f">{segcount}H", fp.read(2 * segcount)),
|
|
)
|
|
for ec, sc, idd, idr in zip(ecs, scs, idds, idrs, strict=False):
|
|
if idr:
|
|
fp.seek(pos + idr)
|
|
for c in range(sc, ec + 1):
|
|
b = cast(tuple[int], struct.unpack(">H", fp.read(2)))[0]
|
|
char2gid[c] = (b + idd) & 0xFFFF
|
|
else:
|
|
for c in range(sc, ec + 1):
|
|
char2gid[c] = (c + idd) & 0xFFFF
|
|
else:
|
|
raise AssertionError(str(("Unhandled", fmttype)))
|
|
if not char2gid:
|
|
raise TrueTypeFont.CMapNotFound
|
|
# create unicode map
|
|
unicode_map = FileUnicodeMap()
|
|
for char, gid in char2gid.items():
|
|
unicode_map.add_cid2unichr(gid, char)
|
|
return unicode_map
|
|
|
|
|
|
class PDFFontError(PDFException):
|
|
pass
|
|
|
|
|
|
class PDFUnicodeNotDefined(PDFFontError):
|
|
pass
|
|
|
|
|
|
LITERAL_STANDARD_ENCODING = LIT("StandardEncoding")
|
|
LITERAL_TYPE1C = LIT("Type1C")
|
|
|
|
# Font widths are maintained in a dict type that maps from *either* unicode
|
|
# chars or integer character IDs.
|
|
FontWidthDict = dict[int | str, float]
|
|
|
|
|
|
class PDFFont:
|
|
def __init__(
|
|
self,
|
|
descriptor: Mapping[str, Any],
|
|
widths: FontWidthDict,
|
|
default_width: float | None = None,
|
|
) -> None:
|
|
self.descriptor = descriptor
|
|
self.widths: FontWidthDict = resolve_all(widths)
|
|
self.fontname = resolve1(descriptor.get("FontName", "unknown"))
|
|
if isinstance(self.fontname, PSLiteral):
|
|
self.fontname = literal_name(self.fontname)
|
|
self.flags = int_value(descriptor.get("Flags", 0))
|
|
self.ascent = num_value(descriptor.get("Ascent", 0))
|
|
self.descent = num_value(descriptor.get("Descent", 0))
|
|
self.italic_angle = num_value(descriptor.get("ItalicAngle", 0))
|
|
if default_width is None:
|
|
self.default_width = num_value(descriptor.get("MissingWidth", 0))
|
|
else:
|
|
self.default_width = default_width
|
|
self.default_width = resolve1(self.default_width)
|
|
self.leading = num_value(descriptor.get("Leading", 0))
|
|
self.bbox = self._parse_bbox(descriptor)
|
|
self.hscale = self.vscale = 0.001
|
|
|
|
# PDF RM 9.8.1 specifies /Descent should always be a negative number.
|
|
# PScript5.dll seems to produce Descent with a positive number, but
|
|
# text analysis will be wrong if this is taken as correct. So force
|
|
# descent to negative.
|
|
if self.descent > 0:
|
|
self.descent = -self.descent
|
|
|
|
def __repr__(self) -> str:
|
|
return "<PDFFont>"
|
|
|
|
def is_vertical(self) -> bool:
|
|
return False
|
|
|
|
def is_multibyte(self) -> bool:
|
|
return False
|
|
|
|
def decode(self, bytes: bytes) -> Iterable[int]:
|
|
return bytearray(bytes) # map(ord, bytes)
|
|
|
|
def get_ascent(self) -> float:
|
|
"""Ascent above the baseline, in text space units"""
|
|
return self.ascent * self.vscale
|
|
|
|
def get_descent(self) -> float:
|
|
"""Descent below the baseline, in text space units; always negative"""
|
|
return self.descent * self.vscale
|
|
|
|
def get_width(self) -> float:
|
|
w = self.bbox[2] - self.bbox[0]
|
|
if w == 0:
|
|
w = -self.default_width
|
|
return w * self.hscale
|
|
|
|
def get_height(self) -> float:
|
|
h = self.bbox[3] - self.bbox[1]
|
|
if h == 0:
|
|
h = self.ascent - self.descent
|
|
return h * self.vscale
|
|
|
|
def char_width(self, cid: int) -> float:
|
|
# Because character widths may be mapping either IDs or strings,
|
|
# we try to lookup the character ID first, then its str equivalent.
|
|
cid_width = safe_float(self.widths.get(cid))
|
|
if cid_width is not None:
|
|
return cid_width * self.hscale
|
|
|
|
try:
|
|
str_cid = self.to_unichr(cid)
|
|
cid_width = safe_float(self.widths.get(str_cid))
|
|
if cid_width is not None:
|
|
return cid_width * self.hscale
|
|
|
|
except PDFUnicodeNotDefined:
|
|
pass
|
|
|
|
return self.default_width * self.hscale
|
|
|
|
def char_disp(self, cid: int) -> float | tuple[float | None, float]:
|
|
"""Returns an integer for horizontal fonts, a tuple for vertical fonts."""
|
|
return 0
|
|
|
|
def string_width(self, s: bytes) -> float:
|
|
return sum(self.char_width(cid) for cid in self.decode(s))
|
|
|
|
def to_unichr(self, cid: int) -> str:
|
|
raise NotImplementedError
|
|
|
|
@staticmethod
|
|
def _parse_bbox(descriptor: Mapping[str, Any]) -> Rect:
|
|
"""Parse FontBBox from the fonts descriptor"""
|
|
font_bbox = resolve_all(descriptor.get("FontBBox"))
|
|
bbox = safe_rect_list(font_bbox)
|
|
if bbox is None:
|
|
log.warning(
|
|
f"Could not get FontBBox from font descriptor because "
|
|
f"{font_bbox!r} cannot be parsed as 4 floats"
|
|
)
|
|
return 0.0, 0.0, 0.0, 0.0
|
|
return bbox
|
|
|
|
|
|
class PDFSimpleFont(PDFFont):
|
|
def __init__(
|
|
self,
|
|
descriptor: Mapping[str, Any],
|
|
widths: FontWidthDict,
|
|
spec: Mapping[str, Any],
|
|
) -> None:
|
|
# Font encoding is specified either by a name of
|
|
# built-in encoding or a dictionary that describes
|
|
# the differences.
|
|
|
|
default_encoding = LITERAL_STANDARD_ENCODING
|
|
if literal_name(spec.get("Subtype")) == "TrueType":
|
|
# PDF spec: TrueType fonts without Encoding default to WinAnsiEncoding
|
|
default_encoding = LIT("WinAnsiEncoding")
|
|
|
|
encoding = default_encoding
|
|
if "Encoding" in spec:
|
|
encoding = resolve1(spec["Encoding"])
|
|
|
|
if isinstance(encoding, dict):
|
|
name = literal_name(encoding.get("BaseEncoding", default_encoding))
|
|
diff = list_value(encoding.get("Differences", []))
|
|
self.cid2unicode = EncodingDB.get_encoding(name, diff)
|
|
else:
|
|
self.cid2unicode = EncodingDB.get_encoding(literal_name(encoding))
|
|
|
|
self.unicode_map: UnicodeMap | None = None
|
|
if "ToUnicode" in spec:
|
|
strm = stream_value(spec["ToUnicode"])
|
|
self.unicode_map = FileUnicodeMap()
|
|
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
|
|
PDFFont.__init__(self, descriptor, widths)
|
|
|
|
def to_unichr(self, cid: int) -> str:
|
|
if self.unicode_map:
|
|
try:
|
|
return self.unicode_map.get_unichr(cid)
|
|
except KeyError:
|
|
pass
|
|
try:
|
|
return self.cid2unicode[cid]
|
|
except KeyError as err:
|
|
raise PDFUnicodeNotDefined(None, cid) from err
|
|
|
|
|
|
class PDFType1Font(PDFSimpleFont):
|
|
def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
|
|
try:
|
|
self.basefont = literal_name(spec["BaseFont"])
|
|
except KeyError:
|
|
if settings.STRICT:
|
|
raise PDFFontError("BaseFont is missing") from None
|
|
self.basefont = "unknown"
|
|
|
|
widths: FontWidthDict
|
|
try:
|
|
(descriptor, int_widths) = FontMetricsDB.get_metrics(self.basefont)
|
|
widths = cast(dict[str | int, float], int_widths) # implicit int->float
|
|
except KeyError:
|
|
descriptor = dict_value(spec.get("FontDescriptor", {}))
|
|
firstchar = int_value(spec.get("FirstChar", 0))
|
|
# lastchar = int_value(spec.get('LastChar', 255))
|
|
width_list = list_value(spec.get("Widths", [0] * 256))
|
|
widths = {i + firstchar: resolve1(w) for (i, w) in enumerate(width_list)}
|
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
|
if "Encoding" not in spec and "FontFile" in descriptor:
|
|
# try to recover the missing encoding info from the font file.
|
|
self.fontfile = stream_value(descriptor.get("FontFile"))
|
|
length1 = int_value(self.fontfile["Length1"])
|
|
data = self.fontfile.get_data()[:length1]
|
|
parser = Type1FontHeaderParser(BytesIO(data))
|
|
self.cid2unicode = parser.get_encoding()
|
|
|
|
def __repr__(self) -> str:
|
|
return f"<PDFType1Font: basefont={self.basefont!r}>"
|
|
|
|
|
|
class PDFTrueTypeFont(PDFType1Font):
|
|
def __repr__(self) -> str:
|
|
return f"<PDFTrueTypeFont: basefont={self.basefont!r}>"
|
|
|
|
|
|
class PDFType3Font(PDFSimpleFont):
|
|
def __init__(self, rsrcmgr: "PDFResourceManager", spec: Mapping[str, Any]) -> None:
|
|
firstchar = int_value(spec.get("FirstChar", 0))
|
|
# lastchar = int_value(spec.get('LastChar', 0))
|
|
width_list = list_value(spec.get("Widths", [0] * 256))
|
|
widths: dict[str | int, float] = {
|
|
i + firstchar: w for (i, w) in enumerate(width_list)
|
|
}
|
|
if "FontDescriptor" in spec:
|
|
descriptor = dict_value(spec["FontDescriptor"])
|
|
else:
|
|
descriptor = {"Ascent": 0, "Descent": 0, "FontBBox": spec["FontBBox"]}
|
|
PDFSimpleFont.__init__(self, descriptor, widths, spec)
|
|
self.matrix = cast(Matrix, tuple(list_value(spec.get("FontMatrix"))))
|
|
(_, self.descent, _, self.ascent) = self.bbox
|
|
(self.hscale, self.vscale) = apply_matrix_norm(self.matrix, (1, 1))
|
|
|
|
def __repr__(self) -> str:
|
|
return "<PDFType3Font>"
|
|
|
|
|
|
class PDFCIDFont(PDFFont):
|
|
default_disp: float | tuple[float | None, float]
|
|
|
|
def __init__(
|
|
self,
|
|
rsrcmgr: "PDFResourceManager",
|
|
spec: Mapping[str, Any],
|
|
strict: bool = settings.STRICT,
|
|
) -> None:
|
|
try:
|
|
self.basefont = literal_name(spec["BaseFont"])
|
|
except KeyError:
|
|
if strict:
|
|
raise PDFFontError("BaseFont is missing") from None
|
|
self.basefont = "unknown"
|
|
self.cidsysteminfo = dict_value(spec.get("CIDSystemInfo", {}))
|
|
cid_registry = resolve1(self.cidsysteminfo.get("Registry", b"unknown")).decode(
|
|
"latin1",
|
|
)
|
|
cid_ordering = resolve1(self.cidsysteminfo.get("Ordering", b"unknown")).decode(
|
|
"latin1",
|
|
)
|
|
self.cidcoding = f"{cid_registry.strip()}-{cid_ordering.strip()}"
|
|
self.cmap: CMapBase = self.get_cmap_from_spec(spec, strict)
|
|
|
|
try:
|
|
descriptor = dict_value(spec["FontDescriptor"])
|
|
except KeyError:
|
|
if strict:
|
|
raise PDFFontError("FontDescriptor is missing") from None
|
|
descriptor = {}
|
|
ttf = None
|
|
if "FontFile2" in descriptor:
|
|
self.fontfile = stream_value(descriptor.get("FontFile2"))
|
|
ttf = TrueTypeFont(self.basefont, BytesIO(self.fontfile.get_data()))
|
|
self.unicode_map: UnicodeMap | None = None
|
|
if "ToUnicode" in spec:
|
|
if isinstance(spec["ToUnicode"], PDFStream):
|
|
strm = stream_value(spec["ToUnicode"])
|
|
self.unicode_map = FileUnicodeMap()
|
|
CMapParser(self.unicode_map, BytesIO(strm.get_data())).run()
|
|
else:
|
|
cmap_name = literal_name(spec["ToUnicode"])
|
|
encoding = literal_name(spec["Encoding"])
|
|
if (
|
|
"Identity" in cid_ordering
|
|
or "Identity" in cmap_name
|
|
or "Identity" in encoding
|
|
):
|
|
self.unicode_map = IdentityUnicodeMap()
|
|
elif self.cidcoding in ("Adobe-Identity", "Adobe-UCS"):
|
|
if ttf:
|
|
with contextlib.suppress(TrueTypeFont.CMapNotFound):
|
|
self.unicode_map = ttf.create_unicode_map()
|
|
else:
|
|
with contextlib.suppress(CMapDB.CMapNotFound):
|
|
self.unicode_map = CMapDB.get_unicode_map(
|
|
self.cidcoding,
|
|
self.cmap.is_vertical(),
|
|
)
|
|
|
|
self.vertical = self.cmap.is_vertical()
|
|
if self.vertical:
|
|
# writing mode: vertical
|
|
widths2 = get_widths2(list_value(spec.get("W2", [])))
|
|
self.disps = {cid: (vx, vy) for (cid, (_, (vx, vy))) in widths2.items()}
|
|
(vy, w) = resolve1(spec.get("DW2", [880, -1000]))
|
|
self.default_disp = (None, vy)
|
|
widths: dict[str | int, float] = {
|
|
cid: w for (cid, (w, _)) in widths2.items()
|
|
}
|
|
default_width = w
|
|
else:
|
|
# writing mode: horizontal
|
|
self.disps = {}
|
|
self.default_disp = 0
|
|
widths = get_widths(list_value(spec.get("W", [])))
|
|
default_width = spec.get("DW", 1000)
|
|
PDFFont.__init__(self, descriptor, widths, default_width=default_width)
|
|
|
|
def get_cmap_from_spec(self, spec: Mapping[str, Any], strict: bool) -> CMapBase:
|
|
"""Get cmap from font specification
|
|
|
|
For certain PDFs, Encoding Type isn't mentioned as an attribute of
|
|
Encoding but as an attribute of CMapName, where CMapName is an
|
|
attribute of spec['Encoding'].
|
|
The horizontal/vertical modes are mentioned with different name
|
|
such as 'DLIdent-H/V','OneByteIdentityH/V','Identity-H/V'.
|
|
"""
|
|
cmap_name = self._get_cmap_name(spec, strict)
|
|
|
|
try:
|
|
return CMapDB.get_cmap(cmap_name)
|
|
except CMapDB.CMapNotFound as e:
|
|
if strict:
|
|
raise PDFFontError(e) from e
|
|
return CMap()
|
|
|
|
@staticmethod
|
|
def _get_cmap_name(spec: Mapping[str, Any], strict: bool) -> str:
|
|
"""Get cmap name from font specification"""
|
|
cmap_name = "unknown" # default value
|
|
|
|
try:
|
|
spec_encoding = spec["Encoding"]
|
|
if hasattr(spec_encoding, "name"):
|
|
cmap_name = literal_name(spec["Encoding"])
|
|
else:
|
|
cmap_name = literal_name(spec_encoding["CMapName"])
|
|
except KeyError:
|
|
if strict:
|
|
raise PDFFontError("Encoding is unspecified") from None
|
|
|
|
if type(cmap_name) is PDFStream: # type: ignore[comparison-overlap]
|
|
cmap_name_stream: PDFStream = cast(PDFStream, cmap_name)
|
|
if "CMapName" in cmap_name_stream:
|
|
cmap_name = cmap_name_stream.get("CMapName").name
|
|
elif strict:
|
|
raise PDFFontError("CMapName unspecified for encoding")
|
|
|
|
return IDENTITY_ENCODER.get(cmap_name, cmap_name)
|
|
|
|
def __repr__(self) -> str:
|
|
return f"<PDFCIDFont: basefont={self.basefont!r}, cidcoding={self.cidcoding!r}>"
|
|
|
|
def is_vertical(self) -> bool:
|
|
return self.vertical
|
|
|
|
def is_multibyte(self) -> bool:
|
|
return True
|
|
|
|
def decode(self, bytes: bytes) -> Iterable[int]:
|
|
return self.cmap.decode(bytes)
|
|
|
|
def char_disp(self, cid: int) -> float | tuple[float | None, float]:
|
|
"""Returns an integer for horizontal fonts, a tuple for vertical fonts."""
|
|
return self.disps.get(cid, self.default_disp)
|
|
|
|
def to_unichr(self, cid: int) -> str:
|
|
try:
|
|
if not self.unicode_map:
|
|
raise PDFKeyError(cid)
|
|
return self.unicode_map.get_unichr(cid)
|
|
except KeyError as err:
|
|
raise PDFUnicodeNotDefined(self.cidcoding, cid) from err
|