48 lines
1.8 KiB
Python
48 lines
1.8 KiB
Python
"""Python implementation of ASCII85/ASCIIHex decoder (Adobe version)."""
|
|
|
|
import re
|
|
from base64 import a85decode
|
|
from binascii import unhexlify
|
|
|
|
# Optional leading `<~` (or a bare `~`), with surrounding white space.
start_re = re.compile(rb"^\s*<?\s*~\s*")
# Optional trailing `~>` (or a bare `~`), with surrounding white space.
end_re = re.compile(rb"\s*~\s*>?\s*$")
# Bytes skipped while decoding: the PDF white-space characters (NUL, HT, LF,
# FF, CR, SP) plus VT, which `a85decode` already ignores by default.  The
# default set lacks NUL and FF, so it is spelled out explicitly.
_A85_IGNORED = b"\x00\t\n\x0b\x0c\r "


def ascii85decode(data: bytes) -> bytes:
    """Decode an ASCII85 (Adobe flavour) encoded byte string.

    ASCII85 represents every group of four bytes as five printable ASCII
    characters drawn from an alphabet of 85 symbols (possible because
    256**4 < 85**5).  A final group shorter than four bytes is encoded
    with a special padding rule, and `z` abbreviates four zero bytes.

    Adobe's encoder terminates the data with `b"~>"` and, although the
    PDF specification does not mention it, may also prefix it with
    `b"<~"`.  Neither marker can be relied upon: stream lengths are
    sometimes off by one, leaving only a `~` at the end.  Because `<` and
    `>` are themselves valid ASCII85 digits they cannot be stripped on
    their own, so as a compromise only a leading `<~` or `~` and a
    trailing `~>` or `~` are removed.

    White space inside the data is ignored, including form feed and NUL,
    which the PDF specification counts as white space.

    Args:
        data: The encoded bytes, with or without the start/end markers.

    Returns:
        The decoded bytes.

    Raises:
        ValueError: If the payload contains characters outside the
            ASCII85 alphabet or an impossible digit group (raised by
            `base64.a85decode`).
    """
    data = start_re.sub(b"", data)
    data = end_re.sub(b"", data)
    return a85decode(data, ignorechars=_A85_IGNORED)
|
|
|
|
|
|
# Any single white-space byte; used to strip white space before decoding.
bws_re = re.compile(rb"\s")


def asciihexdecode(data: bytes) -> bytes:
    """Decode data encoded with the PDF ASCIIHexDecode filter.

    See PDF Reference v1.4, section 3.3.1.  Each pair of ASCII
    hexadecimal digits (0-9, A-F or a-f) yields one byte of output.
    White space is ignored.  A right angle bracket (`>`) marks end of
    data (EOD); anything after it is discarded.  If an odd number of
    digits precedes EOD, the filter behaves as if a `0` followed the
    last digit.

    The EOD marker may be missing (for example when the stream length is
    slightly off), so the same `0` padding is applied when the data
    simply ends after an odd number of digits.

    Args:
        data: The encoded bytes, optionally terminated by `b">"`.

    Returns:
        The decoded bytes.

    Raises:
        binascii.Error: If a character other than a hexadecimal digit
            remains before the EOD marker (raised by `unhexlify`).
    """
    data = bws_re.sub(b"", data)
    idx = data.find(b">")
    if idx != -1:
        # Drop the EOD marker and everything after it.
        data = data[:idx]
    # Pad on the parity of what is left, so a missing EOD marker is handled
    # the same way as a present one.
    if len(data) % 2 == 1:
        data += b"0"
    return unhexlify(data)
|