87 lines
2.4 KiB
Python
87 lines
2.4 KiB
Python
from typing import Any, List, Optional, Union
|
|
|
|
from pdfminer.pdftypes import PDFObjRef
|
|
from pdfminer.psparser import PSLiteral
|
|
from pdfminer.utils import PDFDocEncoding
|
|
|
|
from .exceptions import MalformedPDFException
|
|
|
|
|
|
def decode_text(s: Union[bytes, str]) -> str:
    """
    Convert a PDF text string into a Python ``str``.

    Byte strings that start with the UTF-16BE byte-order mark are decoded
    as UTF-16BE, with undecodable bytes ignored. Any other input is mapped
    element by element through pdfminer's ``PDFDocEncoding`` table. If an
    element's code is outside that table, ``str(s)`` is returned instead.
    """
    utf16be_bom = b"\xfe\xff"
    if isinstance(s, bytes) and s.startswith(utf16be_bom):
        return s[2:].decode("utf-16be", "ignore")

    pieces = []
    try:
        for item in s:
            # Iterating bytes yields ints; iterating str yields characters.
            code = ord(item) if isinstance(item, str) else item
            pieces.append(PDFDocEncoding[code])
    except IndexError:
        return str(s)
    return "".join(pieces)
|
|
|
|
|
|
def resolve_and_decode(obj: Any) -> Any:
    """
    Recursively resolve and decode a metadata value.

    Any object exposing a ``resolve`` attribute is first replaced by the
    result of ``obj.resolve()``. The resulting value is then handled by type:

    - ``list``: a new list with every item processed recursively;
    - ``dict``: a new dict with every value processed recursively
      (keys are kept as-is);
    - ``PSLiteral``: its ``name`` passed through ``decode_text``;
    - ``str`` / ``bytes``: passed through ``decode_text``;
    - anything else: returned unchanged.

    Neither the input nor any resolved container is modified; lists and
    dicts are always returned as fresh objects.
    """
    if hasattr(obj, "resolve"):
        obj = obj.resolve()

    if isinstance(obj, list):
        return [resolve_and_decode(item) for item in obj]
    if isinstance(obj, dict):
        # Build a fresh dict instead of assigning back into ``obj``: a
        # resolved dict may be shared with other holders of the same
        # reference, and overwriting its raw values with decoded ones would
        # change what they see.
        return {key: resolve_and_decode(value) for key, value in obj.items()}
    if isinstance(obj, PSLiteral):
        return decode_text(obj.name)
    if isinstance(obj, (str, bytes)):
        return decode_text(obj)
    return obj
|
|
|
|
|
|
def decode_psl_list(_list: List[Union[PSLiteral, str]]) -> List[str]:
    """
    Return a new list with each ``PSLiteral`` replaced by its decoded name.

    Items that are not ``PSLiteral`` instances are carried over unchanged.
    """
    decoded = []
    for item in _list:
        if isinstance(item, PSLiteral):
            item = decode_text(item.name)
        decoded.append(item)
    return decoded
|
|
|
|
|
|
def resolve(x: Any) -> Any:
    """
    Return ``x.resolve()`` if ``x`` is a ``PDFObjRef``, otherwise ``x``.

    Only one level is resolved; the result is not inspected further.
    """
    return x.resolve() if isinstance(x, PDFObjRef) else x
|
|
|
|
|
|
def get_dict_type(d: Any) -> Optional[str]:
    """
    Return the ``"Type"`` entry of ``d``, or ``None`` if ``d`` is not a dict.

    A ``PSLiteral`` entry is returned as its decoded name. Any other entry
    (including a missing one, which yields ``None``) is returned as found.
    """
    if isinstance(d, dict):
        type_entry = d.get("Type")
        if isinstance(type_entry, PSLiteral):
            return decode_text(type_entry.name)
        return type_entry
    return None
|
|
|
|
|
|
def resolve_all(x: Any) -> Any:
    """
    Recursively resolve ``x`` and everything nested inside it.

    - A ``PDFObjRef`` is resolved and the result processed recursively,
      except that a reference whose target is a dict of type ``"Page"`` is
      returned unresolved.
    - Lists and tuples are rebuilt as the same type with each item processed.
    - Dicts are rebuilt with each value processed; in a dict of type
      ``"Annot"`` the ``"Parent"`` value is kept as-is.
    - Any other value is returned unchanged.

    Raises ``MalformedPDFException`` if following a reference exhausts the
    recursion limit.
    """
    if isinstance(x, PDFObjRef):
        target = x.resolve()

        # Page references are handed back unresolved to avoid infinite
        # recursion.
        if get_dict_type(target) == "Page":
            return x

        try:
            return resolve_all(target)
        except RecursionError as e:
            raise MalformedPDFException(e)

    if isinstance(x, (list, tuple)):
        container = type(x)
        return container(resolve_all(item) for item in x)

    if isinstance(x, dict):
        # Keys listed here keep their original (unresolved) value.
        untouched = ["Parent"] if get_dict_type(x) == "Annot" else []
        resolved = {}
        for key, value in x.items():
            resolved[key] = value if key in untouched else resolve_all(value)
        return resolved

    return x
|