eptm_dashboard/.venv/lib/python3.12/site-packages/pdfplumber/utils/pdfinternals.py

87 lines
2.4 KiB
Python

from typing import Any, List, Optional, Union
from pdfminer.pdftypes import PDFObjRef
from pdfminer.psparser import PSLiteral
from pdfminer.utils import PDFDocEncoding
from .exceptions import MalformedPDFException
def decode_text(s: Union[bytes, str]) -> str:
"""
Decodes a PDFDocEncoding string to Unicode.
Adds py3 compatibility to pdfminer's version.
"""
if isinstance(s, bytes) and s.startswith(b"\xfe\xff"):
return str(s[2:], "utf-16be", "ignore")
try:
ords = (ord(c) if isinstance(c, str) else c for c in s)
return "".join(PDFDocEncoding[o] for o in ords)
except IndexError:
return str(s)
def resolve_and_decode(obj: Any) -> Any:
"""Recursively resolve the metadata values."""
if hasattr(obj, "resolve"):
obj = obj.resolve()
if isinstance(obj, list):
return list(map(resolve_and_decode, obj))
elif isinstance(obj, PSLiteral):
return decode_text(obj.name)
elif isinstance(obj, (str, bytes)):
return decode_text(obj)
elif isinstance(obj, dict):
for k, v in obj.items():
obj[k] = resolve_and_decode(v)
return obj
return obj
def decode_psl_list(_list: List[Union[PSLiteral, str]]) -> List[str]:
return [
decode_text(value.name) if isinstance(value, PSLiteral) else value
for value in _list
]
def resolve(x: Any) -> Any:
if isinstance(x, PDFObjRef):
return x.resolve()
else:
return x
def get_dict_type(d: Any) -> Optional[str]:
if not isinstance(d, dict):
return None
t = d.get("Type")
if isinstance(t, PSLiteral):
return decode_text(t.name)
else:
return t
def resolve_all(x: Any) -> Any:
"""
Recursively resolves the given object and all the internals.
"""
if isinstance(x, PDFObjRef):
resolved = x.resolve()
# Avoid infinite recursion
if get_dict_type(resolved) == "Page":
return x
try:
return resolve_all(resolved)
except RecursionError as e:
raise MalformedPDFException(e)
elif isinstance(x, (list, tuple)):
return type(x)(resolve_all(v) for v in x)
elif isinstance(x, dict):
exceptions = ["Parent"] if get_dict_type(x) == "Annot" else []
return {k: v if k in exceptions else resolve_all(v) for k, v in x.items()}
else:
return x