eptm_dashboard/.venv/lib/python3.12/site-packages/pdfplumber/convert.py

import base64
from typing import Any, Callable, Dict, List, Optional, Tuple

from pdfminer.psparser import PSLiteral

from .utils import decode_text

ENCODINGS_TO_TRY = [
    "utf-8",
    "latin-1",
    "utf-16",
    "utf-16le",
]

CSV_COLS_REQUIRED = [
    "object_type",
]

CSV_COLS_TO_PREPEND = [
    "page_number",
    "x0",
    "x1",
    "y0",
    "y1",
    "doctop",
    "top",
    "bottom",
    "width",
    "height",
]


def get_attr_filter(
    include_attrs: Optional[List[str]] = None, exclude_attrs: Optional[List[str]] = None
) -> Callable[[str], bool]:
    if include_attrs is not None and exclude_attrs is not None:
        raise ValueError(
            "Cannot specify `include_attrs` and `exclude_attrs` at the same time."
        )

    elif include_attrs is not None:
        incl = set(CSV_COLS_REQUIRED + include_attrs)
        return lambda attr: attr in incl

    elif exclude_attrs is not None:
        nonexcludable = set(exclude_attrs).intersection(set(CSV_COLS_REQUIRED))
        if len(nonexcludable):
            raise ValueError(
                f"Cannot exclude these required properties: {list(nonexcludable)}"
            )
        excl = set(exclude_attrs)
        return lambda attr: attr not in excl

    else:
        return lambda attr: True


def to_b64(data: bytes) -> str:
    return base64.b64encode(data).decode("ascii")


class Serializer:
    def __init__(
        self,
        precision: Optional[int] = None,
        include_attrs: Optional[List[str]] = None,
        exclude_attrs: Optional[List[str]] = None,
    ):

        self.precision = precision
        self.attr_filter = get_attr_filter(
            include_attrs=include_attrs, exclude_attrs=exclude_attrs
        )

    def serialize(self, obj: Any) -> Any:
        if obj is None:
            return None

        t = type(obj)

        # Basic types don't need to be converted
        if t in (int, str):
            return obj

        # Use one of the custom converters, if possible
        fn = getattr(self, f"do_{t.__name__}", None)
        if fn is not None:
            return fn(obj)

        # Otherwise, just use the string-representation
        else:
            return str(obj)

    def do_float(self, x: float) -> float:
        return x if self.precision is None else round(x, self.precision)

    def do_bool(self, x: bool) -> int:
        return int(x)

    def do_list(self, obj: List[Any]) -> List[Any]:
        return list(self.serialize(x) for x in obj)

    def do_tuple(self, obj: Tuple[Any, ...]) -> Tuple[Any, ...]:
        return tuple(self.serialize(x) for x in obj)

    def do_dict(self, obj: Dict[str, Any]) -> Dict[str, Any]:
        if "object_type" in obj.keys():
            return {k: self.serialize(v) for k, v in obj.items() if self.attr_filter(k)}
        else:
            return {k: self.serialize(v) for k, v in obj.items()}

    def do_PDFStream(self, obj: Any) -> Dict[str, Optional[str]]:
        return {"rawdata": to_b64(obj.rawdata) if obj.rawdata else None}

    def do_PSLiteral(self, obj: PSLiteral) -> str:
        return decode_text(obj.name)

    def do_bytes(self, obj: bytes) -> Optional[str]:
        for e in ENCODINGS_TO_TRY:
            try:
                return obj.decode(e)
            except UnicodeDecodeError:  # pragma: no cover
                return None
        # If none of the decodings work, raise whatever error
        # decoding with utf-8 causes
        obj.decode(ENCODINGS_TO_TRY[0])  # pragma: no cover
        return None  # pragma: no cover