# eptm_dashboard/.venv/lib/python3.12/site-packages/pdfplumber/structure.py

import itertools
import logging
import re
from collections import deque
from dataclasses import asdict, dataclass, field
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Iterable,
    Iterator,
    List,
    Optional,
    Pattern,
    Tuple,
    Union,
)

from pdfminer.data_structures import NumberTree
from pdfminer.pdfparser import PDFParser
from pdfminer.pdftypes import PDFObjRef, resolve1
from pdfminer.psparser import PSLiteral

from ._typing import T_bbox, T_obj
from .utils import decode_text, geometry

# Module-level logger, named after this module.
logger = logging.getLogger(__name__)


if TYPE_CHECKING:  # pragma: nocover
    # Imported for type annotations only; skipped at runtime.
    from .page import Page
    from .pdf import PDF


# Signature of a predicate accepted by `find()` / `find_all()`: it receives
# a structure element and returns True when that element matches.
MatchFunc = Callable[["PDFStructElement"], bool]


def _find_all(
    elements: Iterable["PDFStructElement"],
    matcher: Union[str, Pattern[str], MatchFunc],
) -> Iterator["PDFStructElement"]:
    """
    Shared depth-first search behind `find_all()` on trees and elements.

    `matcher` may be an exact element name, a compiled regular
    expression (applied with `match()` to the element name), or a
    predicate called with each element.  Matching elements are yielded
    lazily in document (pre-order) order.
    """

    def has_tag(node: "PDFStructElement") -> bool:
        """Accept elements whose name equals the requested tag."""
        return node.type == matcher

    def tag_matches_pattern(node: "PDFStructElement") -> bool:
        """Accept elements whose name matches the regular expression."""
        return matcher.match(node.type) is not None  # type: ignore

    accepts: Any
    if isinstance(matcher, str):
        accepts = has_tag
    elif isinstance(matcher, re.Pattern):
        accepts = tag_matches_pattern
    else:
        accepts = matcher

    # Explicit stack: the next element to visit sits at the end, so
    # children are pushed in reverse to be popped in their own order.
    pending = list(elements)
    pending.reverse()
    while pending:
        node = pending.pop()
        if accepts(node):
            yield node
        pending.extend(reversed(node.children))


class Findable:
    """Mixin providing `find()` and `find_all()` for any object that
    exposes a `children` list of structure elements."""

    children: List["PDFStructElement"]

    def find_all(
        self, matcher: Union[str, Pattern[str], MatchFunc]
    ) -> Iterator["PDFStructElement"]:
        """Lazily yield every matching element below this node,
        depth-first.

        `matcher` is an element name, a compiled regular expression,
        or a callable that takes a `PDFStructElement` and returns
        `True` for a match.
        """
        matches = _find_all(self.children, matcher)
        return matches

    def find(
        self, matcher: Union[str, Pattern[str], MatchFunc]
    ) -> Optional["PDFStructElement"]:
        """Return the first matching element below this node, or
        `None` when nothing matches.

        `matcher` is an element name, a compiled regular expression,
        or a callable that takes a `PDFStructElement` and returns
        `True` for a match.
        """
        return next(_find_all(self.children, matcher), None)


@dataclass
class PDFStructElement(Findable):
    """One node of the parsed structure tree.

    Iterating over an element yields its direct children.
    """

    # Element name (tag).
    type: str
    revision: Optional[int]
    id: Optional[str]
    lang: Optional[str]
    alt_text: Optional[str]
    actual_text: Optional[str]
    title: Optional[str]
    page_number: Optional[int]
    attributes: Dict[str, Any] = field(default_factory=dict)
    # Marked-content IDs attached directly to this element.
    mcids: List[int] = field(default_factory=list)
    children: List["PDFStructElement"] = field(default_factory=list)

    def __iter__(self) -> Iterator["PDFStructElement"]:
        return iter(self.children)

    def all_mcids(self) -> Iterator[Tuple[Optional[int], int]]:
        """Yield `(page_number, mcid)` for this element and all of its
        descendants, in depth-first order.

        The page number is that of the element owning the MCID and may
        be `None`.
        """
        # Pre-order walk with an explicit stack; children are pushed in
        # reverse so they are popped in their original order.
        stack = [self]
        while stack:
            node = stack.pop()
            for mcid in node.mcids:
                yield node.page_number, mcid
            stack.extend(reversed(node.children))

    def to_dict(self) -> Dict[str, Any]:
        """Return a nested dict for this subtree with `None`, empty-list
        and empty-dict entries removed."""
        result = asdict(self)
        # Every nested element dict is pruned independently, so the
        # visiting order is irrelevant.
        queue = deque([result])
        while queue:
            node = queue.popleft()
            for key, value in list(node.items()):
                if value is None or value == [] or value == {}:
                    del node[key]
            queue.extend(node.get("children", ()))
        return result


class StructTreeMissing(ValueError):
    """Raised when a PDF's catalog has no `StructTreeRoot` entry."""


class PDFStructTree(Findable):
    """Parse the structure tree of a PDF.

    The constructor takes a `pdfplumber.PDF` and optionally a
    `pdfplumber.Page`.  To avoid creating the entire tree for a large
    document it is recommended to provide a page.

    This class creates a representation of the portion of the
    structure tree that reaches marked content sections, either for a
    single page, or for the whole document.  Note that this is slightly
    different from the behaviour of other PDF libraries which will
    also include structure elements with no content.

    If the PDF has no structure, the constructor will raise
    `StructTreeMissing`.

    """

    page: Optional["Page"]

    def __init__(self, doc: "PDF", page: Optional["Page"] = None):
        """Build the structure tree for `page`, or for the whole document
        when `page` is `None`.

        Raises `StructTreeMissing` if the catalog has no usable
        `StructTreeRoot`.
        """
        self.doc = doc.doc
        if "StructTreeRoot" not in self.doc.catalog:
            raise StructTreeMissing("PDF has no structure")
        self.root = resolve1(self.doc.catalog["StructTreeRoot"])
        # A dangling reference resolves to something that is not a
        # dictionary; report it as a missing tree rather than failing
        # later with an AttributeError.
        if not isinstance(self.root, dict):
            raise StructTreeMissing("PDF has no structure")
        self.role_map = resolve1(self.root.get("RoleMap", {})) or {}
        self.class_map = resolve1(self.root.get("ClassMap", {})) or {}
        self.children: List[PDFStructElement] = []

        # If we have a specific page then we will work backwards from
        # its ParentTree - this is because structure elements could
        # span multiple pages, and the "Pg" attribute is *optional*,
        # so this is the approved way to get a page's structure...
        if page is not None:
            self.page = page
            self.pages = {page.page_number: page}
            self.page_dict = None
            # ...EXCEPT that the ParentTree is sometimes missing, in which
            # case we fall back to the non-approved way.
            parent_tree_obj = self.root.get("ParentTree")
            if parent_tree_obj is None:
                self._parse_struct_tree()
            else:
                parent_tree = NumberTree(parent_tree_obj)
                # If there is no marked content in the structure tree for
                # this page (which can happen even when there is a
                # structure tree) then there is no `StructParents`.
                # Note however that if there are XObjects in a page,
                # *they* may have `StructParent` (not `StructParents`)
                if "StructParents" not in self.page.page_obj.attrs:
                    return
                parent_id = resolve1(self.page.page_obj.attrs["StructParents"])
                # NumberTree should have a `get` method like it does in pdf.js...
                # Use a default so a missing key cannot leak StopIteration
                # out of the constructor.
                parent_array = resolve1(
                    next(
                        (
                            array
                            for num, array in parent_tree.values
                            if num == parent_id
                        ),
                        None,
                    )
                )
                if isinstance(parent_array, list):
                    self._parse_parent_tree(parent_array)
                else:
                    # The ParentTree has no usable entry for this page:
                    # treat it like a missing ParentTree.
                    logger.warning(
                        "No ParentTree entry for page %s, parsing from the root",
                        page.page_number,
                    )
                    self._parse_struct_tree()
        else:
            self.page = None
            # Overhead of creating pages shouldn't be too bad we hope!
            self.pages = {page.page_number: page for page in doc.pages}
            # Maps a page's object id to its page number.
            self.page_dict = {
                page.page_obj.pageid: page.page_number for page in self.pages.values()
            }
            self._parse_struct_tree()

    def _make_attributes(
        self, obj: Dict[str, Any], revision: Optional[int]
    ) -> Dict[str, Any]:
        """Merge the attribute objects of a structure element into one dict.

        Attribute classes (`/C`) are read first and attribute objects
        (`/A`) second, so `/A` values overwrite `/C` values.  An entry
        followed by a revision number is kept only when that number
        equals the element's revision; entries with no revision number
        are always kept.  Name values are decoded to `str`.
        """
        # The element's /R entry defaults to 0 when absent, so an
        # attribute explicitly tagged with revision 0 is still current.
        current_revision = 0 if revision is None else revision
        attr_obj_list: List[Any] = []
        for key in "C", "A":
            if key not in obj:
                continue
            attr_obj = resolve1(obj[key])
            # It could be a list of attribute objects (why?)
            if isinstance(attr_obj, list):
                attr_obj_list.extend(attr_obj)
            else:
                attr_obj_list.append(attr_obj)
        attr_objs: List[Any] = []
        prev_obj = None
        for aref in attr_obj_list:
            # If we find a revision number, which might "follow the
            # revision object" (the spec is not clear about what this
            # should look like but it implies they are simply adjacent
            # in a flat array), then use it to decide whether to take
            # the previous object...
            if isinstance(aref, int):
                if aref == current_revision and prev_obj is not None:
                    attr_objs.append(prev_obj)
                prev_obj = None
            else:
                if prev_obj is not None:
                    attr_objs.append(prev_obj)
                prev_obj = resolve1(aref)
        if prev_obj is not None:
            attr_objs.append(prev_obj)
        # Now merge all the collected attribute objects into a single
        # set (again, the spec doesn't really explain this but does say
        # that attributes in /A supersede those in /C)
        attr: Dict[str, Any] = {}
        for attr_obj in attr_objs:
            if isinstance(attr_obj, PSLiteral):
                # A name refers to an attribute class in the ClassMap.
                class_name = decode_text(attr_obj.name)
                if class_name not in self.class_map:
                    logger.warning("Unknown attribute class %s", class_name)
                    continue
                # The ClassMap entry may be an indirect reference and may
                # hold either one attribute object or an array of them.
                class_entry = resolve1(self.class_map[class_name])
                if isinstance(class_entry, list):
                    sources = class_entry
                else:
                    sources = [class_entry]
            else:
                sources = [attr_obj]
            for source in sources:
                source = resolve1(source)
                if not isinstance(source, dict):
                    logger.warning("Ignoring malformed attribute object %r", source)
                    continue
                for k, v in source.items():
                    attr[k] = decode_text(v.name) if isinstance(v, PSLiteral) else v
        return attr

    def _make_element(self, obj: Any) -> Tuple[Optional[PDFStructElement], List[Any]]:
        """Build a `PDFStructElement` from a structure element dictionary.

        Returns the element together with its still-unresolved list of
        children (the `K` entry, normalised to a list).  The element's
        own `mcids` and `children` are left empty for a later pass.
        """
        # We hopefully caught these earlier
        assert "MCID" not in obj, "Uncaught MCR: %s" % obj
        assert "Obj" not in obj, "Uncaught OBJR: %s" % obj

        def text_entry(key: str) -> Optional[str]:
            """Decode an optional text entry, or None when absent."""
            if key not in obj:
                return None
            return decode_text(resolve1(obj[key]))

        # A page number is only available when a page lookup table exists
        page_number = None
        if self.page_dict is not None and "Pg" in obj:
            page_objid = obj["Pg"].objid
            assert page_objid in self.page_dict, "Object on unparsed page: %s" % obj
            page_number = self.page_dict[page_objid]

        # Element name, translated through the role map when listed there
        obj_tag = ""
        if "S" in obj:
            obj_tag = decode_text(obj["S"].name)
            if obj_tag in self.role_map:
                obj_tag = decode_text(self.role_map[obj_tag].name)

        if "K" in obj:
            kids = resolve1(obj["K"])
            if isinstance(kids, int):
                # A single MCID
                kids = [kids]
            elif isinstance(kids, dict):
                # A single object: keep the unresolved entry
                kids = [obj["K"]]
        else:
            kids = []

        revision = obj.get("R")
        attributes = self._make_attributes(obj, revision)
        element_id = text_entry("ID")
        title = text_entry("T")
        lang = text_entry("Lang")
        alt_text = text_entry("Alt")
        actual_text = text_entry("ActualText")

        element = PDFStructElement(
            type=obj_tag,
            revision=revision,
            id=element_id,
            lang=lang,
            alt_text=alt_text,
            actual_text=actual_text,
            title=title,
            page_number=page_number,
            attributes=attributes,
        )
        return element, kids

    def _parse_parent_tree(self, parent_array: List[Any]) -> None:
        """Populate the structure tree using the leaves of the parent tree for
        a given page.

        `parent_array` holds one parent reference per marked-content
        section of the page.  Each reference is followed upwards through
        its `P` entries until the structure tree root is reached, and
        the elements met on the way are then linked together from the
        root down.
        """
        # First walk backwards from the leaves to the root, tracking references
        d = deque(parent_array)
        s = {}
        found_root = False
        while d:
            ref = d.popleft()
            # In the case where an MCID is not associated with any
            # structure, there will be a "null" in the parent tree.
            # Depending on which parser read the array it shows up either
            # as the `null` keyword or as None.
            if ref is None or ref == PDFParser.KEYWORD_NULL:
                continue
            if repr(ref) in s:
                continue
            obj = resolve1(ref)
            # A dangling or malformed reference cannot be a structure
            # element; skip it like a null entry.
            if not isinstance(obj, dict):
                continue
            # This is required! It's in the spec!
            if "Type" in obj and decode_text(obj["Type"].name) == "StructTreeRoot":
                found_root = True
            else:
                # We hope that these are actual elements and not
                # references or marked-content sections...
                element, children = self._make_element(obj)
                # We have no page tree so we assume this page was parsed
                assert element is not None
                s[repr(ref)] = element, children
                d.append(obj["P"])
        # Only nulls (or nothing) in the parent array: no content on this
        # page belongs to the structure tree, which is not an error.
        if not s:
            self.children = []
            return
        # If we didn't reach the root something is quite wrong!
        assert found_root
        self._resolve_children(s)

    def on_parsed_page(self, obj: Dict[str, Any]) -> bool:
        """Tell whether `obj` belongs to a page this tree covers.

        Objects without a `Pg` entry are always accepted.  Otherwise the
        page's object id is checked against the page lookup table when
        there is one, else against the single page given at
        construction.
        """
        if "Pg" in obj:
            page_objid = obj["Pg"].objid
            if self.page_dict is not None:
                return page_objid in self.page_dict
            if self.page is not None:
                return page_objid == self.page.page_obj.pageid
        return True

    def _parse_struct_tree(self) -> None:
        """Populate the structure tree starting from the root, skipping
        unparsed pages and empty elements.

        The tree is first walked from the root to record an element for
        every structure element dictionary found, then elements that
        reach no marked content are pruned, and finally the remaining
        elements are linked together.
        """
        # The root's K entry is optional: with no children there is
        # simply nothing to parse.
        root_entry = self.root.get("K")
        root = resolve1(root_entry)
        # It could just be a single object ... it's in the spec (argh)
        if isinstance(root, dict):
            root = [root_entry]
        if not isinstance(root, list):
            self.children = []
            return
        d = deque(root)
        s = {}
        while d:
            ref = d.popleft()
            # In case the tree is actually a DAG and not a tree...
            if repr(ref) in s:  # pragma: nocover (shouldn't happen)
                continue
            obj = resolve1(ref)
            # Deref top-level OBJR skipping refs to unparsed pages
            if isinstance(obj, dict) and "Obj" in obj:
                if not self.on_parsed_page(obj):
                    continue
                ref = obj["Obj"]
                obj = resolve1(ref)
            # Nulls, dangling references, indirect integers and the like
            # are not structure elements.
            if not isinstance(obj, dict):
                continue
            element, children = self._make_element(obj)
            # Similar to above, delay resolving the children to avoid
            # tree-recursion.
            s[repr(ref)] = element, children
            for child in children:
                obj = resolve1(child)
                if isinstance(obj, dict):
                    if not self.on_parsed_page(obj):
                        continue
                    if "Obj" in obj:
                        child = obj["Obj"]
                    elif "MCID" in obj:
                        continue
                if isinstance(child, PDFObjRef):
                    d.append(child)

        # Traverse depth-first, removing empty elements (unsure how to
        # do this non-recursively)
        def prune(elements: List[Any]) -> List[Any]:
            next_elements = []
            for ref in elements:
                obj = resolve1(ref)
                # An MCID, given directly or through a reference (the
                # later linking pass also tests the resolved object).
                if isinstance(obj, int):
                    next_elements.append(ref)
                    continue
                elif isinstance(obj, dict):
                    if not self.on_parsed_page(obj):
                        continue
                    if "MCID" in obj:
                        next_elements.append(obj["MCID"])
                        continue
                    elif "Obj" in obj:
                        ref = obj["Obj"]
                key = repr(ref)
                # Never recorded (not a structure element) or already
                # pruned through another parent: nothing to keep.
                if key not in s:
                    continue
                element, children = s[key]
                children = prune(children)
                # See assertions below
                if element is None or not children:
                    del s[key]
                else:
                    s[key] = element, children
                    next_elements.append(ref)
            return next_elements

        prune(root)
        self._resolve_children(s)

    def _resolve_children(self, seen: Dict[str, Any]) -> None:
        """Link recorded elements into a tree, starting at the root.

        `seen` maps `repr()` of a reference to an `(element, children)`
        pair.  Each element receives its MCIDs and child elements, and
        `self.children` is set to the recorded top-level elements.
        """
        top_level = resolve1(self.root["K"])
        # A lone child is allowed by the spec; the unresolved entry is
        # kept, presumably so that its repr() matches the keys of `seen`.
        if isinstance(top_level, dict):
            top_level = [self.root["K"]]
        self.children = []

        # Top-level references that were actually recorded
        parsed_root = []
        for root_ref in top_level:
            root_obj = resolve1(root_ref)
            if isinstance(root_obj, dict) and "Obj" in root_obj:
                if not self.on_parsed_page(root_obj):
                    continue
                root_ref = root_obj["Obj"]
            if repr(root_ref) in seen:
                parsed_root.append(root_ref)

        queue = deque(parsed_root)
        while queue:
            parent, kids = seen[repr(queue.popleft())]
            assert parent is not None, "Unparsed element"
            for kid in kids:
                kid_obj = resolve1(kid)
                if isinstance(kid_obj, int):
                    parent.mcids.append(kid_obj)
                elif isinstance(kid_obj, dict):
                    # Skip out-of-page MCIDS and OBJRs
                    if not self.on_parsed_page(kid_obj):
                        continue
                    if "MCID" in kid_obj:
                        parent.mcids.append(kid_obj["MCID"])
                    elif "Obj" in kid_obj:
                        kid = kid_obj["Obj"]
                # Deliberately not part of the chain above: an OBJR has
                # just replaced `kid` with the reference it points to.
                if not isinstance(kid, PDFObjRef):
                    continue
                kid_element = seen.get(repr(kid), (None, None))[0]
                if kid_element is None:
                    continue
                parent.children.append(kid_element)
                queue.append(kid)
        self.children = [seen[repr(ref)][0] for ref in parsed_root]

    def __iter__(self) -> Iterator[PDFStructElement]:
        """Iterate over the top-level elements of the tree."""
        return iter(self.children)

    def element_bbox(self, el: PDFStructElement) -> T_bbox:
        """Get the bounding box for an element for visual debugging.

        When a page can be determined and the element has a `BBox`
        attribute, that box is converted and returned (cropped for a
        `CroppedPage`).  Otherwise the box is computed from the page
        objects whose `mcid` is one of the element's MCIDs.

        Raises `IndexError` if the element is no longer on a cropped
        page or if no matching objects are found.
        """
        page = None
        if self.page is not None:
            page = self.page
        elif el.page_number is not None:
            page = self.pages[el.page_number]
        bbox = el.attributes.get("BBox", None)
        if page is not None and bbox is not None:
            from .page import CroppedPage, _invert_box, _normalize_box

            # Use secret knowledge of CroppedPage (cannot use
            # page.height because it is the *cropped* dimension, but
            # cropping does not actually translate coordinates)
            bbox = _invert_box(
                _normalize_box(bbox), page.mediabox[3] - page.mediabox[1]
            )
            # Use more secret knowledge of CroppedPage
            if isinstance(page, CroppedPage):
                rect = geometry.bbox_to_rect(bbox)
                rects = page._crop_fn([rect])
                if not rects:
                    raise IndexError("Element no longer on page")
                return geometry.obj_to_bbox(rects[0])
            # Not sure why mypy complains here
            return bbox  # type: ignore

        # Index each page's objects by MCID once, instead of rescanning
        # every object on the page for every MCID of the element.
        objects_by_page: Dict[Optional[int], Dict[Any, List[T_obj]]] = {}
        mcid_objs: List[T_obj] = []
        for page_number, mcid in el.all_mcids():
            if page_number not in objects_by_page:
                # MCIDs without a page number belong to the fallback
                # page, if there is one.
                source = page if page_number is None else self.pages[page_number]
                by_mcid: Dict[Any, List[T_obj]] = {}
                if source is not None:
                    for c in itertools.chain.from_iterable(source.objects.values()):
                        by_mcid.setdefault(c.get("mcid"), []).append(c)
                objects_by_page[page_number] = by_mcid
            mcid_objs.extend(objects_by_page[page_number].get(mcid, ()))
        if not mcid_objs:
            raise IndexError("No objects found")  # pragma: nocover
        return geometry.objects_to_bbox(mcid_objs)