eptm_dashboard/.venv/lib/python3.12/site-packages/pypdfium2_cli/pageobjects.py

# SPDX-FileCopyrightText: 2026 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause

# TODO test-confirm filter and info params

from collections import OrderedDict
import pypdfium2._helpers as pdfium
import pypdfium2.internal as pdfium_i
from pypdfium2_cli._parsers import (
    add_input,
    add_n_digits,
    get_input,
    round_list,
    iterator_hasvalue,
)


PARAM_POS = "pos"
PARAM_IMGINFO = "imginfo"
PARAM_TEXT = "text"
INFO_PARAMS = (PARAM_POS, PARAM_IMGINFO, PARAM_TEXT)


def attach(parser):

    add_input(parser, pages=True)
    add_n_digits(parser)

    # TODO think out strategy for choices (see https://github.com/python/cpython/issues/69247)
    obj_types = list( pdfium_i.ObjectTypeToConst.keys() )
    parser.add_argument(
        "--filter",
        nargs = "+",
        metavar = "T",
        choices = obj_types,
        help = f"Object types to include. Choices: {obj_types}",
    )
    parser.add_argument(
        "--max-depth",
        type = int,
        default = 2,
        help = "Maximum recursion depth to consider when descending into Form XObjects.",
    )
    parser.add_argument(
        "--info",
        nargs = "+",
        type = str.lower,
        choices = INFO_PARAMS,
        default = INFO_PARAMS,
        help = "Object details to show.",
    )


def print_img_metadata(m, n_digits, pad=""):

    members = OrderedDict(
        width = m.width,
        height = m.height,
        horizontal_dpi = round(m.horizontal_dpi, n_digits),
        vertical_dpi = round(m.vertical_dpi, n_digits),
        bits_per_pixel = m.bits_per_pixel,
        colorspace = pdfium_i.ColorspaceToStr.get(m.colorspace),
    )
    if m.marked_content_id != -1:
        members["marked_content_id"] = m.marked_content_id

    for key, value in members.items():
        print(pad + f"{key}: {value}")


def main(args):

    pdf = get_input(args)

    # if no filter is given, leave it at None (make a difference in case of unhandled object types)
    if args.filter:
        args.filter = [pdfium_i.ObjectTypeToConst[t] for t in args.filter]

    show_pos = PARAM_POS in args.info
    show_imginfo = PARAM_IMGINFO in args.info
    show_text = PARAM_TEXT in args.info
    assert any((show_pos, show_imginfo, show_text))

    total_count = 0
    for i in args.pages:

        page = pdf[i]
        textpage = page.get_textpage() if show_text else None
        hasvalue, obj_searcher = iterator_hasvalue( page.get_objects(args.filter, max_depth=args.max_depth, textpage=textpage) )
        if not hasvalue: continue

        print(f"# Page {i+1}")
        count = 0

        for obj in obj_searcher:

            pad_0 = "    " * obj.level
            pad_1 = pad_0 + "    "
            print(pad_0 + pdfium_i.ObjectTypeToStr.get(obj.type))

            if show_pos:
                bounds = round_list(obj.get_bounds(), args.n_digits)
                print(pad_1 + f"Bounding Box: {bounds}")
                if isinstance(obj, (pdfium.PdfImage, pdfium.PdfTextObj)):
                    quad_bounds = obj.get_quad_points()
                    print(pad_1 + f"Quad Points: {[round_list(p, args.n_digits) for p in quad_bounds]}")

            if show_imginfo and isinstance(obj, pdfium.PdfImage):
                print(pad_1 + f"Filters: {obj.get_filters()}")
                metadata = obj.get_metadata()
                assert (metadata.width, metadata.height) == obj.get_px_size()
                print_img_metadata(metadata, args.n_digits, pad=pad_1)

            elif show_text and isinstance(obj, pdfium.PdfTextObj):
                print(pad_1 + repr(obj.extract()))

            count += 1

        if count > 0:
            print(f"-> Count: {count}\n")
            total_count += count

    if total_count > 0:
        print(f"-> Total count: {total_count}")