eptm_dashboard/.venv/lib/python3.12/site-packages/pypdfium2_cli/pageobjects.py

124 lines
3.9 KiB
Python

# SPDX-FileCopyrightText: 2026 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
# TODO test-confirm filter and info params
from collections import OrderedDict
import pypdfium2._helpers as pdfium
import pypdfium2.internal as pdfium_i
from pypdfium2_cli._parsers import (
add_input,
add_n_digits,
get_input,
round_list,
iterator_hasvalue,
)
# Names of the object detail categories that can be requested through --info.
PARAM_POS = "pos"  # bounding box, plus quad points for image and text objects
PARAM_IMGINFO = "imginfo"  # filters and metadata of image objects
PARAM_TEXT = "text"  # extracted content of text objects
# Every supported category; serves as both the choices and the default of --info.
INFO_PARAMS = (PARAM_POS, PARAM_IMGINFO, PARAM_TEXT)
def attach(parser):
    """Register this module's command-line options on *parser*.

    Calls ``add_input(parser, pages=True)`` and ``add_n_digits(parser)``
    first, then adds three options of its own:

    * ``--filter``: one or more object type names to include.
    * ``--max-depth``: integer recursion limit for Form XObjects (default 2).
    * ``--info``: one or more of ``INFO_PARAMS``, lower-cased on input
      (default: all of them).
    """
    add_input(parser, pages=True)
    add_n_digits(parser)

    # TODO think out strategy for choices (see https://github.com/python/cpython/issues/69247)
    type_names = list(pdfium_i.ObjectTypeToConst.keys())

    filter_opts = {
        "nargs": "+",
        "metavar": "T",
        "choices": type_names,
        "help": f"Object types to include. Choices: {type_names}",
    }
    parser.add_argument("--filter", **filter_opts)

    depth_opts = {
        "type": int,
        "default": 2,
        "help": "Maximum recursion depth to consider when descending into Form XObjects.",
    }
    parser.add_argument("--max-depth", **depth_opts)

    info_opts = {
        "nargs": "+",
        "type": str.lower,
        "choices": INFO_PARAMS,
        "default": INFO_PARAMS,
        "help": "Object details to show.",
    }
    parser.add_argument("--info", **info_opts)
def print_img_metadata(m, n_digits, pad=""):
    """Print the fields of image metadata *m*, one ``key: value`` line each.

    Parameters:
        m: Object exposing ``width``, ``height``, ``horizontal_dpi``,
            ``vertical_dpi``, ``bits_per_pixel``, ``colorspace`` and
            ``marked_content_id``.
        n_digits: Number of digits the two DPI values are rounded to.
        pad: String prepended to every printed line.
    """
    rows = [
        ("width", m.width),
        ("height", m.height),
        ("horizontal_dpi", round(m.horizontal_dpi, n_digits)),
        ("vertical_dpi", round(m.vertical_dpi, n_digits)),
        ("bits_per_pixel", m.bits_per_pixel),
        ("colorspace", pdfium_i.ColorspaceToStr.get(m.colorspace)),
    ]
    # The marked content ID is only listed when it differs from -1.
    if m.marked_content_id != -1:
        rows.append(("marked_content_id", m.marked_content_id))

    for name, val in rows:
        print(pad + f"{name}: {val}")
def main(args):
    """List the page objects of the input document, page by page.

    For every page index in ``args.pages`` that yields at least one object,
    a ``# Page N`` header is printed, followed by one entry per object
    (indented according to ``obj.level``) and a per-page count. Depending
    on ``args.info``, each entry may include position data, image details,
    or the text of text objects. A total count is printed at the end if any
    object was found.

    Note that ``args.filter`` is rewritten in place: the given type names
    are replaced by their ``ObjectTypeToConst`` values.
    """
    document = get_input(args)

    # if no filter is given, leave it at None (make a difference in case of unhandled object types)
    if args.filter:
        args.filter = [pdfium_i.ObjectTypeToConst[name] for name in args.filter]

    requested = args.info
    want_pos = PARAM_POS in requested
    want_imginfo = PARAM_IMGINFO in requested
    want_text = PARAM_TEXT in requested
    # Sanity check: at least one detail category must be active.
    assert want_pos or want_imginfo or want_text

    grand_total = 0

    for page_index in args.pages:
        page = document[page_index]

        # A textpage is only created when text output was requested.
        textpage = None
        if want_text:
            textpage = page.get_textpage()

        found, objects = iterator_hasvalue(
            page.get_objects(args.filter, max_depth=args.max_depth, textpage=textpage)
        )
        if not found:
            # Pages without any matching object produce no output at all.
            continue

        print(f"# Page {page_index + 1}")
        page_total = 0

        for obj in objects:
            # NOTE(review): the indent unit is a single space as written here;
            # confirm it was not collapsed from a wider indent.
            indent = " " * obj.level
            sub_indent = indent + " "
            print(indent + pdfium_i.ObjectTypeToStr.get(obj.type))

            if want_pos:
                bbox = round_list(obj.get_bounds(), args.n_digits)
                print(sub_indent + f"Bounding Box: {bbox}")
                # Quad points are only queried for image and text objects.
                if isinstance(obj, (pdfium.PdfImage, pdfium.PdfTextObj)):
                    quads = [round_list(pt, args.n_digits) for pt in obj.get_quad_points()]
                    print(sub_indent + f"Quad Points: {quads}")

            if want_imginfo and isinstance(obj, pdfium.PdfImage):
                print(sub_indent + f"Filters: {obj.get_filters()}")
                img_meta = obj.get_metadata()
                # Sanity check: metadata dimensions must match the reported pixel size.
                assert (img_meta.width, img_meta.height) == obj.get_px_size()
                print_img_metadata(img_meta, args.n_digits, pad=sub_indent)
            elif want_text and isinstance(obj, pdfium.PdfTextObj):
                print(sub_indent + repr(obj.extract()))

            page_total += 1

        if page_total > 0:
            print(f"-> Count: {page_total}\n")
            grand_total += page_total

    if grand_total > 0:
        print(f"-> Total count: {grand_total}")