# SPDX-FileCopyrightText: 2026 geisserml # SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause __all__ = ("PdfPage", "PdfColorScheme") import math import ctypes import logging import weakref import pypdfium2.raw as pdfium_c import pypdfium2.internal as pdfium_i from pypdfium2._helpers.misc import PdfiumError from pypdfium2._helpers.bitmap import PdfBitmap from pypdfium2._helpers.textpage import PdfTextPage from pypdfium2._helpers.pageobjects import PdfObject from pypdfium2.version import PDFIUM_INFO c_float = ctypes.c_float logger = logging.getLogger(__name__) class PdfPage (pdfium_i.AutoCloseable): """ Page helper class. Attributes: raw (FPDF_PAGE): The underlying PDFium page handle. pdf (PdfDocument): Reference to the document this page belongs to. formenv (PdfFormEnv | None): Formenv handle, if the parent pdf had an active formenv at the time of page retrieval. None otherwise. """ def __init__(self, raw, pdf, formenv): self.raw = raw self.pdf = pdf self.formenv = formenv super().__init__(PdfPage._close_impl, self.formenv) @staticmethod def _close_impl(raw, formenv): if formenv: pdfium_c.FORM_OnBeforeClosePage(raw, formenv) pdfium_c.FPDF_ClosePage(raw) @property def parent(self): # AutoCloseable hook # this needs to point at the nearest dependency, because that's the one that holds a weakref to this object return self.pdf if self.formenv is None else self.formenv def get_width(self): """ Returns: float: Page width (horizontal size), in PDF canvas units. """ return pdfium_c.FPDF_GetPageWidthF(self) def get_height(self): """ Returns: float: Page height (vertical size), in PDF canvas units. """ return pdfium_c.FPDF_GetPageHeightF(self) def get_size(self): """ Returns: (float, float): Page width and height, in PDF canvas units. """ return (self.get_width(), self.get_height()) # {get,set}_rotation() deliberately fail with dict access error in case of invalid values def get_rotation(self): """ Returns: int: Clockwise page rotation in degrees. """ raw_rotation = pdfium_c.FPDFPage_GetRotation(self) if raw_rotation == -1: raise PdfiumError("Failed to get page rotation.") return pdfium_i.RotationToDegrees[raw_rotation] def set_rotation(self, rotation): """ Define the absolute, clockwise page rotation (0, 90, 180, or 270 degrees). """ pdfium_c.FPDFPage_SetRotation(self, pdfium_i.RotationToConst[rotation]) def _get_box(self, box_func, fallback_func, fallback_ok): left, bottom, right, top = c_float(), c_float(), c_float(), c_float() ok = box_func(self, left, bottom, right, top) if not ok: return (fallback_func() if fallback_ok else None) return (left.value, bottom.value, right.value, top.value) # NOTE in case further arguments are needed (besides fallback_ok), then use *args, **kwargs in callers def get_mediabox(self, fallback_ok=True): """ Returns: (float, float, float, float) | None: The page MediaBox in PDF canvas units, consisting of four coordinates (usually x0, y0, x1, y1). If MediaBox is not defined, returns ANSI A (0, 0, 612, 792) if ``fallback_ok=True``, None otherwise. .. admonition:: Known issue\n Due to quirks in PDFium, all ``get_*box()`` functions except :meth:`.get_bbox` do not inherit from parent nodes in the page tree (as of PDFium 5418). """ # https://crbug.com/pdfium/1786 return self._get_box(pdfium_c.FPDFPage_GetMediaBox, lambda: (0, 0, 612, 792), fallback_ok) def set_mediabox(self, l, b, r, t): """ Set the page's MediaBox by passing four :class:`float` coordinates (usually x0, y0, x1, y1). """ pdfium_c.FPDFPage_SetMediaBox(self, l, b, r, t) def get_cropbox(self, fallback_ok=True): """ Returns: The page's CropBox (If not defined, falls back to MediaBox). """ return self._get_box(pdfium_c.FPDFPage_GetCropBox, self.get_mediabox, fallback_ok) def set_cropbox(self, l, b, r, t): """ Set the page's CropBox. """ pdfium_c.FPDFPage_SetCropBox(self, l, b, r, t) def get_bleedbox(self, fallback_ok=True): """ Returns: The page's BleedBox (If not defined, falls back to CropBox). """ return self._get_box(pdfium_c.FPDFPage_GetBleedBox, self.get_cropbox, fallback_ok) def set_bleedbox(self, l, b, r, t): """ Set the page's BleedBox. """ pdfium_c.FPDFPage_SetBleedBox(self, l, b, r, t) def get_trimbox(self, fallback_ok=True): """ Returns: The page's TrimBox (If not defined, falls back to CropBox). """ return self._get_box(pdfium_c.FPDFPage_GetTrimBox, self.get_cropbox, fallback_ok) def set_trimbox(self, l, b, r, t): """ Set the page's TrimBox. """ pdfium_c.FPDFPage_SetTrimBox(self, l, b, r, t) def get_artbox(self, fallback_ok=True): """ Returns: The page's ArtBox (If not defined, falls back to CropBox). """ return self._get_box(pdfium_c.FPDFPage_GetArtBox, self.get_cropbox, fallback_ok) def set_artbox(self, l, b, r, t): """ Set the page's ArtBox. """ pdfium_c.FPDFPage_SetArtBox(self, l, b, r, t) def get_bbox(self): """ Returns: The bounding box of the page (the intersection between its media box and crop box). """ rect = pdfium_c.FS_RECTF() ok = pdfium_c.FPDF_GetPageBoundingBox(self, rect) if not ok: raise PdfiumError("Failed to get page bounding box.") return (rect.left, rect.bottom, rect.right, rect.top) # TODO add bindings to FPDFPage_TransFormWithClip() def get_textpage(self): """ Returns: PdfTextPage: A new text page handle for this page. """ raw_textpage = pdfium_c.FPDFText_LoadPage(self) if not raw_textpage: raise PdfiumError("Failed to load text page.") textpage = PdfTextPage(raw_textpage, self) self._add_kid(textpage) return textpage def insert_obj(self, pageobj): """ Insert a pageobject into the page. The pageobject must not belong to a page yet. If it belongs to a PDF, the target page must be part of that PDF. Position and form are defined by the object's matrix. If it is the identity matrix, the object will appear as-is on the bottom left corner of the page. Parameters: pageobj (PdfObject): The pageobject to insert. """ if pageobj.page: raise ValueError("The pageobject you attempted to insert already belongs to a page.") if pageobj.pdf and (pageobj.pdf is not self.pdf): raise ValueError("The pageobject you attempted to insert belongs to a different PDF.") ok = pdfium_c.FPDFPage_InsertObject(self, pageobj) if not ok and PDFIUM_INFO.build >= 7809: raise PdfiumError("Failed to insert object.") pageobj._detach_finalizer() pageobj.page = self pageobj.pdf = self.pdf def remove_obj(self, pageobj): """ Remove a pageobject from the page. As of PDFium 5692, detached pageobjects may be only re-inserted into existing pages of the same document. If the pageobject is not re-inserted into a page, its ``close()`` method may be called. Note: If the object's :attr:`~.PdfObject.type` is :data:`FPDF_PAGEOBJ_TEXT`, any :class:`.PdfTextPage` handles to the page should be closed before removing the object. Parameters: pageobj (PdfObject): The pageobject to remove. """ # note https://pdfium-review.googlesource.com/c/pdfium/+/118914 if pageobj.page is not self: raise ValueError("The pageobject you attempted to remove is not part of this page.") if pageobj.level > 0: assert pageobj.container is not None ok = pdfium_c.FPDFFormObj_RemoveObject(pageobj.container, pageobj) pageobj.level, pageobj.container = 0, None else: assert pageobj.container is None ok = pdfium_c.FPDFPage_RemoveObject(self, pageobj) if not ok: raise PdfiumError("Failed to remove pageobject.") pageobj.page = None pageobj._attach_finalizer() def gen_content(self): """ Generate page content to apply additions, removals or modifications of pageobjects. If page content was changed, this function should be called once before saving the document or re-loading the page. """ ok = pdfium_c.FPDFPage_GenerateContent(self) if not ok: raise PdfiumError("Failed to generate page content.") def get_objects(self, filter=None, max_depth=15, form=None, level=0, textpage=None): """ Iterate through the pageobjects on this page. Parameters: filter (list[int] | None): An optional list of pageobject types to filter (:attr:`FPDF_PAGEOBJ_*`). Any objects whose type is not contained will be skipped. If None or empty, all objects will be provided, regardless of their type. max_depth (int): Maximum recursion depth to consider when descending into Form XObjects. textpage (PdfTextPage | None): Text page to pass through to any :class:`.PdfTextObj` instances. Yields: :class:`.PdfObject`: A pageobject. """ if form: count_objects = pdfium_c.FPDFFormObj_CountObjects get_object = pdfium_c.FPDFFormObj_GetObject parent = form else: count_objects = pdfium_c.FPDFPage_CountObjects get_object = pdfium_c.FPDFPage_GetObject parent = self if textpage and textpage.page is not self: raise ValueError("The given textpage does not belong to this page.") n_objects = count_objects(parent) if n_objects < 0: raise PdfiumError("Failed to get number of pageobjects.") for i in range(n_objects): raw_obj = get_object(parent, i) if not raw_obj: raise PdfiumError("Failed to get pageobject.") # Don't register as child object, because the lifetime of pageobjects that are part of a page is managed by pdfium. The parent page should remain alive while a pageobject is used, but it seems unjustified to store countless of weakrefs just to lock pageobjects when the parent page is closed. helper_obj = PdfObject(raw_obj, page=self, pdf=self.pdf, container=form, level=level, textpage=textpage) # tracked=False if not filter or helper_obj.type in filter: yield helper_obj if helper_obj.type == pdfium_c.FPDF_PAGEOBJ_FORM and level < max_depth-1: yield from self.get_objects( filter = filter, max_depth = max_depth, form = helper_obj, level = level + 1, textpage = textpage, ) def flatten(self, flag=pdfium_c.FLAT_NORMALDISPLAY): """ Flatten form fields and annotations into page contents. Attention: * :meth:`~.PdfDocument.init_forms` must have been called on the parent pdf, before the page was retrieved, for this method to work. In other words, :attr:`.PdfPage.formenv` must be non-null. * Flattening may invalidate existing handles to the page, so you'll want to re-initialize these afterwards. Parameters: flag (int): PDFium flattening target (:attr:`FLAT_*`) Returns: int: PDFium flattening status (:attr:`FLATTEN_*`). :attr:`FLATTEN_FAIL` is handled internally. """ if not self.formenv: raise RuntimeError("page.flatten() requires prior pdf.init_forms(), before page retrieval.") rc = pdfium_c.FPDFPage_Flatten(self, flag) if rc == pdfium_c.FLATTEN_FAIL: raise PdfiumError("Failed to flatten annotations / form fields.") return rc # TODO # - add helpers for matrix-based and interruptible rendering # - add lower-level renderer that takes a caller-provided bitmap # e.g. render(), render_ex(), render_matrix(), render_matrix_ex() def render( self, scale = 1, rotation = 0, crop = (0, 0, 0, 0), may_draw_forms = True, bitmap_maker = PdfBitmap.new_native, color_scheme = None, fill_to_stroke = False, **kwargs ): """ Rasterize the page to a :class:`.PdfBitmap`. Parameters: scale (float): A factor scaling the number of pixels per PDF canvas unit. This defines the resolution of the image. To convert a DPI value to a scale factor, multiply it by the size of 1 canvas unit in inches (usually 1/72in). [#user_unit]_ rotation (int): Additional rotation in degrees (0, 90, 180, or 270). crop (tuple[float, float, float, float]): Amount in PDF canvas units to cut off from page borders (left, bottom, right, top). Crop is applied after rotation. may_draw_forms (bool): If True, render form fields (provided the document has forms and :meth:`~.PdfDocument.init_forms` was called). bitmap_maker (typing.Callable): Callback function used to create the :class:`.PdfBitmap`. fill_color (tuple[int, int, int, int]): Color the bitmap will be filled with before rendering. This uses RGBA syntax regardless of the pixel format used, with values from 0 to 255. If the fill color is not opaque (i.e. has transparency), ``{BGR,RGB}A`` will be used. grayscale (bool): If True, render in grayscale mode. optimize_mode (None | str): Page rendering optimization mode (None, "lcd", "print"). draw_annots (bool): If True, render page annotations. no_smoothtext (bool): If True, disable text anti-aliasing. Overrides ``optimize_mode="lcd"``. no_smoothimage (bool): If True, disable image anti-aliasing. no_smoothpath (bool): If True, disable path anti-aliasing. force_halftone (bool): If True, always use halftone for image stretching. limit_image_cache (bool): If True, limit image cache size. rev_byteorder (bool): If True, render with reverse byte order, leading to ``RGB{A/x}`` output rather than ``BGR{A/x}``. Other pixel formats are not affected. prefer_bgrx (bool): If True, use 4-byte ``{BGR/RGB}x`` rather than 3-byte ``{BGR/RGB}`` (i.e. add an unused byte). Other pixel formats are not affected. maybe_alpha (bool): If True, use a pixel format with alpha channel (i.e. ``{BGR/RGB}A``) if page content has transparency. This is recommended for performance in these cases, but as page-dependent format selection can be unexpected, it is not enabled by default. force_bitmap_format (int | None): If given, override automatic pixel format selection and enforce use of the given format (one of the :attr:`FPDFBitmap_*` constants). In this case, you should not pass any other format selection options, except potentially *rev_byteorder*. extra_flags (int): Additional PDFium rendering flags. May be combined with bitwise OR (``|`` operator). color_scheme (PdfColorScheme | None): A custom pdfium color scheme. Note that this may flatten different colors into one, so the usability of this is limited. fill_to_stroke (bool): If a *color_scheme* is given, whether to only draw borders around fill areas using the `path_stroke` color, instead of filling with the `path_fill` color. Returns: PdfBitmap: Bitmap of the rendered page. .. admonition:: Format selection This is the format selection hierarchy used by :meth:`.render`, from lowest to highest priority: * default: ``BGR`` * ``prefer_bgrx=True``: ``BGRx`` * ``grayscale=True``: ``L`` * ``maybe_alpha=True``: ``BGRA`` if the page has transparency, else the format selected otherwise * ``fill_color[3] < 255``: ``BGRA`` (background color with transparency) * ``force_bitmap_format=...`` -> any supported by pdfium Additionally, ``rev_byteorder=True`` will swap ``BGR{A/x}`` to ``RGB{A/x}`` if applicable. .. [#user_unit] Since PDF 1.6, pages may define an additional user unit factor. In this case, 1 canvas unit is equivalent to ``user_unit * (1/72)`` inches. PDFium does not currently provide an API to get the user unit, so this is not taken into account. """ src_width = math.ceil(self.get_width() * scale) src_height = math.ceil(self.get_height() * scale) if rotation in (90, 270): src_width, src_height = src_height, src_width crop = [math.ceil(c*scale) for c in crop] width = src_width - crop[0] - crop[2] height = src_height - crop[1] - crop[3] if any(d < 1 for d in (width, height)): raise ValueError("Crop exceeds page dimensions") cl_format, rev_byteorder, fill_color, flags = _parse_renderopts(self, **kwargs) if (color_scheme is not None) and fill_to_stroke: flags |= pdfium_c.FPDF_CONVERT_FILL_TO_STROKE bitmap = bitmap_maker(width, height, format=cl_format, rev_byteorder=rev_byteorder) bitmap.fill_rect(fill_color, 0, 0, width, height) pos_args = (-crop[0], -crop[3], src_width, src_height, pdfium_i.RotationToConst[rotation]) render_args = (bitmap, self, *pos_args, flags) if color_scheme is None: pdfium_c.FPDF_RenderPageBitmap(*render_args) else: pause = pdfium_c.IFSDK_PAUSE(version=1) pdfium_i.set_callback(pause, "NeedToPauseNow", lambda _: False) fpdf_cs = color_scheme.convert(rev_byteorder) status = pdfium_c.FPDF_RenderPageBitmapWithColorScheme_Start(*render_args, fpdf_cs, pause) assert status == pdfium_c.FPDF_RENDER_DONE pdfium_c.FPDF_RenderPage_Close(self) if may_draw_forms and self.formenv: pdfium_c.FPDF_FFLDraw(self.formenv, *render_args) bitmap._render_args = (weakref.ref(self), pos_args) return bitmap def _auto_bitmap_format(page, fill_color, grayscale, prefer_bgrx, maybe_alpha): # regarding maybe_alpha, see # https://chromium.googlesource.com/chromium/src/+/21e456b92bfadc625c947c718a6c4c5bf0c4c61b if fill_color[3] < 255 or (maybe_alpha and pdfium_c.FPDFPage_HasTransparency(page)): return pdfium_c.FPDFBitmap_BGRA elif grayscale: return pdfium_c.FPDFBitmap_Gray elif prefer_bgrx: return pdfium_c.FPDFBitmap_BGRx else: return pdfium_c.FPDFBitmap_BGR def _parse_renderopts( page, fill_color = (255, 255, 255, 255), grayscale = False, optimize_mode = None, draw_annots = True, no_smoothtext = False, no_smoothimage = False, no_smoothpath = False, force_halftone = False, limit_image_cache = False, rev_byteorder = False, prefer_bgrx = False, maybe_alpha = False, force_bitmap_format = None, extra_flags = 0, ): if force_bitmap_format is None: cl_format = _auto_bitmap_format(page, fill_color, grayscale, prefer_bgrx, maybe_alpha) else: cl_format = force_bitmap_format if cl_format == pdfium_c.FPDFBitmap_Gray: rev_byteorder = False flags = extra_flags if grayscale: flags |= pdfium_c.FPDF_GRAYSCALE if draw_annots: flags |= pdfium_c.FPDF_ANNOT if no_smoothtext: flags |= pdfium_c.FPDF_RENDER_NO_SMOOTHTEXT if no_smoothimage: flags |= pdfium_c.FPDF_RENDER_NO_SMOOTHIMAGE if no_smoothpath: flags |= pdfium_c.FPDF_RENDER_NO_SMOOTHPATH if force_halftone: flags |= pdfium_c.FPDF_RENDER_FORCEHALFTONE if limit_image_cache: flags |= pdfium_c.FPDF_RENDER_LIMITEDIMAGECACHE if rev_byteorder: flags |= pdfium_c.FPDF_REVERSE_BYTE_ORDER if optimize_mode: optimize_mode = optimize_mode.lower() if optimize_mode == "lcd": flags |= pdfium_c.FPDF_LCD_TEXT elif optimize_mode == "print": flags |= pdfium_c.FPDF_PRINTING else: raise ValueError(f"Invalid optimize_mode {optimize_mode}") # TODO consider using a namedtuple or something return cl_format, rev_byteorder, fill_color, flags class PdfColorScheme: """ Rendering color scheme. Each color shall be provided as a list of values for red, green, blue and alpha, ranging from 0 to 255. """ def __init__(self, path_fill, path_stroke, text_fill, text_stroke): self.colors = dict( path_fill_color=path_fill, path_stroke_color=path_stroke, text_fill_color=text_fill, text_stroke_color=text_stroke, ) def __repr__(self): return f"{type(self).__name__}(**{self.colors})" def convert(self, rev_byteorder): """ Returns: The color scheme as :class:`FPDF_COLORSCHEME` object. """ fpdf_cs = pdfium_c.FPDF_COLORSCHEME() for key, value in self.colors.items(): setattr(fpdf_cs, key, pdfium_i.color_tohex(value, rev_byteorder)) return fpdf_cs