658 lines
26 KiB
Python
658 lines
26 KiB
Python
# SPDX-FileCopyrightText: 2026 geisserml <geisserml@gmail.com>
|
|
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
|
|
|
|
__all__ = ("PdfObject", "PdfImage", "PdfTextObj", "PdfFont")
|
|
|
|
import ctypes
|
|
from ctypes import c_uint, c_float
|
|
import logging
|
|
from pathlib import Path
|
|
from codecs import decode
|
|
from collections import namedtuple
|
|
import pypdfium2.raw as pdfium_c
|
|
import pypdfium2.internal as pdfium_i
|
|
from pypdfium2.internal import FPDF_WCHAR_size
|
|
from pypdfium2._helpers.misc import PdfiumError
|
|
from pypdfium2._helpers.matrix import PdfMatrix
|
|
from pypdfium2._helpers.bitmap import PdfBitmap
|
|
from pypdfium2._lazy import Lazy, cached_property
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PdfObject (pdfium_i.AutoCloseable):
    """
    Pageobject helper class.

    Constructing a :class:`.PdfObject` dispatches on the object's :attr:`.type`, so an instance of a more specific subclass may be returned instead (e.g. :class:`.PdfImage`, :class:`.PdfTextObj`).

    Note:
        :meth:`.PdfObject.close` only takes effect on loose pageobjects.
        It is a no-op otherwise, because pageobjects that are part of a page are owned by pdfium, not the caller.

    Attributes:
        raw (FPDF_PAGEOBJECT):
            The underlying PDFium pageobject handle.
        type (int):
            The object's type (:data:`FPDF_PAGEOBJ_*`).
        page (PdfPage):
            Reference to the page this pageobject belongs to. May be None if not part of a page (e.g. new or detached object).
        pdf (PdfDocument):
            Reference to the document this pageobject belongs to. May be None if the object does not belong to a document yet.
            This attribute is always set if :attr:`.page` is set.
        container (PdfObject | None):
            PdfObject handle to parent Form XObject, if the pageobject is nested in a Form XObject, None otherwise.
        level (int):
            Nesting level signifying the number of parent Form XObjects, at the time of construction.
            Zero if the object is not nested in a Form XObject.
    """

    def __new__(cls, raw, *args, **kwargs):
        # Pick the most specific helper class for the raw handle's type.
        obj_type = pdfium_c.FPDFPageObj_GetType(raw)
        if obj_type == pdfium_c.FPDF_PAGEOBJ_IMAGE:
            target_cls = PdfImage
        elif obj_type == pdfium_c.FPDF_PAGEOBJ_TEXT:
            target_cls = PdfTextObj
        else:
            target_cls = PdfObject

        instance = super().__new__(target_cls)
        instance.type = obj_type
        return instance


    # textpage is only picked up by the PdfTextObj subclass, but included here so get_object() can just unconditionally pass the textpage without need for type if-checks
    def __init__(self, raw, page=None, pdf=None, container=None, level=0, textpage=None, tracked=False):

        self.raw, self.page, self.pdf = raw, page, pdf
        self.container, self.level = container, level

        # Derive the document from the page if it was not given; otherwise the two must agree.
        if page is not None:
            if self.pdf is None:
                self.pdf = page.pdf
            elif self.pdf is not page.pdf:
                raise ValueError("*page* must belong to *pdf* when constructing a pageobject.")

        # TODO if page is not None, hold it in the finalizer, unless the pageobject is detached from the page
        # Only loose pageobjects (no page) are destroyed on close.
        is_loose = page is None
        super().__init__(pdfium_c.FPDFPageObj_Destroy, needs_free=is_loose, tracked=tracked)


    @property
    def parent(self):  # AutoCloseable hook
        # Not actually used by the autoclose machinery. PdfObjects are not tracked, and if they are part of a page we don't have ownership anyway.
        if self.page is None:
            return self.pdf  # May be None (loose pageobject)
        return self.page


    def get_bounds(self):
        """
        Get the bounds of the object on the page.

        Returns:
            tuple[float * 4]: Left, bottom, right and top, in PDF page coordinates.

        Raises:
            RuntimeError: If the pageobject is not part of a page.
            PdfiumError: If pdfium fails to determine the bounds.
        """
        if self.page is None:
            raise RuntimeError("Must not call get_bounds() on a loose pageobject.")

        # Output slots, in the order left, bottom, right, top.
        edges = [c_float() for _ in range(4)]
        if not pdfium_c.FPDFPageObj_GetBounds(self, *edges):
            raise PdfiumError("Failed to locate pageobject.")

        return tuple(edge.value for edge in edges)


    def get_quad_points(self):
        """
        Get the object's quadrilateral points (i.e. the positions of its corners).
        For transformed objects, this may provide tighter bounds than a rectangle (e.g. rotation by a non-multiple of 90°, shear).

        Note:
            This function only supports image and text objects.

        Returns:
            tuple[tuple[float*2] * 4]: Corner positions as (x, y) tuples, counter-clockwise from origin, i.e. bottom-left, bottom-right, top-right, top-left, in PDF page coordinates.

        Raises:
            RuntimeError: If the object is neither an image nor a text object.
            PdfiumError: If pdfium fails to provide the quad points.
        """

        supported_types = (pdfium_c.FPDF_PAGEOBJ_IMAGE, pdfium_c.FPDF_PAGEOBJ_TEXT)
        if self.type not in supported_types:
            # as of pdfium 5921
            raise RuntimeError("Quad points only supported for image and text objects.")

        quad = pdfium_c.FS_QUADPOINTSF()
        if not pdfium_c.FPDFPageObj_GetRotatedBounds(self, quad):
            raise PdfiumError("Failed to get quad points.")

        corner_1 = (quad.x1, quad.y1)
        corner_2 = (quad.x2, quad.y2)
        corner_3 = (quad.x3, quad.y3)
        corner_4 = (quad.x4, quad.y4)
        return corner_1, corner_2, corner_3, corner_4


    def get_matrix(self):
        """
        Returns:
            PdfMatrix: The pageobject's current transform matrix.

        Raises:
            PdfiumError: If pdfium fails to provide the matrix.
        """
        raw_matrix = pdfium_c.FS_MATRIX()
        if not pdfium_c.FPDFPageObj_GetMatrix(self, raw_matrix):
            raise PdfiumError("Failed to get matrix of pageobject.")
        return PdfMatrix.from_raw(raw_matrix)


    def set_matrix(self, matrix):
        """
        Parameters:
            matrix (PdfMatrix): Set this matrix as the pageobject's transform matrix.

        Raises:
            PdfiumError: If pdfium rejects the matrix.
        """
        if not pdfium_c.FPDFPageObj_SetMatrix(self, matrix):
            raise PdfiumError("Failed to set matrix of pageobject.")


    def transform(self, matrix):
        """
        Parameters:
            matrix (PdfMatrix): Multiply the pageobject's current transform matrix by this matrix.

        Raises:
            PdfiumError: If pdfium fails to apply the transform.
        """
        if not pdfium_c.FPDFPageObj_TransformF(self, matrix):
            raise PdfiumError("Failed to transform pageobject with matrix.")
|
|
|
|
|
|
class PdfTextObj (PdfObject):
    """
    Textobject helper class.

    You may want to call :meth:`.PdfPage.get_objects` or :meth:`.PdfTextPage.get_textobj` to obtain an instance of this class.

    Attributes:
        textpage (PdfTextPage | None):
            The parent textpage, or None if not set.
    """

    # TODO hold parent object in finalizer
    def __init__(self, *args, textpage=None, **kwargs):
        # A textpage implies its page and document, so forward them to the base initializer.
        if textpage is not None:
            owner_page = textpage.page
            kwargs["page"] = owner_page
            kwargs["pdf"] = owner_page.pdf
        super().__init__(*args, **kwargs)
        self.textpage = textpage

    def extract(self):
        """
        Returns:
            str: The object's text content.

        Note:
            This method requires the :attr:`.textpage` attribute to be set.
            For textobjects obtained through :meth:`.PdfPage.get_objects`, use the ``textpage`` passthrough parameter.

        Raises:
            RuntimeError: If :attr:`.textpage` is not set.
            PdfiumError: If pdfium reports a zero-length result.
        """
        if not self.textpage:
            raise RuntimeError("PdfTextObj.extract() requires textpage to be set.")

        # First call with a null buffer queries the required size in bytes.
        byte_len = pdfium_c.FPDFTextObj_GetText(self, self.textpage, None, 0)
        if byte_len == 0:
            raise PdfiumError("Failed to get text from textobject.")

        # Round up to whole FPDF_WCHAR units (ceiling division).
        unit_count = (byte_len + FPDF_WCHAR_size - 1) // FPDF_WCHAR_size
        wchar_buf = (pdfium_c.FPDF_WCHAR * unit_count)()
        pdfium_c.FPDFTextObj_GetText(self, self.textpage, wchar_buf, byte_len)

        # Drop the last unit (the terminator) before decoding.
        text_view = memoryview(wchar_buf)[:unit_count - 1]
        return decode(text_view, "utf-16-le")

    def get_font(self):
        """
        Returns:
            PdfFont: Handle to the object's font. Provides name and weight info.
        """
        # The font object is _not_ owned by the caller, and the PdfTextObj must remain alive while the font object lives.
        font_handle = pdfium_c.FPDFTextObj_GetFont(self)
        return PdfFont(font_handle, self, needs_free=False)

    def get_font_size(self):
        """
        Returns:
            float: Font size used by the object's text, in PDF canvas units (typically 1/72in).

        Raises:
            PdfiumError: If pdfium fails to provide the font size.
        """
        size_out = c_float()
        if not pdfium_c.FPDFTextObj_GetFontSize(self, size_out):
            raise PdfiumError("Failed to get font size.")
        return size_out.value
|
|
|
|
|
|
class PdfFont (pdfium_i.AutoCloseable):
    """
    Font helper class.

    Attributes:
        raw (FPDF_FONT):
            The underlying PDFium font handle.
        parent:
            The object passed as *parent* on construction (a text object for fonts from :meth:`.PdfTextObj.get_font`, a document for fonts from :meth:`.load_standard`). May be None.
    """

    # TODO hold parent in finalizer
    def __init__(self, raw, parent=None, needs_free=False):
        self.raw = raw
        self.parent = parent
        # Only fonts owned by the caller (needs_free=True) are closed and tracked.
        super().__init__(pdfium_c.FPDFFont_Close, needs_free=needs_free, tracked=needs_free)

    @cached_property
    def is_embedded(self):
        """
        bool: The font's embedding status. True if it is embedded (bundled) in the PDF, False otherwise.
        This is a cached property, as a font object's embedding status is unlikely to change.
        """
        rc = pdfium_c.FPDFFont_GetIsEmbedded(self)
        if rc == -1:
            raise PdfiumError("Failed to determine font embedding status.")
        return rc == 1

    def _get_name_impl(self, api, which, errors):
        # Shared two-call pattern for the name getters: query the size, then fill a buffer.
        # *api* is the pdfium getter, *which* only labels the error message,
        # *errors* is the decode error handler.

        bufsize = api(self, None, 0)
        if bufsize == 0:
            raise PdfiumError(f"Failed to get font {which} name.")

        buffer = ctypes.create_string_buffer(bufsize)
        api(self, buffer, bufsize)

        # Exclude the last byte (terminator) from decoding.
        return decode(memoryview(buffer)[:bufsize-1], "utf-8", errors=errors)

    def get_base_name(self, errors="replace"):
        """
        Parameters:
            errors (str): Error handler passed to the UTF-8 decoder.
        Returns:
            str: The base font name.
        """
        return self._get_name_impl(pdfium_c.FPDFFont_GetBaseFontName, "base", errors)

    def get_family_name(self, errors="replace"):
        """
        Parameters:
            errors (str): Error handler passed to the UTF-8 decoder.
        Returns:
            str: The font family name.
        """
        return self._get_name_impl(pdfium_c.FPDFFont_GetFamilyName, "family", errors)

    def get_weight(self):
        """
        Returns:
            int: The font's weight. Typical values are 400 (normal) and 700 (bold).

        Raises:
            PdfiumError: If pdfium reports failure (-1).
        """
        weight = pdfium_c.FPDFFont_GetWeight(self)
        if weight == -1:
            raise PdfiumError("Failed to get font weight.")
        return weight

    STANDARD_FONTS = ("Times-Roman", "Times-Bold", "Times-Italic", "Times-BoldItalic", "Helvetica", "Helvetica-Bold", "Helvetica-Oblique", "Helvetica-BoldOblique", "Courier", "Courier-Bold", "Courier-Oblique", "Courier-BoldOblique", "Symbol", "ZapfDingbats")
    """
    Standard 14 fonts (Type 1, PostScript names) according to PDF32000_2008, section 9.6.2.2.
    These fonts or suitable substitutes should be available to all PDF engines,
    so PDFs that use them without embedding can still be expected to display correctly.
    """

    @classmethod
    def load_standard(cls, pdf, name):
        """
        Load one of the Standard 14 fonts defined above into a PDF.

        If the font is not available in the system, a substitute may be used.
        Checking :meth:`.get_family_name` should give a clue about internal substitution (e.g. "Chrom Sans OTF", "Chrom Serif OTF").
        For system substitution, consider intercepting what goes through the :class:`.PdfSysfontBase` callbacks.

        Parameters:
            pdf (PdfDocument):
                The document to which the font shall be loaded.
            name (str):
                The font name. Must be one of :attr:`.STANDARD_FONTS`.

        Returns:
            PdfFont: Handle to the loaded font, registered as a kid of *pdf*.

        Raises:
            ValueError: If *name* is not one of :attr:`.STANDARD_FONTS`.
            PdfiumError: If pdfium fails to load the font.
        """
        # Explicit check rather than `assert`, so validation survives `python -O`.
        if name not in cls.STANDARD_FONTS:
            raise ValueError(f"Unknown standard font {name!r}; must be one of PdfFont.STANDARD_FONTS.")

        raw_font = pdfium_c.FPDFText_LoadStandardFont(pdf, name.encode("utf-8"))
        if not raw_font:
            raise PdfiumError(f"Failed to load standard font {name!r}.")

        helper = cls(raw_font, parent=pdf, needs_free=True)
        pdf._add_kid(helper)
        return helper
|
|
|
|
|
|
class PdfImage (PdfObject):
    """
    Image object helper class (specific kind of pageobject).
    """

    # cf. https://crbug.com/pdfium/1203
    #: Filters applied by :func:`FPDFImageObj_GetImageDataDecoded`, referred to as "simple filters". Other filters are considered "complex filters".
    SIMPLE_FILTERS = ("ASCIIHexDecode", "ASCII85Decode", "RunLengthDecode", "FlateDecode", "LZWDecode")


    @classmethod
    def new(cls, pdf):
        """
        Parameters:
            pdf (PdfDocument): The document to which the new image object shall be added.
        Returns:
            PdfImage: Handle to a new, empty image.
            Note that position and size of the image are defined by its matrix, which defaults to the identity matrix.
            This means that new images will appear as a tiny square of 1x1 canvas units on the bottom left corner of the page.
            Use :class:`.PdfMatrix` and :meth:`.set_matrix` to adjust size and position.
        """
        raw_img = pdfium_c.FPDFPageObj_NewImageObj(pdf)
        return cls(raw_img, page=None, pdf=pdf)


    def get_metadata(self):
        """
        Retrieve image metadata including DPI, bits per pixel, color space, and size.
        If the image does not belong to a page yet, bits per pixel and color space will be unset (0).

        Note:
            * The DPI values signify the resolution of the image on the PDF page, not the DPI metadata embedded in the image file.
            * Due to issues in pdfium, this function might be slow on some kinds of images. If you only need size, prefer :meth:`.get_px_size` instead.

        Returns:
            FPDF_IMAGEOBJ_METADATA: Image metadata structure

        Raises:
            PdfiumError: If pdfium fails to provide the metadata.
        """
        # https://crbug.com/pdfium/1928
        metadata = pdfium_c.FPDF_IMAGEOBJ_METADATA()
        ok = pdfium_c.FPDFImageObj_GetImageMetadata(self, self.page, metadata)
        if not ok:
            raise PdfiumError("Failed to get image metadata.")
        return metadata


    def get_px_size(self):
        """
        Returns:
            (int, int): Image dimension in pixels as a tuple of (width, height).

        Raises:
            PdfiumError: If pdfium fails to provide the size.
        """
        # https://pdfium-review.googlesource.com/c/pdfium/+/106290
        w, h = c_uint(), c_uint()
        ok = pdfium_c.FPDFImageObj_GetImagePixelSize(self, w, h)
        if not ok:
            raise PdfiumError("Failed to get image size.")
        return w.value, h.value


    def load_jpeg(self, source, pages=None, inline=False, autoclose=True):
        """
        Set a JPEG as the image object's content.

        Parameters:
            source (str | pathlib.Path | typing.BinaryIO):
                Input JPEG, given as file path or readable byte stream.
            pages (list[PdfPage] | None):
                If replacing an image, pass in a list of loaded pages that might contain it, to update their cache.
                (The same image may be shown multiple times in different transforms across a PDF.)
                May be None or an empty sequence if the image is not shared.
            inline (bool):
                Whether to load the image content into memory. If True, the buffer may be closed after this function call.
                Otherwise, the buffer needs to remain open until the PDF is closed.
            autoclose (bool):
                If the input is a buffer, whether it should be automatically closed once not needed by the PDF anymore.
                Always treated as True if *source* is a file path.

        Raises:
            ValueError: If *source* is neither a file path nor a readable byte stream.
            PdfiumError: If pdfium fails to load the JPEG. A file opened by this method is closed again in that case; a caller-provided stream is left untouched.
        """

        opened_here = isinstance(source, (str, Path))
        if opened_here:
            buffer = open(source, "rb")
            autoclose = True
        elif pdfium_i.is_stream(source, "r"):
            buffer = source
        else:
            raise ValueError(f"Cannot load JPEG from {source} - not a file path or byte stream.")

        try:
            bufaccess, to_hold = pdfium_i.get_bufreader(buffer)
            loader = pdfium_c.FPDFImageObj_LoadJpegFileInline if inline else \
                     pdfium_c.FPDFImageObj_LoadJpegFile

            c_pages, page_count = pdfium_i.pages_c_array(pages)
            ok = loader(c_pages, page_count, self, bufaccess)
            if not ok:
                raise PdfiumError("Failed to load JPEG into image object.")
        except Exception:
            # Do not leak a file handle we opened ourselves if loading fails.
            if opened_here:
                buffer.close()
            raise

        if inline:
            # Touch the held objects after the call; presumably this keeps them referenced until pdfium is done reading.
            for data in to_hold:
                id(data)
            if autoclose:
                buffer.close()
        else:
            # Not inline: the document takes over the reader data (and the buffer, if it shall be closed later).
            self.pdf._data_holder += to_hold
            if autoclose:
                self.pdf._data_closer.append(buffer)


    def set_bitmap(self, bitmap, pages=None):
        """
        Set a bitmap as the image object's content.
        The pixel data will be flate compressed (as of PDFium 5418).

        Parameters:
            bitmap (PdfBitmap):
                The bitmap to inject into the image object.
            pages (list[PdfPage] | None):
                A list of loaded pages that might contain the image object. See :meth:`.load_jpeg`.

        Raises:
            PdfiumError: If pdfium fails to set the bitmap.
        """
        c_pages, page_count = pdfium_i.pages_c_array(pages)
        ok = pdfium_c.FPDFImageObj_SetBitmap(c_pages, page_count, self, bitmap)
        if not ok:
            raise PdfiumError("Failed to set image to bitmap.")


    def _get_rendered_bitmap(self, scale_to_original):
        """
        This is a private implementation function. Do not use externally.

        Renders the image via pdfium and returns the raw bitmap handle (not checked for null here).
        If *scale_to_original* is True, the object's matrix is temporarily scaled and restored afterwards.
        """

        if self.pdf is None:
            raise RuntimeError("Cannot get rendered bitmap of loose pageobject.")

        if scale_to_original:
            # Suggested by pdfium dev Lei Zhang in https://groups.google.com/g/pdfium/c/2czGFBcWHHQ/m/g0wzOJR-BAAJ

            px_w, px_h = self.get_px_size()
            l, b, r, t = self.get_bounds()
            content_w, content_h = abs(r-l), abs(t-b)

            # align pixel and content width/height relation if swapped due to rotation (e.g. 90°, 270°)
            swap = (px_w < px_h) != (content_w < content_h)
            if swap:
                px_w, px_h = px_h, px_w

            # if the image is squashed/stretched, prefer partial upscaling over partial downscaling (not using separate x/y scaling, so the image will look as in the PDF)
            scale_factor = max(px_w/content_w, px_h/content_h)
            orig_mat = self.get_matrix()
            scaled_mat = orig_mat.scale(scale_factor, scale_factor)
            self.set_matrix(scaled_mat)
            # logger.debug(
            #     f"Pixel size: {px_w}, {px_h} (did swap? {swap})\n"
            #     f"Size in page coords: {content_w}, {content_h}\n"
            #     f"Scale: {scale_factor}\n"
            #     f"Current matrix: {orig_mat}\n"
            #     f"Scaled matrix: {scaled_mat}"
            # )

        try:
            raw_bitmap = pdfium_c.FPDFImageObj_GetRenderedBitmap(self.pdf, self.page, self)
        finally:
            # Always restore the original matrix, even if rendering raised.
            if scale_to_original:
                self.set_matrix(orig_mat)

        return raw_bitmap


    def get_bitmap(self, render=False, scale_to_original=True):
        """
        Get a bitmap rasterization of the image.

        Parameters:
            render (bool):
                Whether the image should be rendered, thereby applying possible transform matrices and alpha masks.
            scale_to_original (bool):
                If *render* is True, whether to temporarily scale the image to its native resolution, or close to that (defaults to True). This should improve output quality. Ignored if *render* is False.
        Returns:
            PdfBitmap: Image bitmap (with a buffer allocated by PDFium).

        Raises:
            PdfiumError: If pdfium returns no bitmap.
        """

        if render:
            raw_bitmap = self._get_rendered_bitmap(scale_to_original)
        else:
            raw_bitmap = pdfium_c.FPDFImageObj_GetBitmap(self)

        if not raw_bitmap:
            raise PdfiumError(f"Failed to get bitmap of image {self}.")

        bitmap = PdfBitmap.from_raw(raw_bitmap)
        if render and scale_to_original:
            logger.debug(f"Extracted size: {bitmap.width}, {bitmap.height}")

        return bitmap


    def get_data(self, decode_simple=False):
        """
        Parameters:
            decode_simple (bool):
                If True, decode simple filters (see :attr:`.SIMPLE_FILTERS`), so only complex filters will remain, if any. If there are no complex filters, this provides the decoded pixel data.
                If False, the raw stream data will be returned instead.
        Returns:
            ctypes.Array: The data of the image stream (as :class:`~ctypes.c_ubyte` array).
        """
        func = pdfium_c.FPDFImageObj_GetImageDataDecoded if decode_simple else \
               pdfium_c.FPDFImageObj_GetImageDataRaw
        # Two-call pattern: query the size, then fill a buffer of that size.
        n_bytes = func(self, None, 0)
        buffer = (ctypes.c_ubyte * n_bytes)()
        func(self, buffer, n_bytes)
        return buffer


    def get_filters(self, skip_simple=False):
        """
        Parameters:
            skip_simple (bool):
                If True, exclude simple filters.
        Returns:
            list[str]: A list of image filters, to be applied in order (from lowest to highest index).
        """

        filters = []
        count = pdfium_c.FPDFImageObj_GetImageFilterCount(self)

        for i in range(count):
            length = pdfium_c.FPDFImageObj_GetImageFilter(self, i, None, 0)
            buffer = ctypes.create_string_buffer(length)
            pdfium_c.FPDFImageObj_GetImageFilter(self, i, buffer, length)
            # Exclude the last byte (terminator) from decoding.
            f = decode(memoryview(buffer)[:length-1], "utf-8")
            filters.append(f)

        if skip_simple:
            filters = [f for f in filters if f not in self.SIMPLE_FILTERS]

        return filters


    def extract(self, dest, *args, **kwargs):
        """
        Extract the image into an independently usable file or byte stream, attempting to avoid re-encoding or quality loss, as far as pdfium's limited API permits.

        This method can only extract DCTDecode (JPEG) and JPXDecode (JPEG 2000) images directly.
        Otherwise, the pixel data is decoded and re-encoded using :mod:`PIL`, which is slower and loses the original encoding.
        For images with simple filters only, ``get_data(decode_simple=True)`` is used to preserve higher bit depth or special color formats not supported by ``FPDF_BITMAP``.
        For images with complex filters other than those extracted directly, we have to resort to :meth:`.get_bitmap`.

        Note, this method is not able to account for alpha masks, and potentially other data stored separately of the main image stream, which might lead to incorrect representation of the image.

        Tip:
            The ``pikepdf`` library is capable of preserving the original encoding in many cases where this method is not.

        Parameters:
            dest (str | pathlib.Path | io.BytesIO):
                File path prefix or byte stream to which the image shall be written.
                For a path, the detected format is appended as file extension.
            fb_format (str):
                The image format to use in case it is necessary to (re-)encode the data.

        Raises:
            ValueError: If *dest* is neither a file path nor a writable byte stream.
        """

        # https://crbug.com/pdfium/1930

        # The generator first yields the output format, then expects the target buffer via send().
        extraction_gen = _extract_smart(self, *args, **kwargs)
        format = next(extraction_gen)

        if isinstance(dest, (str, Path)):
            with open(f"{dest}.{format}", "wb") as buf:
                extraction_gen.send(buf)
        elif pdfium_i.is_stream(dest, "w"):
            extraction_gen.send(dest)
        else:
            raise ValueError(f"Cannot extract to '{dest}'")
|
|
|
|
|
|
_ImageInfo = namedtuple("_ImageInfo", "format mode metadata all_filters complex_filters")
|
|
|
|
|
|
class _ImageExtractionError (Exception):
|
|
pass
|
|
|
|
|
|
def _get_pil_mode(cs, bpp):
    """
    Map a pdfium color space constant and bits-per-pixel value to a PIL mode string.

    Returns "1" or "L" for DeviceGray (depending on whether *bpp* is 1), "RGB" for DeviceRGB,
    "CMYK" for DeviceCMYK, and None for any other color space.
    """
    # As of Jan 2025, pdfium does not provide access to the palette, so we cannot handle indexed (palettized) color space.
    # TODO handle ICC-based color spaces (pdfium now provides access to the ICC profile via FPDFImageObj_GetIccProfileDataDecoded(), see commit edd7c5cf)
    if cs == pdfium_c.FPDF_COLORSPACE_DEVICEGRAY:
        if bpp == 1:
            return "1"
        return "L"
    if cs == pdfium_c.FPDF_COLORSPACE_DEVICERGB:
        return "RGB"
    if cs == pdfium_c.FPDF_COLORSPACE_DEVICECMYK:
        return "CMYK"
    return None
|
|
|
|
|
|
def _extract_smart(image_obj, fb_format=None):
    """
    Generator implementing :meth:`.PdfImage.extract`.

    Protocol: the first ``next()`` yields the output format (used as file extension by the caller).
    The caller then passes the writable target buffer via ``send()``, upon which the image is written.

    Parameters:
        image_obj (PdfImage): The image to extract.
        fb_format (str | None):
            Format to use when the data has to be (re-)encoded through PIL.
            If unset, "tiff" is used for CMYK images and "png" otherwise.
    """

    try:
        # TODO can we change PdfImage.get_data() to take an mmap, so the data could be written directly into a file rather than an in-memory array?
        data, info = _extract_direct(image_obj)
    except _ImageExtractionError as e:
        # Direct extraction not possible: fall back to pdfium's bitmap API.
        logger.debug(str(e))
        pil_image = image_obj.get_bitmap(render=False).to_pil()
    else:
        pil_image = None
        format = info.format
        if format == "raw":
            # Decoded pixel data needs to be encoded by PIL.
            # Reuse *data*: _extract_direct() already fetched it with get_data(decode_simple=True),
            # so there is no need to query pdfium for the same payload again.
            metadata = info.metadata
            pil_image = Lazy.PIL_Image.frombuffer(
                info.mode,
                (metadata.width, metadata.height),
                data,
                "raw", info.mode, 0, 1,
            )

    if pil_image is not None:
        format = fb_format
        if not format:
            format = "tiff" if pil_image.mode == "CMYK" else "png"

    buffer = yield format
    if pil_image is not None:
        pil_image.save(buffer, format=format)
    else:
        buffer.write(data)

    yield  # breakpoint preventing StopIteration on .send()
|
|
|
|
|
|
def _extract_direct(image_obj):
    """
    Try to obtain the image's stream data in a directly usable form.

    Returns:
        (ctypes.Array, _ImageInfo): The data from ``get_data(decode_simple=True)`` and a description of it.
        The info's format is "raw" (decoded pixel data, no complex filters), "jpg" (DCTDecode) or "jp2" (JPXDecode).

    Raises:
        _ImageExtractionError: If the color space is unhandled for raw data, the single complex filter is unsupported, or there are multiple complex filters.
    """

    all_filters = image_obj.get_filters()
    complex_filters = [name for name in all_filters if name not in PdfImage.SIMPLE_FILTERS]
    metadata = image_obj.get_metadata()
    mode = _get_pil_mode(metadata.colorspace, metadata.bits_per_pixel)

    if len(complex_filters) > 1:
        raise _ImageExtractionError(f"Cannot handle multiple complex filters {complex_filters}.")

    if not complex_filters:
        # Simple filters only: the decoded data is raw pixels, which requires a known PIL mode.
        if not mode:
            raise _ImageExtractionError(f"Unhandled color space {pdfium_i.ColorspaceToStr.get(metadata.colorspace)} - don't know how to treat data.")
        out_format = "raw"
    else:
        # Exactly one complex filter: only JPEG and JPEG 2000 streams are usable as-is.
        (only_filter,) = complex_filters
        direct_formats = {"DCTDecode": "jpg", "JPXDecode": "jp2"}
        out_format = direct_formats.get(only_filter)
        if out_format is None:
            raise _ImageExtractionError(f"Unhandled complex filter {only_filter}.")

    out_data = image_obj.get_data(decode_simple=True)
    info = _ImageInfo(out_format, mode, metadata, all_filters, complex_filters)
    return out_data, info
|