eptm_dashboard/.venv/lib/python3.12/site-packages/pdfplumber/display.py

403 lines
12 KiB
Python

import pathlib
from io import BufferedReader, BytesIO
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union
import PIL.Image
import PIL.ImageDraw
import pypdfium2 # type: ignore
from . import utils
from ._typing import T_bbox, T_num, T_obj, T_obj_list, T_point, T_seq
from .table import T_table_settings, Table, TableFinder, TableSettings
from .utils.exceptions import MalformedPDFException
if TYPE_CHECKING: # pragma: nocover
import pandas as pd
from .page import Page
class COLORS:
    """Namespace of named color tuples used by the drawing helpers."""

    # Opaque primaries, as (red, green, blue) triples.
    RED = (255, 0, 0)
    GREEN = (0, 255, 0)
    BLUE = (0, 0, 255)

    # Four-component value whose last (alpha) component is zero.
    TRANSPARENT = (0, 0, 0, 0)
# Default overlay appearance: the base color plus a fourth (alpha) component.
DEFAULT_FILL = (*COLORS.BLUE, 50)
DEFAULT_STROKE = (*COLORS.RED, 200)
DEFAULT_STROKE_WIDTH = 1

# get_page_image() renders at a scale of resolution / 72, so 72 means scale 1.
DEFAULT_RESOLUTION = 72

# A color is an RGB triple, an RGBA quadruple, or a string
# (presumably a color name/spec understood by PIL -- confirm).
T_color = Union[Tuple[int, int, int], Tuple[int, int, int, int], str]

# Anything a line can be drawn from: a sequence of points or a single object.
T_contains_points = Union[Tuple[T_point, ...], List[T_point], T_obj]
def get_page_image(
    stream: Union[BufferedReader, BytesIO],
    path: Optional[pathlib.Path],
    page_ix: int,
    resolution: Union[int, float],
    password: Optional[str],
    antialias: bool = False,
) -> PIL.Image.Image:
    """
    Render a single page of a PDF with pypdfium2 and return it as an
    RGB PIL image.

    Parameters:
        stream: Open file-like object holding the PDF. Only used (after
            being rewound to position 0) when `path` is falsy.
        path: Location of the PDF on disk. Takes precedence over `stream`
            when truthy.
        page_ix: Index of the page, passed straight to
            `PdfDocument.get_page` (callers in this module pass
            `page_number - 1`).
        resolution: Rendering resolution; the page is rendered at a scale
            of `resolution / 72`.
        password: Password forwarded to `pypdfium2.PdfDocument`.
        antialias: When False (the default), text, path and image
            smoothing are all disabled in the renderer.

    Returns:
        The rendered page, converted to mode "RGB".

    Raises:
        MalformedPDFException: if pypdfium2 raises `PdfiumError` while
            opening the document.
    """
    src: Union[pathlib.Path, BufferedReader, BytesIO]

    # If we are working with a file object saved to disk
    if path:
        src = path
    # If we instead are working with a BytesIO stream
    else:
        stream.seek(0)
        src = stream

    try:
        pdfium_doc = pypdfium2.PdfDocument(src, password=password)
    except pypdfium2.PdfiumError as e:
        # Chain explicitly so the pdfium error is recorded as the cause.
        raise MalformedPDFException(e) from e

    # Make sure the document is closed even if page lookup or rendering
    # fails; otherwise the pdfium handle would leak on error.
    try:
        pdfium_page = pdfium_doc.get_page(page_ix)
        img: PIL.Image.Image = pdfium_page.render(
            # Modifiable arguments
            scale=resolution / 72,
            no_smoothtext=not antialias,
            no_smoothpath=not antialias,
            no_smoothimage=not antialias,
            # Non-modifiable arguments
            prefer_bgrx=True,
        ).to_pil()
    finally:
        pdfium_doc.close()

    return img.convert("RGB")
class PageImage:
    """
    A raster image of a page plus a drawing layer for debug overlays.

    `original` holds the rendered (and, if needed, cropped) page image.
    `annotated` is a working copy of it that the `draw_*`, `debug_*` and
    `outline_*` methods paint onto through `draw`. Those methods return
    `self` so that calls can be chained. Coordinates passed to them are in
    page units and are mapped to pixels via `bbox` and `scale`.
    """

    def __init__(
        self,
        page: "Page",
        original: Optional[PIL.Image.Image] = None,
        resolution: Union[int, float] = DEFAULT_RESOLUTION,
        antialias: bool = False,
        force_mediabox: bool = False,
    ):
        """
        Parameters:
            page: The page to display.
            original: A pre-rendered image of the page. When None, the
                page is rendered with `get_page_image`.
            resolution: Forwarded to `get_page_image` when rendering, and
                written as the DPI value by `save`.
            antialias: Forwarded to `get_page_image` when rendering.
            force_mediabox: When `page.bbox` equals `page.mediabox`,
                display the mediabox instead of the cropbox.
        """
        self.page = page
        self.root = page if page.is_original else page.root_page
        self.resolution = resolution

        if original is None:
            self.original = get_page_image(
                stream=page.pdf.stream,
                path=page.pdf.path,
                page_ix=page.page_number - 1,
                resolution=resolution,
                antialias=antialias,
                password=page.pdf.password,
            )
        else:
            self.original = original

        # Pixels per page unit, derived from the image and cropbox widths.
        self.scale = self.original.size[0] / (page.cropbox[2] - page.cropbox[0])

        # This value represents the coordinates of the page,
        # in page-unit values, that will be displayed.
        self.bbox = (
            page.bbox
            if page.bbox != page.mediabox
            else (page.mediabox if force_mediabox else page.cropbox)
        )

        # If this value is different than the *Page*'s .cropbox
        # (e.g., because the mediabox differs from the cropbox or
        # or because we've used Page.crop(...)), then we'll need to
        # crop the initially-converted image.
        if page.bbox != page.cropbox:
            crop_dims = self._reproject_bbox(page.cropbox)
            bbox_dims = self._reproject_bbox(self.bbox)
            self.original = self.original.crop(
                (
                    bbox_dims[0] - crop_dims[0],
                    bbox_dims[1] - crop_dims[1],
                    bbox_dims[2] - crop_dims[0],
                    bbox_dims[3] - crop_dims[1],
                )
            )

        self.reset()

    def _reproject_bbox(self, bbox: T_bbox) -> Tuple[int, int, int, int]:
        """
        Convert an (x0, top, x1, bottom) box from page units to image
        pixels by reprojecting its two corners with `_reproject`.
        """
        x0, top, x1, bottom = bbox
        _x0, _top = self._reproject((x0, top))
        _x1, _bottom = self._reproject((x1, bottom))
        return (_x0, _top, _x1, _bottom)

    def _reproject(self, coord: T_point) -> Tuple[int, int]:
        """
        Given an (x0, top) tuple from the *root* coordinate system,
        return an (x0, top) tuple in the *image* coordinate system.

        The point is offset by the top-left corner of `self.bbox`,
        multiplied by `self.scale`, and truncated with `int()`.
        """
        x0, top = coord
        _x0 = (x0 - self.bbox[0]) * self.scale
        _top = (top - self.bbox[1]) * self.scale
        return (int(_x0), int(_top))

    def reset(self) -> "PageImage":
        """
        Discard all annotations: rebuild `annotated` as a fresh RGB image
        with `original` pasted in, and create a new RGBA-mode drawing
        context for it.
        """
        self.annotated = PIL.Image.new("RGB", self.original.size)
        self.annotated.paste(self.original)
        self.draw = PIL.ImageDraw.Draw(self.annotated, "RGBA")
        return self

    def save(
        self,
        dest: Union[str, pathlib.Path, BytesIO],
        format: str = "PNG",
        quantize: bool = True,
        colors: int = 256,
        bits: int = 8,
        **kwargs: Any,
    ) -> None:
        """
        Write the annotated image to `dest`.

        Parameters:
            dest: Destination passed to the PIL image's `save`.
            format: Image format name passed to `save`.
            quantize: When True, reduce the image to a `colors`-entry
                palette (FASTOCTREE method) before saving.
            colors: Palette size used when `quantize` is True.
            bits: Passed through to `save` as `bits`.
            **kwargs: Any further keyword arguments for `save`.
        """
        if quantize:
            out = self.annotated.quantize(colors, method=PIL.Image.FASTOCTREE).convert(
                "P"
            )
        else:
            out = self.annotated

        out.save(
            dest,
            format=format,
            bits=bits,
            dpi=(self.resolution, self.resolution),
            **kwargs,
        )

    def copy(self) -> "PageImage":
        """
        Return a new, un-annotated PageImage for the same page and image.

        The state is cloned directly rather than by calling `__init__`
        again: `self.original` may already have been cropped, and
        re-running the constructor on it would recompute `scale` from the
        cropped width and crop the image a second time. Cloning also keeps
        `resolution`, which a constructor call would reset to the default.
        """
        duplicate = self.__class__.__new__(self.__class__)
        duplicate.__dict__.update(self.__dict__)
        # Give the copy its own annotation layer and drawing context.
        duplicate.reset()
        return duplicate

    def draw_line(
        self,
        points_or_obj: T_contains_points,
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        """
        Draw a line through a sequence of points, or along an object.

        `points_or_obj` may be a tuple/list of (x, y) points, a dict with
        a "pts" entry, or any other object with "x0", "top", "x1" and
        "bottom" keys (drawn from its top-left to its bottom-right).
        """
        # If passing a raw list of points, use those
        if isinstance(points_or_obj, (tuple, list)):
            points = points_or_obj
        # Else, use the "pts" attribute if available
        elif isinstance(points_or_obj, dict) and "pts" in points_or_obj:
            points = [(x, y) for x, y in points_or_obj["pts"]]
        # Otherwise, just use ((x0, top), (x1, bottom))
        else:
            obj = points_or_obj
            points = ((obj["x0"], obj["top"]), (obj["x1"], obj["bottom"]))

        self.draw.line(
            list(map(self._reproject, points)), fill=stroke, width=stroke_width
        )
        return self

    def draw_lines(
        self,
        list_of_lines: Union[T_seq[T_contains_points], "pd.DataFrame"],
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        """Call `draw_line` for every item of `list_of_lines`."""
        for x in utils.to_list(list_of_lines):
            self.draw_line(x, stroke=stroke, stroke_width=stroke_width)
        return self

    def draw_vline(
        self,
        location: T_num,
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        """
        Draw a vertical line at x = `location` (page units), spanning the
        displayed bbox from top to bottom.
        """
        points = (location, self.bbox[1], location, self.bbox[3])
        self.draw.line(self._reproject_bbox(points), fill=stroke, width=stroke_width)
        return self

    def draw_vlines(
        self,
        locations: Union[List[T_num], "pd.Series[float]"],
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        """Call `draw_vline` for every value in `locations`."""
        for x in locations:
            self.draw_vline(x, stroke=stroke, stroke_width=stroke_width)
        return self

    def draw_hline(
        self,
        location: T_num,
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        """
        Draw a horizontal line at y = `location` (page units), spanning
        the displayed bbox from left to right.
        """
        points = (self.bbox[0], location, self.bbox[2], location)
        self.draw.line(self._reproject_bbox(points), fill=stroke, width=stroke_width)
        return self

    def draw_hlines(
        self,
        locations: Union[List[T_num], "pd.Series[float]"],
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        """Call `draw_hline` for every value in `locations`."""
        for x in locations:
            self.draw_hline(x, stroke=stroke, stroke_width=stroke_width)
        return self

    def draw_rect(
        self,
        bbox_or_obj: Union[T_bbox, T_obj],
        fill: T_color = DEFAULT_FILL,
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        """
        Draw a filled rectangle with an optional outline.

        `bbox_or_obj` is either an (x0, top, x1, bottom) tuple/list or an
        object with those four keys. Each edge is moved inward by half of
        `stroke_width`, but never past the rectangle's midpoint, before
        the fill and the four outline segments are drawn. The outline is
        skipped when `stroke_width` is not positive.
        """
        if isinstance(bbox_or_obj, (tuple, list)):
            bbox = bbox_or_obj
        else:
            obj = bbox_or_obj
            bbox = (obj["x0"], obj["top"], obj["x1"], obj["bottom"])

        x0, top, x1, bottom = bbox
        half = stroke_width / 2

        # Take the midpoints from the untouched edges. Computing them
        # after x0/top have been inset would skew the clamp applied to
        # x1/bottom for rectangles narrower than 1.5 * stroke_width.
        mid_x = (x0 + x1) / 2
        mid_y = (top + bottom) / 2

        x0 = min(x0 + half, mid_x)
        top = min(top + half, mid_y)
        x1 = max(x1 - half, mid_x)
        bottom = max(bottom - half, mid_y)

        fill_bbox = self._reproject_bbox((x0, top, x1, bottom))
        self.draw.rectangle(fill_bbox, fill, COLORS.TRANSPARENT)

        if stroke_width > 0:
            segments = [
                ((x0, top), (x1, top)),  # top
                ((x0, bottom), (x1, bottom)),  # bottom
                ((x0, top), (x0, bottom)),  # left
                ((x1, top), (x1, bottom)),  # right
            ]
            self.draw_lines(segments, stroke=stroke, stroke_width=stroke_width)
        return self

    def draw_rects(
        self,
        list_of_rects: Union[List[T_bbox], T_obj_list, "pd.DataFrame"],
        fill: T_color = DEFAULT_FILL,
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        """Call `draw_rect` for every item of `list_of_rects`."""
        for x in utils.to_list(list_of_rects):
            self.draw_rect(x, fill=fill, stroke=stroke, stroke_width=stroke_width)
        return self

    def draw_circle(
        self,
        center_or_obj: Union[T_point, T_obj],
        radius: int = 5,
        fill: T_color = DEFAULT_FILL,
        stroke: T_color = DEFAULT_STROKE,
    ) -> "PageImage":
        """
        Draw a circle of `radius` page units.

        `center_or_obj` is either an (x, y) tuple/list giving the center,
        or an object with "x0", "top", "x1" and "bottom" keys, in which
        case the circle is centered on the middle of that box.
        """
        # Accept lists as well as tuples, matching draw_line / draw_rect.
        if isinstance(center_or_obj, (tuple, list)):
            center = center_or_obj
        else:
            obj = center_or_obj
            center = ((obj["x0"] + obj["x1"]) / 2, (obj["top"] + obj["bottom"]) / 2)
        cx, cy = center
        bbox = (cx - radius, cy - radius, cx + radius, cy + radius)
        self.draw.ellipse(self._reproject_bbox(bbox), fill, stroke)
        return self

    def draw_circles(
        self,
        list_of_circles: Union[List[T_point], T_obj_list, "pd.DataFrame"],
        radius: int = 5,
        fill: T_color = DEFAULT_FILL,
        stroke: T_color = DEFAULT_STROKE,
    ) -> "PageImage":
        """Call `draw_circle` for every item of `list_of_circles`."""
        for x in utils.to_list(list_of_circles):
            self.draw_circle(x, radius=radius, fill=fill, stroke=stroke)
        return self

    def debug_table(
        self,
        table: Table,
        fill: T_color = DEFAULT_FILL,
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = 1,
    ) -> "PageImage":
        """
        Outline every cell of the given table.
        """
        self.draw_rects(
            table.cells, fill=fill, stroke=stroke, stroke_width=stroke_width
        )
        return self

    def debug_tablefinder(
        self,
        table_settings: Optional[
            Union[TableFinder, TableSettings, T_table_settings]
        ] = None,
    ) -> "PageImage":
        """
        Visualise a table-finding run: the cells of each found table, the
        finder's edges, and a small circle at each intersection.

        `table_settings` may be an existing TableFinder, or None / a
        TableSettings / a dict, in which case a finder is obtained from
        `self.page.debug_tablefinder`.

        Raises:
            ValueError: if `table_settings` is of any other type.
        """
        if isinstance(table_settings, TableFinder):
            finder = table_settings
        elif table_settings is None or isinstance(
            table_settings, (TableSettings, dict)
        ):
            finder = self.page.debug_tablefinder(table_settings)
        else:
            raise ValueError(
                "Argument must be instance of TableFinder "
                "or a TableFinder settings dict."
            )

        for table in finder.tables:
            self.debug_table(table)

        self.draw_lines(finder.edges, stroke_width=1)

        self.draw_circles(
            list(finder.intersections.keys()),
            fill=COLORS.TRANSPARENT,
            stroke=COLORS.BLUE + (200,),
            radius=3,
        )
        return self

    def outline_words(
        self,
        stroke: T_color = DEFAULT_STROKE,
        fill: T_color = DEFAULT_FILL,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
        x_tolerance: T_num = utils.DEFAULT_X_TOLERANCE,
        y_tolerance: T_num = utils.DEFAULT_Y_TOLERANCE,
    ) -> "PageImage":
        """
        Draw a rectangle around each word returned by
        `self.page.extract_words` for the given tolerances.
        """
        words = self.page.extract_words(
            x_tolerance=x_tolerance, y_tolerance=y_tolerance
        )
        self.draw_rects(words, stroke=stroke, fill=fill, stroke_width=stroke_width)
        return self

    def outline_chars(
        self,
        stroke: T_color = (255, 0, 0, 255),
        fill: T_color = (255, 0, 0, int(255 / 4)),
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        """Draw a rectangle around each entry of `self.page.chars`."""
        self.draw_rects(
            self.page.chars, stroke=stroke, fill=fill, stroke_width=stroke_width
        )
        return self

    def _repr_png_(self) -> bytes:
        """Return the annotated image as PNG bytes, produced via `save`."""
        b = BytesIO()
        self.save(b, "PNG")
        return b.getvalue()

    def show(self) -> None:  # pragma: no cover
        """Open the annotated image with PIL's `Image.show`."""
        self.annotated.show()