eptm_dashboard/.venv/lib/python3.12/site-packages/pdfplumber/display.py

import pathlib
from io import BufferedReader, BytesIO
from typing import TYPE_CHECKING, Any, List, Optional, Tuple, Union

import PIL.Image
import PIL.ImageDraw
import pypdfium2  # type: ignore

from . import utils
from ._typing import T_bbox, T_num, T_obj, T_obj_list, T_point, T_seq
from .table import T_table_settings, Table, TableFinder, TableSettings
from .utils.exceptions import MalformedPDFException

if TYPE_CHECKING:  # pragma: nocover
    import pandas as pd

    from .page import Page


class COLORS:
    RED = (255, 0, 0)
    GREEN = (0, 255, 0)
    BLUE = (0, 0, 255)
    TRANSPARENT = (0, 0, 0, 0)


DEFAULT_FILL = COLORS.BLUE + (50,)
DEFAULT_STROKE = COLORS.RED + (200,)
DEFAULT_STROKE_WIDTH = 1
DEFAULT_RESOLUTION = 72

T_color = Union[Tuple[int, int, int], Tuple[int, int, int, int], str]
T_contains_points = Union[Tuple[T_point, ...], List[T_point], T_obj]


def get_page_image(
    stream: Union[BufferedReader, BytesIO],
    path: Optional[pathlib.Path],
    page_ix: int,
    resolution: Union[int, float],
    password: Optional[str],
    antialias: bool = False,
) -> PIL.Image.Image:

    src: Union[pathlib.Path, BufferedReader, BytesIO]

    # If we are working with a file object saved to disk
    if path:
        src = path

    # If we instead are working with a BytesIO stream
    else:
        stream.seek(0)
        src = stream

    try:
        pdfium_doc = pypdfium2.PdfDocument(src, password=password)
    except pypdfium2.PdfiumError as e:
        raise MalformedPDFException(e)

    pdfium_page = pdfium_doc.get_page(page_ix)

    img: PIL.Image.Image = pdfium_page.render(
        # Modifiable arguments
        scale=resolution / 72,
        no_smoothtext=not antialias,
        no_smoothpath=not antialias,
        no_smoothimage=not antialias,
        # Non-modifiable arguments
        prefer_bgrx=True,
    ).to_pil()
    pdfium_doc.close()

    return img.convert("RGB")


class PageImage:
    def __init__(
        self,
        page: "Page",
        original: Optional[PIL.Image.Image] = None,
        resolution: Union[int, float] = DEFAULT_RESOLUTION,
        antialias: bool = False,
        force_mediabox: bool = False,
    ):
        self.page = page
        self.root = page if page.is_original else page.root_page
        self.resolution = resolution

        if original is None:
            self.original = get_page_image(
                stream=page.pdf.stream,
                path=page.pdf.path,
                page_ix=page.page_number - 1,
                resolution=resolution,
                antialias=antialias,
                password=page.pdf.password,
            )
        else:
            self.original = original

        self.scale = self.original.size[0] / (page.cropbox[2] - page.cropbox[0])

        # This value represents the coordinates of the page,
        # in page-unit values, that will be displayed.
        self.bbox = (
            page.bbox
            if page.bbox != page.mediabox
            else (page.mediabox if force_mediabox else page.cropbox)
        )

        # If this value is different than the *Page*'s .cropbox
        # (e.g., because the mediabox differs from the cropbox or
        # or because we've used Page.crop(...)), then we'll need to
        # crop the initially-converted image.
        if page.bbox != page.cropbox:
            crop_dims = self._reproject_bbox(page.cropbox)
            bbox_dims = self._reproject_bbox(self.bbox)
            self.original = self.original.crop(
                (
                    bbox_dims[0] - crop_dims[0],
                    bbox_dims[1] - crop_dims[1],
                    bbox_dims[2] - crop_dims[0],
                    bbox_dims[3] - crop_dims[1],
                )
            )

        self.reset()

    def _reproject_bbox(self, bbox: T_bbox) -> Tuple[int, int, int, int]:
        x0, top, x1, bottom = bbox
        _x0, _top = self._reproject((x0, top))
        _x1, _bottom = self._reproject((x1, bottom))
        return (_x0, _top, _x1, _bottom)

    def _reproject(self, coord: T_point) -> Tuple[int, int]:
        """
        Given an (x0, top) tuple from the *root* coordinate system,
        return an (x0, top) tuple in the *image* coordinate system.
        """
        x0, top = coord
        _x0 = (x0 - self.bbox[0]) * self.scale
        _top = (top - self.bbox[1]) * self.scale
        return (int(_x0), int(_top))

    def reset(self) -> "PageImage":
        self.annotated = PIL.Image.new("RGB", self.original.size)
        self.annotated.paste(self.original)
        self.draw = PIL.ImageDraw.Draw(self.annotated, "RGBA")
        return self

    def save(
        self,
        dest: Union[str, pathlib.Path, BytesIO],
        format: str = "PNG",
        quantize: bool = True,
        colors: int = 256,
        bits: int = 8,
        **kwargs: Any,
    ) -> None:
        if quantize:
            out = self.annotated.quantize(colors, method=PIL.Image.FASTOCTREE).convert(
                "P"
            )
        else:
            out = self.annotated

        out.save(
            dest,
            format=format,
            bits=bits,
            dpi=(self.resolution, self.resolution),
            **kwargs,
        )

    def copy(self) -> "PageImage":
        return self.__class__(self.page, self.original)

    def draw_line(
        self,
        points_or_obj: T_contains_points,
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        # If passing a raw list of points, use those
        if isinstance(points_or_obj, (tuple, list)):
            points = points_or_obj
        # Else, use the "pts" attribute if available
        elif isinstance(points_or_obj, dict) and "pts" in points_or_obj:
            points = [(x, y) for x, y in points_or_obj["pts"]]
        # Otherwise, just use ((x0, top), (x1, bottom))
        else:
            obj = points_or_obj
            points = ((obj["x0"], obj["top"]), (obj["x1"], obj["bottom"]))

        self.draw.line(
            list(map(self._reproject, points)), fill=stroke, width=stroke_width
        )

        return self

    def draw_lines(
        self,
        list_of_lines: Union[T_seq[T_contains_points], "pd.DataFrame"],
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        for x in utils.to_list(list_of_lines):
            self.draw_line(x, stroke=stroke, stroke_width=stroke_width)
        return self

    def draw_vline(
        self,
        location: T_num,
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        points = (location, self.bbox[1], location, self.bbox[3])
        self.draw.line(self._reproject_bbox(points), fill=stroke, width=stroke_width)
        return self

    def draw_vlines(
        self,
        locations: Union[List[T_num], "pd.Series[float]"],
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        for x in list(locations):
            self.draw_vline(x, stroke=stroke, stroke_width=stroke_width)
        return self

    def draw_hline(
        self,
        location: T_num,
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        points = (self.bbox[0], location, self.bbox[2], location)
        self.draw.line(self._reproject_bbox(points), fill=stroke, width=stroke_width)
        return self

    def draw_hlines(
        self,
        locations: Union[List[T_num], "pd.Series[float]"],
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        for x in list(locations):
            self.draw_hline(x, stroke=stroke, stroke_width=stroke_width)
        return self

    def draw_rect(
        self,
        bbox_or_obj: Union[T_bbox, T_obj],
        fill: T_color = DEFAULT_FILL,
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        if isinstance(bbox_or_obj, (tuple, list)):
            bbox = bbox_or_obj
        else:
            obj = bbox_or_obj
            bbox = (obj["x0"], obj["top"], obj["x1"], obj["bottom"])

        x0, top, x1, bottom = bbox
        half = stroke_width / 2
        x0 = min(x0 + half, (x0 + x1) / 2)
        top = min(top + half, (top + bottom) / 2)
        x1 = max(x1 - half, (x0 + x1) / 2)
        bottom = max(bottom - half, (top + bottom) / 2)

        fill_bbox = self._reproject_bbox((x0, top, x1, bottom))
        self.draw.rectangle(fill_bbox, fill, COLORS.TRANSPARENT)

        if stroke_width > 0:
            segments = [
                ((x0, top), (x1, top)),  # top
                ((x0, bottom), (x1, bottom)),  # bottom
                ((x0, top), (x0, bottom)),  # left
                ((x1, top), (x1, bottom)),  # right
            ]
            self.draw_lines(segments, stroke=stroke, stroke_width=stroke_width)
        return self

    def draw_rects(
        self,
        list_of_rects: Union[List[T_bbox], T_obj_list, "pd.DataFrame"],
        fill: T_color = DEFAULT_FILL,
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":
        for x in utils.to_list(list_of_rects):
            self.draw_rect(x, fill=fill, stroke=stroke, stroke_width=stroke_width)
        return self

    def draw_circle(
        self,
        center_or_obj: Union[T_point, T_obj],
        radius: int = 5,
        fill: T_color = DEFAULT_FILL,
        stroke: T_color = DEFAULT_STROKE,
    ) -> "PageImage":
        if isinstance(center_or_obj, tuple):
            center = center_or_obj
        else:
            obj = center_or_obj
            center = ((obj["x0"] + obj["x1"]) / 2, (obj["top"] + obj["bottom"]) / 2)
        cx, cy = center
        bbox = (cx - radius, cy - radius, cx + radius, cy + radius)
        self.draw.ellipse(self._reproject_bbox(bbox), fill, stroke)
        return self

    def draw_circles(
        self,
        list_of_circles: Union[List[T_point], T_obj_list, "pd.DataFrame"],
        radius: int = 5,
        fill: T_color = DEFAULT_FILL,
        stroke: T_color = DEFAULT_STROKE,
    ) -> "PageImage":
        for x in utils.to_list(list_of_circles):
            self.draw_circle(x, radius=radius, fill=fill, stroke=stroke)
        return self

    def debug_table(
        self,
        table: Table,
        fill: T_color = DEFAULT_FILL,
        stroke: T_color = DEFAULT_STROKE,
        stroke_width: int = 1,
    ) -> "PageImage":
        """
        Outline all found tables.
        """
        self.draw_rects(
            table.cells, fill=fill, stroke=stroke, stroke_width=stroke_width
        )
        return self

    def debug_tablefinder(
        self,
        table_settings: Optional[
            Union[TableFinder, TableSettings, T_table_settings]
        ] = None,
    ) -> "PageImage":
        if isinstance(table_settings, TableFinder):
            finder = table_settings
        elif table_settings is None or isinstance(
            table_settings, (TableSettings, dict)
        ):
            finder = self.page.debug_tablefinder(table_settings)
        else:
            raise ValueError(
                "Argument must be instance of TableFinder"
                "or a TableFinder settings dict."
            )

        for table in finder.tables:
            self.debug_table(table)

        self.draw_lines(finder.edges, stroke_width=1)

        self.draw_circles(
            list(finder.intersections.keys()),
            fill=COLORS.TRANSPARENT,
            stroke=COLORS.BLUE + (200,),
            radius=3,
        )
        return self

    def outline_words(
        self,
        stroke: T_color = DEFAULT_STROKE,
        fill: T_color = DEFAULT_FILL,
        stroke_width: int = DEFAULT_STROKE_WIDTH,
        x_tolerance: T_num = utils.DEFAULT_X_TOLERANCE,
        y_tolerance: T_num = utils.DEFAULT_Y_TOLERANCE,
    ) -> "PageImage":

        words = self.page.extract_words(
            x_tolerance=x_tolerance, y_tolerance=y_tolerance
        )
        self.draw_rects(words, stroke=stroke, fill=fill, stroke_width=stroke_width)
        return self

    def outline_chars(
        self,
        stroke: T_color = (255, 0, 0, 255),
        fill: T_color = (255, 0, 0, int(255 / 4)),
        stroke_width: int = DEFAULT_STROKE_WIDTH,
    ) -> "PageImage":

        self.draw_rects(
            self.page.chars, stroke=stroke, fill=fill, stroke_width=stroke_width
        )
        return self

    def _repr_png_(self) -> bytes:
        b = BytesIO()
        self.save(b, "PNG")
        return b.getvalue()

    def show(self) -> None:  # pragma: no cover
        self.annotated.show()