eptm_dashboard/.venv/lib/python3.12/site-packages/pdfplumber/page.py

731 lines
25 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

import numbers
import re
from functools import lru_cache
from typing import (
TYPE_CHECKING,
Any,
Callable,
Dict,
Generator,
List,
Optional,
Pattern,
Tuple,
Union,
)
from unicodedata import normalize as normalize_unicode
from warnings import warn
from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
LTChar,
LTComponent,
LTContainer,
LTCurve,
LTItem,
LTPage,
LTTextContainer,
)
from pdfminer.pdfinterp import PDFPageInterpreter, PDFStackT
from pdfminer.pdfpage import PDFPage
from pdfminer.psparser import PSLiteral
from . import utils
from ._typing import T_bbox, T_num, T_obj, T_obj_list
from .container import Container
from .structure import PDFStructTree, StructTreeMissing
from .table import T_table_settings, Table, TableFinder, TableSettings
from .utils import decode_text, resolve_all, resolve_and_decode
from .utils.exceptions import MalformedPDFException, PdfminerException
from .utils.text import TextMap
# Matches the leading "LT" of a layout class name; process_object strips it
# to derive each object's "object_type".
lt_pat = re.compile(r"^LT")

# Attribute names that process_object copies from a layout object's
# __dict__ into the parsed object dictionary; anything else is skipped.
ALL_ATTRS = {
    "adv",
    "height",
    "linewidth",
    "pts",
    "size",
    "srcsize",
    "width",
    "x0",
    "x1",
    "y0",
    "y1",
    "bits",
    "matrix",
    "upright",
    "fontname",
    "text",
    "imagemask",
    "colorspace",
    "evenodd",
    "fill",
    "non_stroking_color",
    "stroke",
    "stroking_color",
    "stream",
    "name",
    "mcid",
    "tag",
}
if TYPE_CHECKING: # pragma: nocover
from .display import PageImage
from .pdf import PDF
# via https://git.ghostscript.com/?p=mupdf.git;a=blob;f=source/pdf/pdf-font.c;h=6322cedf2c26cfb312c0c0878d7aff97b4c7470e;hb=HEAD#l774 # noqa
# Byte-encoded font names and the readable names they are replaced with.
# NOTE(review): the constant's name suggests these bytes are CP936-encoded.
CP936_FONTNAMES = {
    b"\xcb\xce\xcc\xe5": "SimSun,Regular",
    b"\xba\xda\xcc\xe5": "SimHei,Regular",
    b"\xbf\xac\xcc\xe5_GB2312": "SimKai,Regular",
    b"\xb7\xc2\xcb\xce_GB2312": "SimFang,Regular",
    b"\xc1\xa5\xca\xe9": "SimLi,Regular",
}


def fix_fontname_bytes(fontname: bytes) -> str:
    """Turn a byte-encoded font name into a printable string.

    The name is split after its first ``+`` (if any). The part after the
    split is looked up in ``CP936_FONTNAMES``; when it is not listed there,
    its escaped ``bytes`` representation is used instead. The part up to and
    including the ``+`` is always rendered via its escaped representation.

    Args:
        fontname: The raw font name.

    Returns:
        The printable prefix followed by the mapped or escaped remainder.
    """

    def printable(raw: bytes) -> str:
        # str(b"abc") is "b'abc'"; slice off the leading "b'" and trailing "'".
        return str(raw)[2:-1]

    # find() yields -1 when there is no "+", which makes the cut point 0 and
    # therefore leaves the head empty.
    cut = fontname.find(b"+") + 1
    head, tail = fontname[:cut], fontname[cut:]

    replacement = CP936_FONTNAMES.get(tail)
    if replacement is None:
        replacement = printable(tail)
    return printable(head) + replacement
def tuplify_list_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Return a new dict in which every ``list`` value is replaced by a ``tuple``.

    Values of any other type are carried over unchanged, and the input
    mapping itself is not modified. Only top-level values are converted;
    lists nested inside other values are left as they are.

    Args:
        kwargs: Keyword arguments to convert.

    Returns:
        A fresh dictionary with the same keys.
    """
    converted: Dict[str, Any] = {}
    for name, val in kwargs.items():
        converted[name] = tuple(val) if isinstance(val, list) else val
    return converted
class PDFPageAggregatorWithMarkedContent(PDFPageAggregator):
    """Page aggregator that stamps rendered objects with marked-content info.

    The tag name and MCID that are current when a character, image or path
    is rendered are attached to the resulting layout object as ``tag`` and
    ``mcid``.
    """

    # Tag name / marked-content ID currently in effect (None when no tag is
    # open, or when the open tag carries no MCID).
    cur_mcid: Optional[int] = None
    cur_tag: Optional[str] = None

    def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
        """Remember the tag that just opened, and its MCID if it has one."""
        self.cur_tag = decode_text(tag.name)
        has_mcid = isinstance(props, dict) and "MCID" in props
        self.cur_mcid = props["MCID"] if has_mcid else None  # type: ignore

    def end_tag(self) -> None:
        """Handle the end of a tag by clearing the current tag and MCID."""
        self.cur_mcid = None
        self.cur_tag = None

    def tag_cur_item(self) -> None:
        """Stamp the newest object of the current item with the tag and MCID.

        The newest object is assumed to be the one pdfminer.six just created.
        """
        # This is a workaround: pdfminer.six has no MCID support and its
        # render_* methods do not return the objects they create. Reading
        # its code suggests each render_* call creates a single object, but
        # nothing guarantees that.
        created = self.cur_item._objs
        if not created:
            return
        newest = created[-1]
        newest.mcid = self.cur_mcid  # type: ignore
        newest.tag = self.cur_tag  # type: ignore

    def render_char(self, *args, **kwargs) -> float:  # type: ignore
        """Render a character as usual, then tag the resulting object."""
        advance = super().render_char(*args, **kwargs)
        self.tag_cur_item()
        return advance

    def render_image(self, *args, **kwargs) -> None:  # type: ignore
        """Render an image as usual, then tag the resulting object."""
        super().render_image(*args, **kwargs)
        self.tag_cur_item()

    def paint_path(self, *args, **kwargs) -> None:  # type: ignore
        """Paint a line or curve as usual, then tag the resulting object."""
        super().paint_path(*args, **kwargs)
        self.tag_cur_item()
def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox:
    """Order a box's corners as (low-x, low-y, high-x, high-y).

    When ``rotation`` is 90 or 270, the x and y values are additionally
    swapped, giving (low-y, low-x, high-y, high-x).

    Args:
        box_raw: Four coordinates describing two diagonally opposite corners.
        rotation: Page rotation in degrees.

    Returns:
        The reordered (and, for 90/270, axis-swapped) box.

    Raises:
        MalformedPDFException: If any coordinate is not a number.
    """
    # Per PDF Reference 3.8.4: "Note: Although rectangles are
    # conventionally specified by their lower-left and upperright
    # corners, it is acceptable to specify any two diagonally opposite
    # corners."
    for coord in box_raw:
        if not isinstance(coord, numbers.Number):  # pragma: nocover
            raise MalformedPDFException(
                f"Bounding box contains non-number coordinate(s): {box_raw}"
            )

    x_lo, x_hi = sorted((box_raw[0], box_raw[2]))
    y_lo, y_hi = sorted((box_raw[1], box_raw[3]))

    swap_axes = rotation in [90, 270]
    return (y_lo, x_lo, y_hi, x_hi) if swap_axes else (x_lo, y_lo, x_hi, y_hi)
# PDF coordinate spaces place the origin at the bottom-left of the page;
# pdfplumber flips the vertical axis so that the origin is at the top-left.
def _invert_box(box_raw: T_bbox, mb_height: T_num) -> T_bbox:
    """Flip a box vertically within a space of height ``mb_height``.

    Args:
        box_raw: Box as (x0, y0, x1, y1).
        mb_height: Height used as the reference for the flip.

    Returns:
        (x0, mb_height - y1, x1, mb_height - y0); x values are unchanged.
    """
    left, lower, right, upper = box_raw
    top = mb_height - upper
    bottom = mb_height - lower
    return (left, top, right, bottom)
class Page(Container):
    """A single PDF page: its boxes, layout objects, annotations, text and tables.

    Box coordinates are stored with the vertical axis flipped relative to the
    raw PDF values (see ``_invert_box``).
    """

    cached_properties: List[str] = Container.cached_properties + ["_layout"]
    is_original: bool = True
    pages = None

    def __init__(
        self,
        pdf: "PDF",
        page_obj: PDFPage,
        page_number: int,
        initial_doctop: T_num = 0,
    ):
        """Read rotation and page boxes from ``page_obj`` and set up caching.

        Args:
            pdf: The PDF object this page belongs to.
            page_obj: The pdfminer page being wrapped.
            page_number: Number recorded on the page and on every parsed object.
            initial_doctop: Offset added to an object's ``top`` to get ``doctop``.
        """
        self.pdf = pdf
        self.root_page = self
        self.page_obj = page_obj
        self.page_number = page_number
        self.initial_doctop = initial_doctop

        def get_attr(key: str, default: Any = None) -> Any:
            # Resolve the page attribute, falling back to `default` when the
            # resolved value is None.
            value = resolve_all(page_obj.attrs.get(key))
            return default if value is None else value

        # Per PDF Reference Table 3.27: "The number of degrees by which the
        # page should be rotated clockwise when displayed or printed. The value
        # must be a multiple of 90. Default value: 0"
        _rotation = get_attr("Rotate", 0)
        self.rotation = _rotation % 360

        mb_raw = _normalize_box(get_attr("MediaBox"), self.rotation)
        mb_height = mb_raw[3] - mb_raw[1]

        self.mediabox = _invert_box(mb_raw, mb_height)

        # Optional boxes become lowercase attributes (cropbox, trimbox, ...)
        # only when the page dictionary actually defines them.
        for box_name in ["CropBox", "TrimBox", "BleedBox", "ArtBox"]:
            if box_name in page_obj.attrs:
                box_normalized = _invert_box(
                    _normalize_box(get_attr(box_name), self.rotation), mb_height
                )
                setattr(self, box_name.lower(), box_normalized)

        if "CropBox" not in page_obj.attrs:
            self.cropbox = self.mediabox

        # Page.bbox defaults to self.mediabox, but can be altered by Page.crop(...)
        self.bbox = self.mediabox

        # See https://rednafi.com/python/lru_cache_on_methods/
        self.get_textmap = lru_cache()(self._get_textmap)

    def close(self) -> None:
        """Flush cached properties and clear the text-map cache."""
        self.flush_cache()
        self.get_textmap.cache_clear()

    @property
    def width(self) -> T_num:
        """Horizontal extent of ``self.bbox``."""
        return self.bbox[2] - self.bbox[0]

    @property
    def height(self) -> T_num:
        """Vertical extent of ``self.bbox``."""
        return self.bbox[3] - self.bbox[1]

    @property
    def structure_tree(self) -> List[Dict[str, Any]]:
        """Return the structure tree for a page, if any."""
        try:
            return [elem.to_dict() for elem in PDFStructTree(self.pdf, self)]
        except StructTreeMissing:
            return []

    @property
    def layout(self) -> LTPage:
        """Return the pdfminer layout of this page, computing it on first use.

        Raises:
            PdfminerException: Wraps (and is chained from) any exception
                raised while pdfminer processes the page.
        """
        if hasattr(self, "_layout"):
            return self._layout
        device = PDFPageAggregatorWithMarkedContent(
            self.pdf.rsrcmgr,
            pageno=self.page_number,
            laparams=self.pdf.laparams,
        )
        interpreter = PDFPageInterpreter(self.pdf.rsrcmgr, device)
        try:
            interpreter.process_page(self.page_obj)
        except Exception as e:
            # Chain explicitly so the original error is reported as the cause.
            raise PdfminerException(e) from e
        self._layout: LTPage = device.get_result()
        return self._layout

    @property
    def annots(self) -> T_obj_list:
        """Return the page's annotations as parsed object dictionaries.

        Each dictionary carries the annotation's position, its decoded
        ``uri``/``title``/``contents`` (``None`` when absent or undecodable)
        and the raw annotation under ``data``. On a ``CroppedPage`` the list
        is passed through the page's crop function.

        Raises:
            UnicodeDecodeError: If a text field decodes as neither UTF-8 nor
                UTF-16 and ``self.pdf.raise_unicode_errors`` is set.
        """

        def rotate_point(pt: Tuple[float, float], r: int) -> Tuple[float, float]:
            # Apply one quarter-turn transform for each 90 degrees in `r`.
            turns = r // 90
            for i in range(turns):
                x, y = pt
                comp = self.width if i == turns % 2 else self.height
                pt = (y, (comp - x))
            return pt

        def parse(annot: T_obj) -> T_obj:
            _a, _b, _c, _d = annot["Rect"]
            pt0 = rotate_point((_a, _b), self.rotation)
            pt1 = rotate_point((_c, _d), self.rotation)
            rh = self.root_page.height
            x0, top, x1, bottom = _invert_box(_normalize_box((*pt0, *pt1)), rh)

            a = annot.get("A", {})
            extras = {
                "uri": a.get("URI"),
                "title": annot.get("T"),
                "contents": annot.get("Contents"),
            }
            for k, v in extras.items():
                if v is not None:
                    try:
                        extras[k] = v.decode("utf-8")
                    except UnicodeDecodeError:
                        try:
                            extras[k] = v.decode("utf-16")
                        except UnicodeDecodeError:
                            if self.pdf.raise_unicode_errors:
                                raise
                            warn(
                                f"Could not decode {k} of annotation."
                                f" {k} will be missing."
                            )
                            # Make the value match the warning: report it as
                            # missing instead of leaving undecoded bytes in
                            # the result. The raw value is still reachable
                            # through parsed["data"].
                            extras[k] = None

            parsed = {
                "page_number": self.page_number,
                "object_type": "annot",
                "x0": x0,
                "y0": rh - bottom,
                "x1": x1,
                "y1": rh - top,
                "doctop": self.initial_doctop + top,
                "top": top,
                "bottom": bottom,
                "width": x1 - x0,
                "height": bottom - top,
            }
            parsed.update(extras)
            # Replace the indirect reference to the page dictionary
            # with a pointer to our actual page
            if "P" in annot:
                annot["P"] = self
            parsed["data"] = annot
            return parsed

        raw = resolve_all(self.page_obj.annots) or []
        parsed = list(map(parse, raw))
        if isinstance(self, CroppedPage):
            return self._crop_fn(parsed)
        else:
            return parsed

    @property
    def hyperlinks(self) -> T_obj_list:
        """Return the annotations whose ``uri`` is not ``None``."""
        return [a for a in self.annots if a["uri"] is not None]

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """Return parsed layout objects grouped by type, parsing on first use."""
        if hasattr(self, "_objects"):
            return self._objects
        self._objects: Dict[str, T_obj_list] = self.parse_objects()
        return self._objects

    def point2coord(self, pt: Tuple[T_num, T_num]) -> Tuple[T_num, T_num]:
        """Shift a point by the mediabox origin and flip its y value."""
        # See note below re. #1181 and mediabox-adjustment reversions
        return (self.mediabox[0] + pt[0], self.mediabox[1] + self.height - pt[1])

    def process_object(self, obj: LTItem) -> T_obj:
        """Convert a pdfminer layout object into a plain dictionary.

        Only attributes named in ``ALL_ATTRS`` are copied. ``object_type``
        (the class name without its ``LT`` prefix, lowercased) and
        ``page_number`` are always added; text, color, path and
        top/bottom/doctop entries are added for the object kinds that have
        them.
        """
        kind = re.sub(lt_pat, "", obj.__class__.__name__).lower()

        def process_attr(item: Tuple[str, Any]) -> Optional[Tuple[str, Any]]:
            k, v = item
            if k in ALL_ATTRS:
                res = resolve_all(v)
                return (k, res)
            else:
                return None

        attr = dict(filter(None, map(process_attr, obj.__dict__.items())))

        attr["object_type"] = kind
        attr["page_number"] = self.page_number

        for cs in ["ncs", "scs"]:
            # Note: As of pdfminer.six v20221105, that library only
            # exposes ncs for LTChars, and neither attribute for
            # other objects. Keeping this code here, though,
            # for ease of addition if color spaces become
            # more available via pdfminer.six
            if hasattr(obj, cs):
                attr[cs] = resolve_and_decode(getattr(obj, cs).name)

        if isinstance(obj, (LTChar, LTTextContainer)):
            text = obj.get_text()
            attr["text"] = (
                normalize_unicode(self.pdf.unicode_norm, text)
                if self.pdf.unicode_norm is not None
                else text
            )

        if isinstance(obj, LTChar):
            # pdfminer.six (at least as of v20221105) does not
            # directly expose .stroking_color and .non_stroking_color
            # for LTChar objects (unlike, e.g., LTRect objects).
            gs = obj.graphicstate
            attr["stroking_color"] = (
                gs.scolor if isinstance(gs.scolor, tuple) else (gs.scolor,)
            )
            attr["non_stroking_color"] = (
                gs.ncolor if isinstance(gs.ncolor, tuple) else (gs.ncolor,)
            )

            # Handle (rare) byte-encoded fontnames
            if isinstance(attr["fontname"], bytes):  # pragma: nocover
                attr["fontname"] = fix_fontname_bytes(attr["fontname"])

        elif isinstance(obj, (LTCurve,)):
            attr["pts"] = list(map(self.point2coord, attr["pts"]))

            # Ignoring typing because type signature for obj.original_path
            # appears to be incorrect
            attr["path"] = [(cmd, *map(self.point2coord, pts)) for cmd, *pts in obj.original_path]  # type: ignore  # noqa: E501

            attr["dash"] = obj.dashing_style

        # As noted in #1181, `pdfminer.six` adjusts objects'
        # coordinates relative to the MediaBox:
        # https://github.com/pdfminer/pdfminer.six/blob/1a8bd2f730295b31d6165e4d95fcb5a03793c978/pdfminer/converter.py#L79-L84
        mb_x0, mb_top = self.mediabox[:2]

        if "y0" in attr:
            attr["top"] = (self.height - attr["y1"]) + mb_top
            attr["bottom"] = (self.height - attr["y0"]) + mb_top
            attr["doctop"] = self.initial_doctop + attr["top"]

        if "x0" in attr and mb_x0 != 0:
            attr["x0"] = attr["x0"] + mb_x0
            attr["x1"] = attr["x1"] + mb_x0

        return attr

    def iter_layout_objects(
        self, layout_objects: List[LTComponent]
    ) -> Generator[T_obj, None, None]:
        """Yield processed dictionaries for layout objects, descending into containers."""
        for obj in layout_objects:
            # If object is, like LTFigure, a higher-level object ...
            if isinstance(obj, LTContainer):
                # and LAParams is passed, process the object itself.
                if self.pdf.laparams is not None:
                    yield self.process_object(obj)
                # Regardless, iterate through its children
                yield from self.iter_layout_objects(obj._objs)
            else:
                yield self.process_object(obj)

    def parse_objects(self) -> Dict[str, T_obj_list]:
        """Group the page's processed layout objects by ``object_type``.

        Objects whose type is ``"anno"`` are left out.
        """
        objects: Dict[str, T_obj_list] = {}
        for obj in self.iter_layout_objects(self.layout._objs):
            kind = obj["object_type"]
            if kind == "anno":
                continue
            objects.setdefault(kind, []).append(obj)
        return objects

    def debug_tablefinder(
        self, table_settings: Optional[T_table_settings] = None
    ) -> TableFinder:
        """Return the ``TableFinder`` built for this page and the given settings."""
        tset = TableSettings.resolve(table_settings)
        return TableFinder(self, tset)

    def find_tables(
        self, table_settings: Optional[T_table_settings] = None
    ) -> List[Table]:
        """Return every table the ``TableFinder`` detects on this page."""
        tset = TableSettings.resolve(table_settings)
        return TableFinder(self, tset).tables

    def find_table(
        self, table_settings: Optional[T_table_settings] = None
    ) -> Optional[Table]:
        """Return the table with the most cells, or ``None`` if there are no tables.

        Ties are broken by the smaller ``bbox[1]``, then the smaller
        ``bbox[0]``, then the order in which the tables were found.
        """
        tset = TableSettings.resolve(table_settings)
        tables = self.find_tables(tset)

        if not tables:
            return None

        # Return the largest table, as measured by number of cells.
        def sorter(x: Table) -> Tuple[int, T_num, T_num]:
            return (-len(x.cells), x.bbox[1], x.bbox[0])

        # min() returns the first minimal element, which is the same element
        # a stable sort would have placed first.
        return min(tables, key=sorter)

    def extract_tables(
        self, table_settings: Optional[T_table_settings] = None
    ) -> List[List[List[Optional[str]]]]:
        """Extract the contents of every table found on this page."""
        tset = TableSettings.resolve(table_settings)
        tables = self.find_tables(tset)
        return [table.extract(**(tset.text_settings or {})) for table in tables]

    def extract_table(
        self, table_settings: Optional[T_table_settings] = None
    ) -> Optional[List[List[Optional[str]]]]:
        """Extract the table chosen by ``find_table``, or ``None`` if there is none."""
        tset = TableSettings.resolve(table_settings)
        table = self.find_table(tset)
        if table is None:
            return None
        else:
            return table.extract(**(tset.text_settings or {}))

    def _get_textmap(self, **kwargs: Any) -> TextMap:
        """Build a ``TextMap`` from this page's chars.

        ``layout_bbox`` defaults to ``self.bbox``. ``layout_width`` and
        ``layout_height`` default to the page's width and height unless the
        corresponding ``*_chars`` keyword was supplied. Explicit keyword
        arguments override the defaults.
        """
        defaults: Dict[str, Any] = dict(
            layout_bbox=self.bbox,
        )
        if "layout_width_chars" not in kwargs:
            defaults.update({"layout_width": self.width})
        if "layout_height_chars" not in kwargs:
            defaults.update({"layout_height": self.height})
        full_kwargs: Dict[str, Any] = {**defaults, **kwargs}
        return utils.chars_to_textmap(self.chars, **full_kwargs)

    def search(
        self,
        pattern: Union[str, Pattern[str]],
        regex: bool = True,
        case: bool = True,
        main_group: int = 0,
        return_chars: bool = True,
        return_groups: bool = True,
        **kwargs: Any,
    ) -> List[Dict[str, Any]]:
        """Search the page's text map for ``pattern``.

        Extra keyword arguments select the text map; list values are turned
        into tuples before being handed to the cached ``get_textmap``.
        """
        textmap = self.get_textmap(**tuplify_list_kwargs(kwargs))
        return textmap.search(
            pattern,
            regex=regex,
            case=case,
            main_group=main_group,
            return_chars=return_chars,
            return_groups=return_groups,
        )

    def extract_text(self, **kwargs: Any) -> str:
        """Return the page's text map rendered as a string."""
        return self.get_textmap(**tuplify_list_kwargs(kwargs)).as_string

    def extract_text_simple(self, **kwargs: Any) -> str:
        """Return text via ``utils.extract_text_simple`` applied to the page's chars."""
        return utils.extract_text_simple(self.chars, **kwargs)

    def extract_words(self, **kwargs: Any) -> T_obj_list:
        """Return words via ``utils.extract_words`` applied to the page's chars."""
        return utils.extract_words(self.chars, **kwargs)

    def extract_text_lines(
        self, strip: bool = True, return_chars: bool = True, **kwargs: Any
    ) -> T_obj_list:
        """Return the text lines of the page's text map."""
        return self.get_textmap(**tuplify_list_kwargs(kwargs)).extract_text_lines(
            strip=strip, return_chars=return_chars
        )

    def crop(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """Return a ``CroppedPage`` of this page for ``bbox``."""
        return CroppedPage(self, bbox, relative=relative, strict=strict)

    def within_bbox(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """
        Same as .crop, except only includes objects fully within the bbox
        """
        return CroppedPage(
            self, bbox, relative=relative, strict=strict, crop_fn=utils.within_bbox
        )

    def outside_bbox(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """
        Same as .crop, except only includes objects fully outside the bbox
        """
        return CroppedPage(
            self, bbox, relative=relative, strict=strict, crop_fn=utils.outside_bbox
        )

    def filter(self, test_function: Callable[[T_obj], bool]) -> "FilteredPage":
        """Return a ``FilteredPage`` keeping only objects accepted by ``test_function``."""
        return FilteredPage(self, test_function)

    def dedupe_chars(self, **kwargs: Any) -> "FilteredPage":
        """
        Removes duplicate chars — those sharing the same text and positioning
        (within `tolerance`) as other characters in the set. Adjust extra_args
        to be more/less restrictive with the properties checked.
        """
        p = FilteredPage(self, lambda x: True)
        # Shallow copy, so replacing "char" below leaves self.objects untouched.
        p._objects = dict(self.objects)
        p._objects["char"] = utils.dedupe_chars(self.chars, **kwargs)
        return p

    def to_image(
        self,
        resolution: Optional[Union[int, float]] = None,
        width: Optional[Union[int, float]] = None,
        height: Optional[Union[int, float]] = None,
        antialias: bool = False,
        force_mediabox: bool = False,
    ) -> "PageImage":
        """
        You can pass a maximum of 1 of the following:
        - resolution: The desired number pixels per inch. Defaults to
          ``DEFAULT_RESOLUTION`` from ``.display``.
        - width: The desired image width in pixels.
        - height: The desired image height in pixels.

        Raises ``ValueError`` when more than one of them is provided.
        """
        from .display import DEFAULT_RESOLUTION, PageImage

        num_specs = sum(x is not None for x in [resolution, width, height])
        if num_specs > 1:
            raise ValueError(
                f"Only one of these arguments can be provided: resolution, width, height. You provided {num_specs}"  # noqa: E501
            )
        elif width is not None:
            resolution = 72 * width / self.width
        elif height is not None:
            resolution = 72 * height / self.height

        return PageImage(
            self,
            resolution=resolution or DEFAULT_RESOLUTION,
            antialias=antialias,
            force_mediabox=force_mediabox,
        )

    def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
        """Return page metadata plus the requested object lists as a dictionary.

        Args:
            object_types: Singular type names (e.g. ``"char"``); each is
                stored under its name plus ``"s"``. Defaults to every type
                in ``self.objects`` plus ``"annot"``.
        """
        if object_types is None:
            _object_types = list(self.objects.keys()) + ["annot"]
        else:
            _object_types = object_types
        d = {
            "page_number": self.page_number,
            "initial_doctop": self.initial_doctop,
            "rotation": self.rotation,
            "cropbox": self.cropbox,
            "mediabox": self.mediabox,
            "bbox": self.bbox,
            "width": self.width,
            "height": self.height,
        }
        for t in _object_types:
            d[t + "s"] = getattr(self, t + "s")
        return d

    def __repr__(self) -> str:
        return f"<Page:{self.page_number}>"
class DerivedPage(Page):
    """Base class for pages that are built from another page."""

    is_original: bool = False

    def __init__(self, parent_page: Page):
        """Link to ``parent_page`` and mirror its identifying attributes.

        ``Page.__init__`` is not called; the values are taken from the parent
        as they are. Subclasses are responsible for setting ``bbox``.
        """
        self.parent_page = parent_page

        # Attributes copied unchanged from the parent page.
        for inherited in (
            "root_page",
            "pdf",
            "page_obj",
            "page_number",
            "initial_doctop",
            "rotation",
            "mediabox",
            "cropbox",
        ):
            setattr(self, inherited, getattr(parent_page, inherited))

        self.flush_cache(Container.cached_properties)
        # Per-instance cache around the text-map builder.
        self.get_textmap = lru_cache()(self._get_textmap)
def test_proposed_bbox(bbox: T_bbox, parent_bbox: T_bbox) -> None:
    """Check that ``bbox`` is non-empty and lies fully inside ``parent_bbox``.

    Args:
        bbox: The proposed bounding box.
        parent_bbox: The bounding box it must fit within.

    Raises:
        ValueError: If ``bbox`` has zero area, does not overlap
            ``parent_bbox`` at all, or overlaps it only partially.
    """
    proposed_area = utils.calculate_area(bbox)
    if proposed_area == 0:
        raise ValueError(f"Bounding box {bbox} has an area of zero.")

    shared = utils.get_bbox_overlap(bbox, parent_bbox)
    if shared is None:
        raise ValueError(
            f"Bounding box {bbox} is entirely outside "
            f"parent page bounding box {parent_bbox}"
        )

    # An overlap smaller than the box itself means part of it sticks out.
    if utils.calculate_area(shared) < proposed_area:
        raise ValueError(
            f"Bounding box {bbox} is not fully within "
            f"parent page bounding box {parent_bbox}"
        )
class CroppedPage(DerivedPage):
    """A page whose objects are the parent's objects passed through a crop function."""

    def __init__(
        self,
        parent_page: Page,
        crop_bbox: T_bbox,
        crop_fn: Callable[[T_obj_list, T_bbox], T_obj_list] = utils.crop_to_bbox,
        relative: bool = False,
        strict: bool = True,
    ):
        """Create the cropped view.

        Args:
            parent_page: The page being cropped.
            crop_bbox: The bounding box handed to ``crop_fn``.
            crop_fn: Function applied to each object list together with the
                bounding box.
            relative: When true, ``crop_bbox`` is offset by the top-left
                corner of the parent's bbox before use.
            strict: When true, the (offset) box is validated with
                ``test_proposed_bbox``, which raises ``ValueError`` on failure.
        """
        if relative:
            dx, dy, _, _ = parent_page.bbox
            rel_x0, rel_top, rel_x1, rel_bottom = crop_bbox
            crop_bbox = (rel_x0 + dx, rel_top + dy, rel_x1 + dx, rel_bottom + dy)

        if strict:
            test_proposed_bbox(crop_bbox, parent_page.bbox)

        def _crop_fn(objs: T_obj_list) -> T_obj_list:
            # Bind the final bounding box so callers only pass the objects.
            return crop_fn(objs, crop_bbox)

        super().__init__(parent_page)

        self._crop_fn = _crop_fn

        # Note: testing for original function passed, not _crop_fn
        # When the region is excluded, the page keeps the parent's bbox.
        keeps_parent_bbox = crop_fn is utils.outside_bbox
        self.bbox = parent_page.bbox if keeps_parent_bbox else crop_bbox

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """Return the parent's objects, cropped per type; computed on first use."""
        if not hasattr(self, "_objects"):
            self._objects: Dict[str, T_obj_list] = {
                kind: self._crop_fn(objs)
                for kind, objs in self.parent_page.objects.items()
            }
        return self._objects
class FilteredPage(DerivedPage):
    """A page exposing only the parent's objects accepted by a filter function."""

    def __init__(self, parent_page: Page, filter_fn: Callable[[T_obj], bool]):
        """Store the filter and reuse the parent's bbox.

        Args:
            parent_page: The page whose objects are filtered.
            filter_fn: Called with each object; the object is kept when the
                result is truthy.
        """
        self.bbox = parent_page.bbox
        self.filter_fn = filter_fn
        super().__init__(parent_page)

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """Return the parent's objects, filtered per type; computed on first use."""
        if not hasattr(self, "_objects"):
            accept = self.filter_fn
            self._objects: Dict[str, T_obj_list] = {
                kind: list(filter(accept, objs))
                for kind, objs in self.parent_page.objects.items()
            }
        return self._objects