import numbers
import re
from functools import lru_cache
from typing import (
    TYPE_CHECKING,
    Any,
    Callable,
    Dict,
    Generator,
    List,
    Optional,
    Pattern,
    Tuple,
    Union,
)
from unicodedata import normalize as normalize_unicode
from warnings import warn

from pdfminer.converter import PDFPageAggregator
from pdfminer.layout import (
    LTChar,
    LTComponent,
    LTContainer,
    LTCurve,
    LTItem,
    LTPage,
    LTTextContainer,
)
from pdfminer.pdfinterp import PDFPageInterpreter, PDFStackT
from pdfminer.pdfpage import PDFPage
from pdfminer.psparser import PSLiteral

from . import utils
from ._typing import T_bbox, T_num, T_obj, T_obj_list
from .container import Container
from .structure import PDFStructTree, StructTreeMissing
from .table import T_table_settings, Table, TableFinder, TableSettings
from .utils import decode_text, resolve_all, resolve_and_decode
from .utils.exceptions import MalformedPDFException, PdfminerException
from .utils.text import TextMap

# Matches the "LT" prefix of pdfminer layout class names (LTChar, LTRect, ...).
lt_pat = re.compile(r"^LT")

# Names of layout-object attributes that are copied into parsed object dicts.
ALL_ATTRS = {
    "adv",
    "height",
    "linewidth",
    "pts",
    "size",
    "srcsize",
    "width",
    "x0",
    "x1",
    "y0",
    "y1",
    "bits",
    "matrix",
    "upright",
    "fontname",
    "text",
    "imagemask",
    "colorspace",
    "evenodd",
    "fill",
    "non_stroking_color",
    "stroke",
    "stroking_color",
    "stream",
    "name",
    "mcid",
    "tag",
}


if TYPE_CHECKING:  # pragma: nocover
    from .display import PageImage
    from .pdf import PDF

# via https://git.ghostscript.com/?p=mupdf.git;a=blob;f=source/pdf/pdf-font.c;h=6322cedf2c26cfb312c0c0878d7aff97b4c7470e;hb=HEAD#l774 # noqa

CP936_FONTNAMES = {
    b"\xcb\xce\xcc\xe5": "SimSun,Regular",
    b"\xba\xda\xcc\xe5": "SimHei,Regular",
    b"\xbf\xac\xcc\xe5_GB2312": "SimKai,Regular",
    b"\xb7\xc2\xcb\xce_GB2312": "SimFang,Regular",
    b"\xc1\xa5\xca\xe9": "SimLi,Regular",
}


def fix_fontname_bytes(fontname: bytes) -> str:
    """Convert a byte-encoded font name into a ``str``.

    The name is split after the first ``+`` (if any). The part after the
    ``+`` (or the whole name when there is none) is looked up in
    ``CP936_FONTNAMES``; when it is not a known key, the escaped ``repr``
    text of the bytes (without the ``b'...'`` wrapper) is used instead.
    The prefix is always rendered via that same escaped form.
    """
    if b"+" in fontname:
        split_at = fontname.index(b"+") + 1
        prefix, suffix = fontname[:split_at], fontname[split_at:]
    else:
        prefix, suffix = b"", fontname

    # str(b"abc") == "b'abc'"; slicing [2:-1] drops the b'...' wrapper.
    suffix_new = CP936_FONTNAMES.get(suffix, str(suffix)[2:-1])
    return str(prefix)[2:-1] + suffix_new


def tuplify_list_kwargs(kwargs: Dict[str, Any]) -> Dict[str, Any]:
    """Return a copy of ``kwargs`` with every ``list`` value turned into a tuple.

    Non-list values are passed through untouched. In this module the result
    is forwarded to the ``lru_cache``-wrapped ``get_textmap``.
    """
    return {
        key: (tuple(value) if isinstance(value, list) else value)
        for key, value in kwargs.items()
    }


class PDFPageAggregatorWithMarkedContent(PDFPageAggregator):
    """Extract layout from a specific page, adding marked-content IDs to
    objects where found."""

    # Marked-content ID and tag name currently in effect (None outside a tag).
    cur_mcid: Optional[int] = None
    cur_tag: Optional[str] = None

    def begin_tag(self, tag: PSLiteral, props: Optional[PDFStackT] = None) -> None:
        """Handle beginning of tag, setting current MCID if any."""
        self.cur_tag = decode_text(tag.name)
        if isinstance(props, dict) and "MCID" in props:
            self.cur_mcid = props["MCID"]
        else:
            self.cur_mcid = None

    def end_tag(self) -> None:
        """Handle end of tag, clearing current tag and MCID."""
        self.cur_tag = None
        self.cur_mcid = None

    def tag_cur_item(self) -> None:
        """Add current MCID to what we hope to be the most recent object created
        by pdfminer.six."""
        # This is somewhat hacky and would not be necessary if
        # pdfminer.six supported MCIDs.  In reading the code it's
        # clear that the `render_*` methods will only ever
        # create one object, but that is far from being guaranteed.
        # Even if pdfminer.six's API would just return the objects it
        # creates, we wouldn't have to do this.
        if self.cur_item._objs:
            cur_obj = self.cur_item._objs[-1]
            cur_obj.mcid = self.cur_mcid  # type: ignore
            cur_obj.tag = self.cur_tag  # type: ignore

    def render_char(self, *args, **kwargs) -> float:  # type: ignore
        """Hook for rendering characters, adding the `mcid` attribute."""
        adv = super().render_char(*args, **kwargs)
        self.tag_cur_item()
        return adv

    def render_image(self, *args, **kwargs) -> None:  # type: ignore
        """Hook for rendering images, adding the `mcid` attribute."""
        super().render_image(*args, **kwargs)
        self.tag_cur_item()

    def paint_path(self, *args, **kwargs) -> None:  # type: ignore
        """Hook for rendering lines and curves, adding the `mcid` attribute."""
        super().paint_path(*args, **kwargs)
        self.tag_cur_item()


def _normalize_box(box_raw: T_bbox, rotation: T_num = 0) -> T_bbox:
    """Return ``box_raw`` ordered as (min-x, min-y, max-x, max-y).

    When ``rotation`` is 90 or 270 the x and y values are swapped, giving
    ``(y0, x0, y1, x1)``.

    Raises:
        MalformedPDFException: if any coordinate is not a number.
    """
    # Per PDF Reference 3.8.4: "Note: Although rectangles are
    # conventionally specified by their lower-left and upper-right
    # corners, it is acceptable to specify any two diagonally opposite
    # corners."
    if not all(isinstance(x, numbers.Number) for x in box_raw):  # pragma: nocover
        raise MalformedPDFException(
            f"Bounding box contains non-number coordinate(s): {box_raw}"
        )
    x0, x1 = sorted((box_raw[0], box_raw[2]))
    y0, y1 = sorted((box_raw[1], box_raw[3]))
    if rotation in [90, 270]:
        return (y0, x0, y1, x1)
    else:
        return (x0, y0, x1, y1)


# PDFs coordinate spaces refer to an origin in the bottom-left of the
# page; pdfplumber flips this vertically, so that the origin is in the
# top-left.
def _invert_box(box_raw: T_bbox, mb_height: T_num) -> T_bbox:
    """Flip a box vertically within a page of height ``mb_height``.

    Returns ``(x0, mb_height - y1, x1, mb_height - y0)``, i.e. the vertical
    coordinates are re-measured from the opposite edge.
    """
    x0, y0, x1, y1 = box_raw
    return (x0, mb_height - y1, x1, mb_height - y0)


class Page(Container):
    """A single PDF page: its boxes, rotation, parsed objects and the
    text/table extraction helpers built on top of them."""

    cached_properties: List[str] = Container.cached_properties + ["_layout"]
    is_original: bool = True
    pages = None

    def __init__(
        self,
        pdf: "PDF",
        page_obj: PDFPage,
        page_number: int,
        initial_doctop: T_num = 0,
    ):
        """Read rotation and page boxes from ``page_obj`` and set up the
        per-instance text-map cache.

        Args:
            pdf: The owning PDF object.
            page_obj: The pdfminer page whose ``attrs`` are read here.
            page_number: Stored as ``self.page_number``.
            initial_doctop: Offset added to each object's ``top`` to
                compute its ``doctop``.
        """
        self.pdf = pdf
        self.root_page = self
        self.page_obj = page_obj
        self.page_number = page_number
        self.initial_doctop = initial_doctop

        def get_attr(key: str, default: Any = None) -> Any:
            # Resolve the page attribute, substituting `default` when absent.
            value = resolve_all(page_obj.attrs.get(key))
            return default if value is None else value

        # Per PDF Reference Table 3.27: "The number of degrees by which the
        # page should be rotated clockwise when displayed or printed. The value
        # must be a multiple of 90. Default value: 0"
        _rotation = get_attr("Rotate", 0)
        self.rotation = _rotation % 360

        mb_raw = _normalize_box(get_attr("MediaBox"), self.rotation)
        mb_height = mb_raw[3] - mb_raw[1]

        self.mediabox = _invert_box(mb_raw, mb_height)

        # Each optional box becomes a lower-cased attribute (e.g. `cropbox`).
        for box_name in ["CropBox", "TrimBox", "BleedBox", "ArtBox"]:
            if box_name in page_obj.attrs:
                box_normalized = _invert_box(
                    _normalize_box(get_attr(box_name), self.rotation), mb_height
                )
                setattr(self, box_name.lower(), box_normalized)

        if "CropBox" not in page_obj.attrs:
            self.cropbox = self.mediabox

        # Page.bbox defaults to self.mediabox, but can be altered by Page.crop(...)
        self.bbox = self.mediabox

        # See https://rednafi.com/python/lru_cache_on_methods/
        self.get_textmap = lru_cache()(self._get_textmap)

    def close(self) -> None:
        """Flush cached properties and clear the text-map cache."""
        self.flush_cache()
        self.get_textmap.cache_clear()

    @property
    def width(self) -> T_num:
        """Horizontal extent of ``self.bbox``."""
        return self.bbox[2] - self.bbox[0]

    @property
    def height(self) -> T_num:
        """Vertical extent of ``self.bbox``."""
        return self.bbox[3] - self.bbox[1]

    @property
    def structure_tree(self) -> List[Dict[str, Any]]:
        """Return the structure tree for a page, if any."""
        try:
            return [elem.to_dict() for elem in PDFStructTree(self.pdf, self)]
        except StructTreeMissing:
            return []

    @property
    def layout(self) -> LTPage:
        """Return the pdfminer layout for this page, computing it on first use.

        Raises:
            PdfminerException: wrapping any exception raised while the
                page is being interpreted.
        """
        if hasattr(self, "_layout"):
            return self._layout
        device = PDFPageAggregatorWithMarkedContent(
            self.pdf.rsrcmgr,
            pageno=self.page_number,
            laparams=self.pdf.laparams,
        )
        interpreter = PDFPageInterpreter(self.pdf.rsrcmgr, device)
        try:
            interpreter.process_page(self.page_obj)
        except Exception as e:
            # Chain explicitly so the original failure is kept as __cause__.
            raise PdfminerException(e) from e
        self._layout: LTPage = device.get_result()
        return self._layout

    @property
    def annots(self) -> T_obj_list:
        """Return the page's annotations as a list of dicts.

        Each dict carries position keys, the decoded ``uri`` / ``title`` /
        ``contents`` values, and the raw annotation under ``data``. On a
        ``CroppedPage`` the list is passed through the page's crop function.
        """

        def rotate_point(pt: Tuple[float, float], r: int) -> Tuple[float, float]:
            # Apply one transform per 90 degrees of rotation.
            turns = r // 90
            for i in range(turns):
                x, y = pt
                comp = self.width if i == turns % 2 else self.height
                pt = (y, (comp - x))
            return pt

        def parse(annot: T_obj) -> T_obj:
            _a, _b, _c, _d = annot["Rect"]
            pt0 = rotate_point((_a, _b), self.rotation)
            pt1 = rotate_point((_c, _d), self.rotation)
            rh = self.root_page.height
            x0, top, x1, bottom = _invert_box(_normalize_box((*pt0, *pt1)), rh)

            a = annot.get("A", {})
            extras = {
                "uri": a.get("URI"),
                "title": annot.get("T"),
                "contents": annot.get("Contents"),
            }
            # Try UTF-8 first, then UTF-16.
            for k, v in extras.items():
                if v is not None:
                    try:
                        extras[k] = v.decode("utf-8")
                    except UnicodeDecodeError:
                        try:
                            extras[k] = v.decode("utf-16")
                        except UnicodeDecodeError:
                            if self.pdf.raise_unicode_errors:
                                raise
                            # NOTE(review): extras[k] is not reassigned on
                            # this path, so the undecoded value stays in the
                            # result although the warning says it will be
                            # missing -- confirm this is intended.
                            warn(
                                f"Could not decode {k} of annotation."
                                f" {k} will be missing."
                            )

            parsed = {
                "page_number": self.page_number,
                "object_type": "annot",
                "x0": x0,
                "y0": rh - bottom,
                "x1": x1,
                "y1": rh - top,
                "doctop": self.initial_doctop + top,
                "top": top,
                "bottom": bottom,
                "width": x1 - x0,
                "height": bottom - top,
            }
            parsed.update(extras)
            # Replace the indirect reference to the page dictionary
            # with a pointer to our actual page
            if "P" in annot:
                annot["P"] = self
            parsed["data"] = annot
            return parsed

        raw = resolve_all(self.page_obj.annots) or []
        parsed = list(map(parse, raw))
        if isinstance(self, CroppedPage):
            return self._crop_fn(parsed)
        else:
            return parsed

    @property
    def hyperlinks(self) -> T_obj_list:
        """Annotations whose ``uri`` value is not ``None``."""
        return [a for a in self.annots if a["uri"] is not None]

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """Parsed objects grouped by ``object_type``, computed once and cached."""
        if hasattr(self, "_objects"):
            return self._objects
        self._objects: Dict[str, T_obj_list] = self.parse_objects()
        return self._objects

    def point2coord(self, pt: Tuple[T_num, T_num]) -> Tuple[T_num, T_num]:
        """Shift ``pt`` by the mediabox origin and flip its vertical coordinate
        within the page height."""
        # See note below re. #1181 and mediabox-adjustment reversions
        return (self.mediabox[0] + pt[0], self.mediabox[1] + self.height - pt[1])

    def process_object(self, obj: LTItem) -> T_obj:
        """Convert one pdfminer layout object into a plain dict.

        Only attributes named in ``ALL_ATTRS`` are copied. The result also
        gets ``object_type`` and ``page_number``, and, where the source
        attributes exist, text, colors, path data and the top-left-origin
        ``top`` / ``bottom`` / ``doctop`` coordinates.
        """
        kind = re.sub(lt_pat, "", obj.__class__.__name__).lower()

        attr: Dict[str, Any] = {
            k: resolve_all(v) for k, v in obj.__dict__.items() if k in ALL_ATTRS
        }

        attr["object_type"] = kind
        attr["page_number"] = self.page_number

        for cs in ["ncs", "scs"]:
            # Note: As of pdfminer.six v20221105, that library only
            # exposes ncs for LTChars, and neither attribute for
            # other objects. Keeping this code here, though,
            # for ease of addition if color spaces become
            # more available via pdfminer.six
            if hasattr(obj, cs):
                attr[cs] = resolve_and_decode(getattr(obj, cs).name)

        if isinstance(obj, (LTChar, LTTextContainer)):
            text = obj.get_text()
            attr["text"] = (
                normalize_unicode(self.pdf.unicode_norm, text)
                if self.pdf.unicode_norm is not None
                else text
            )

        if isinstance(obj, LTChar):
            # pdfminer.six (at least as of v20221105) does not
            # directly expose .stroking_color and .non_stroking_color
            # for LTChar objects (unlike, e.g., LTRect objects).
            gs = obj.graphicstate
            attr["stroking_color"] = (
                gs.scolor if isinstance(gs.scolor, tuple) else (gs.scolor,)
            )
            attr["non_stroking_color"] = (
                gs.ncolor if isinstance(gs.ncolor, tuple) else (gs.ncolor,)
            )

            # Handle (rare) byte-encoded fontnames
            if isinstance(attr["fontname"], bytes):  # pragma: nocover
                attr["fontname"] = fix_fontname_bytes(attr["fontname"])

        elif isinstance(obj, (LTCurve,)):
            attr["pts"] = list(map(self.point2coord, attr["pts"]))

            # Ignoring typing because type signature for obj.original_path
            # appears to be incorrect
            attr["path"] = [
                (cmd, *map(self.point2coord, pts))
                for cmd, *pts in obj.original_path  # type: ignore
            ]

            attr["dash"] = obj.dashing_style

        # As noted in #1181, `pdfminer.six` adjusts objects'
        # coordinates relative to the MediaBox:
        # https://github.com/pdfminer/pdfminer.six/blob/1a8bd2f730295b31d6165e4d95fcb5a03793c978/pdfminer/converter.py#L79-L84
        mb_x0, mb_top = self.mediabox[:2]

        if "y0" in attr:
            attr["top"] = (self.height - attr["y1"]) + mb_top
            attr["bottom"] = (self.height - attr["y0"]) + mb_top
            attr["doctop"] = self.initial_doctop + attr["top"]

        if "x0" in attr and mb_x0 != 0:
            attr["x0"] = attr["x0"] + mb_x0
            attr["x1"] = attr["x1"] + mb_x0

        return attr

    def iter_layout_objects(
        self, layout_objects: List[LTComponent]
    ) -> Generator[T_obj, None, None]:
        """Yield processed dicts for ``layout_objects``, recursing into
        containers."""
        for obj in layout_objects:
            # If object is, like LTFigure, a higher-level object ...
            if isinstance(obj, LTContainer):
                # and LAParams is passed, process the object itself.
                if self.pdf.laparams is not None:
                    yield self.process_object(obj)
                # Regardless, iterate through its children
                yield from self.iter_layout_objects(obj._objs)
            else:
                yield self.process_object(obj)

    def parse_objects(self) -> Dict[str, T_obj_list]:
        """Group every processed layout object by its ``object_type``,
        skipping objects of type ``"anno"``."""
        objects: Dict[str, T_obj_list] = {}
        for obj in self.iter_layout_objects(self.layout._objs):
            kind = obj["object_type"]
            if kind in ["anno"]:
                continue
            objects.setdefault(kind, []).append(obj)
        return objects

    def debug_tablefinder(
        self, table_settings: Optional[T_table_settings] = None
    ) -> TableFinder:
        """Return the ``TableFinder`` built for this page and settings."""
        tset = TableSettings.resolve(table_settings)
        return TableFinder(self, tset)

    def find_tables(
        self, table_settings: Optional[T_table_settings] = None
    ) -> List[Table]:
        """Return all tables found on the page with the given settings."""
        tset = TableSettings.resolve(table_settings)
        return TableFinder(self, tset).tables

    def find_table(
        self, table_settings: Optional[T_table_settings] = None
    ) -> Optional[Table]:
        """Return the table with the most cells, or ``None`` if none is found.

        Ties are broken by the smaller ``bbox[1]``, then the smaller
        ``bbox[0]``.
        """
        tset = TableSettings.resolve(table_settings)
        tables = self.find_tables(tset)

        if not tables:
            return None

        # Return the largest table, as measured by number of cells.
        def sorter(x: Table) -> Tuple[int, T_num, T_num]:
            return (-len(x.cells), x.bbox[1], x.bbox[0])

        # min() returns the first minimal element, which is exactly what
        # sorting (stable) and taking index 0 would give.
        return min(tables, key=sorter)

    def extract_tables(
        self, table_settings: Optional[T_table_settings] = None
    ) -> List[List[List[Optional[str]]]]:
        """Extract the cell text of every table found on the page."""
        tset = TableSettings.resolve(table_settings)
        tables = self.find_tables(tset)
        return [table.extract(**(tset.text_settings or {})) for table in tables]

    def extract_table(
        self, table_settings: Optional[T_table_settings] = None
    ) -> Optional[List[List[Optional[str]]]]:
        """Extract the cell text of the table chosen by ``find_table``, or
        return ``None`` when there is no table."""
        tset = TableSettings.resolve(table_settings)
        table = self.find_table(tset)
        if table is None:
            return None
        else:
            return table.extract(**(tset.text_settings or {}))

    def _get_textmap(self, **kwargs: Any) -> TextMap:
        """Build a ``TextMap`` from the page's chars.

        ``layout_bbox`` defaults to ``self.bbox``; ``layout_width`` and
        ``layout_height`` default to the page dimensions unless the
        corresponding ``*_chars`` keyword is supplied. Explicit ``kwargs``
        override the defaults.
        """
        defaults: Dict[str, Any] = dict(
            layout_bbox=self.bbox,
        )
        if "layout_width_chars" not in kwargs:
            defaults.update({"layout_width": self.width})
        if "layout_height_chars" not in kwargs:
            defaults.update({"layout_height": self.height})
        full_kwargs: Dict[str, Any] = {**defaults, **kwargs}
        return utils.chars_to_textmap(self.chars, **full_kwargs)

    def search(
        self,
        pattern: Union[str, Pattern[str]],
        regex: bool = True,
        case: bool = True,
        main_group: int = 0,
        return_chars: bool = True,
        return_groups: bool = True,
        **kwargs: Any,
    ) -> List[Dict[str, Any]]:
        """Search the page's text map for ``pattern``.

        Extra ``kwargs`` select the text map (list values are converted to
        tuples first); the named options are forwarded to ``TextMap.search``.
        """
        textmap = self.get_textmap(**tuplify_list_kwargs(kwargs))
        return textmap.search(
            pattern,
            regex=regex,
            case=case,
            main_group=main_group,
            return_chars=return_chars,
            return_groups=return_groups,
        )

    def extract_text(self, **kwargs: Any) -> str:
        """Return the page text as produced by the (cached) text map."""
        return self.get_textmap(**tuplify_list_kwargs(kwargs)).as_string

    def extract_text_simple(self, **kwargs: Any) -> str:
        """Return page text via ``utils.extract_text_simple`` on the chars."""
        return utils.extract_text_simple(self.chars, **kwargs)

    def extract_words(self, **kwargs: Any) -> T_obj_list:
        """Return words via ``utils.extract_words`` on the page's chars."""
        return utils.extract_words(self.chars, **kwargs)

    def extract_text_lines(
        self, strip: bool = True, return_chars: bool = True, **kwargs: Any
    ) -> T_obj_list:
        """Return text lines from the (cached) text map selected by ``kwargs``."""
        return self.get_textmap(**tuplify_list_kwargs(kwargs)).extract_text_lines(
            strip=strip, return_chars=return_chars
        )

    def crop(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """Return a ``CroppedPage`` for ``bbox`` using the default crop function."""
        return CroppedPage(self, bbox, relative=relative, strict=strict)

    def within_bbox(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """
        Same as .crop, except only includes objects fully within the bbox
        """
        return CroppedPage(
            self, bbox, relative=relative, strict=strict, crop_fn=utils.within_bbox
        )

    def outside_bbox(
        self, bbox: T_bbox, relative: bool = False, strict: bool = True
    ) -> "CroppedPage":
        """
        Same as .crop, except only includes objects fully outside the bbox
        """
        return CroppedPage(
            self, bbox, relative=relative, strict=strict, crop_fn=utils.outside_bbox
        )

    def filter(self, test_function: Callable[[T_obj], bool]) -> "FilteredPage":
        """Return a ``FilteredPage`` keeping objects for which
        ``test_function`` is truthy."""
        return FilteredPage(self, test_function)

    def dedupe_chars(self, **kwargs: Any) -> "FilteredPage":
        """
        Removes duplicate chars — those sharing the same text and positioning
        (within `tolerance`) as other characters in the set. Adjust extra_args
        to be more/less restrictive with the properties checked.
        """
        p = FilteredPage(self, lambda x: True)
        # Shallow-copy the mapping so replacing "char" does not touch our own.
        p._objects = dict(self.objects)
        p._objects["char"] = utils.dedupe_chars(self.chars, **kwargs)
        return p

    def to_image(
        self,
        resolution: Optional[Union[int, float]] = None,
        width: Optional[Union[int, float]] = None,
        height: Optional[Union[int, float]] = None,
        antialias: bool = False,
        force_mediabox: bool = False,
    ) -> "PageImage":
        """
        You can pass a maximum of 1 of the following:
        - resolution: The desired number of pixels per inch. Falls back to
          `DEFAULT_RESOLUTION` from `.display` when not given.
        - width: The desired image width in pixels.
        - height: The desired image height in pixels.

        Raises:
            ValueError: if more than one of the three is provided.
        """
        from .display import DEFAULT_RESOLUTION, PageImage

        num_specs = sum(x is not None for x in [resolution, width, height])
        if num_specs > 1:
            raise ValueError(
                f"Only one of these arguments can be provided: resolution, width, height. You provided {num_specs}"  # noqa: E501
            )
        elif width is not None:
            resolution = 72 * width / self.width
        elif height is not None:
            resolution = 72 * height / self.height

        return PageImage(
            self,
            resolution=resolution or DEFAULT_RESOLUTION,
            antialias=antialias,
            force_mediabox=force_mediabox,
        )

    def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
        """Return page metadata plus one ``<type>s`` entry per object type.

        When ``object_types`` is ``None``, every key of ``self.objects`` plus
        ``"annot"`` is used.
        """
        if object_types is None:
            _object_types = list(self.objects.keys()) + ["annot"]
        else:
            _object_types = object_types
        d = {
            "page_number": self.page_number,
            "initial_doctop": self.initial_doctop,
            "rotation": self.rotation,
            "cropbox": self.cropbox,
            "mediabox": self.mediabox,
            "bbox": self.bbox,
            "width": self.width,
            "height": self.height,
        }
        for t in _object_types:
            d[t + "s"] = getattr(self, t + "s")
        return d

    def __repr__(self) -> str:
        """Identify the page by its number, e.g. ``<Page:1>``."""
        return f"<Page:{self.page_number}>"


class DerivedPage(Page):
    """A page view derived from another page; copies the parent's identity
    and box attributes instead of re-reading them from the PDF."""

    is_original: bool = False

    def __init__(self, parent_page: Page):
        """Copy shared attributes from ``parent_page`` and reset caches."""
        self.parent_page = parent_page
        self.root_page = parent_page.root_page
        self.pdf = parent_page.pdf
        self.page_obj = parent_page.page_obj
        self.page_number = parent_page.page_number
        self.initial_doctop = parent_page.initial_doctop
        self.rotation = parent_page.rotation
        self.mediabox = parent_page.mediabox
        self.cropbox = parent_page.cropbox
        self.flush_cache(Container.cached_properties)
        self.get_textmap = lru_cache()(self._get_textmap)


def test_proposed_bbox(bbox: T_bbox, parent_bbox: T_bbox) -> None:
    """Validate that ``bbox`` is a non-empty box lying fully inside
    ``parent_bbox``.

    Raises:
        ValueError: if ``bbox`` has zero area, does not overlap
            ``parent_bbox`` at all, or overlaps it only partially.
    """
    bbox_area = utils.calculate_area(bbox)
    if bbox_area == 0:
        raise ValueError(f"Bounding box {bbox} has an area of zero.")

    overlap = utils.get_bbox_overlap(bbox, parent_bbox)
    if overlap is None:
        raise ValueError(
            f"Bounding box {bbox} is entirely outside "
            f"parent page bounding box {parent_bbox}"
        )

    overlap_area = utils.calculate_area(overlap)
    if overlap_area < bbox_area:
        raise ValueError(
            f"Bounding box {bbox} is not fully within "
            f"parent page bounding box {parent_bbox}"
        )


class CroppedPage(DerivedPage):
    """A derived page whose objects are the parent's objects passed through
    a crop function bound to a bounding box."""

    def __init__(
        self,
        parent_page: Page,
        crop_bbox: T_bbox,
        crop_fn: Callable[[T_obj_list, T_bbox], T_obj_list] = utils.crop_to_bbox,
        relative: bool = False,
        strict: bool = True,
    ):
        """
        Args:
            parent_page: The page being cropped.
            crop_bbox: The box to crop to.
            crop_fn: Called as ``crop_fn(objects, crop_bbox)``.
            relative: If true, ``crop_bbox`` is offset by the parent bbox's
                top-left corner before use.
            strict: If true, the box is validated with
                ``test_proposed_bbox`` against the parent's bbox.
        """
        if relative:
            o_x0, o_top, _, _ = parent_page.bbox
            x0, top, x1, bottom = crop_bbox
            crop_bbox = (x0 + o_x0, top + o_top, x1 + o_x0, bottom + o_top)

        if strict:
            test_proposed_bbox(crop_bbox, parent_page.bbox)

        def _crop_fn(objs: T_obj_list) -> T_obj_list:
            return crop_fn(objs, crop_bbox)

        super().__init__(parent_page)

        self._crop_fn = _crop_fn

        # Note: testing for original function passed, not _crop_fn
        if crop_fn is utils.outside_bbox:
            self.bbox = parent_page.bbox
        else:
            self.bbox = crop_bbox

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """The parent's objects, per type, after applying the crop function
        (computed once and cached)."""
        if hasattr(self, "_objects"):
            return self._objects
        self._objects: Dict[str, T_obj_list] = {
            k: self._crop_fn(v) for k, v in self.parent_page.objects.items()
        }
        return self._objects


class FilteredPage(DerivedPage):
    """A derived page keeping only the parent's objects accepted by
    ``filter_fn``."""

    def __init__(self, parent_page: Page, filter_fn: Callable[[T_obj], bool]):
        """Store the parent's bbox and ``filter_fn``, then initialise the
        derived page."""
        self.bbox = parent_page.bbox
        self.filter_fn = filter_fn
        super().__init__(parent_page)

    @property
    def objects(self) -> Dict[str, T_obj_list]:
        """The parent's objects, per type, filtered by ``filter_fn``
        (computed once and cached)."""
        if hasattr(self, "_objects"):
            return self._objects
        self._objects: Dict[str, T_obj_list] = {
            k: list(filter(self.filter_fn, v))
            for k, v in self.parent_page.objects.items()
        }
        return self._objects