205 lines
6.8 KiB
Python
205 lines
6.8 KiB
Python
import itertools
|
|
import logging
|
|
import pathlib
|
|
from io import BufferedReader, BytesIO
|
|
from types import TracebackType
|
|
from typing import Any, Dict, Generator, List, Literal, Optional, Tuple, Type, Union
|
|
|
|
from pdfminer.layout import LAParams
|
|
from pdfminer.pdfdocument import PDFDocument
|
|
from pdfminer.pdfinterp import PDFResourceManager
|
|
from pdfminer.pdfpage import PDFPage
|
|
from pdfminer.pdfparser import PDFParser
|
|
|
|
from ._typing import T_num, T_obj_list
|
|
from .container import Container
|
|
from .page import Page
|
|
from .repair import T_repair_setting, _repair
|
|
from .structure import PDFStructTree, StructTreeMissing
|
|
from .utils import resolve_and_decode
|
|
from .utils.exceptions import PdfminerException
|
|
|
|
logger = logging.getLogger(__name__)
|
|
|
|
|
|
class PDF(Container):
    """In-memory representation of an opened PDF document.

    Wraps a pdfminer ``PDFDocument`` built from a binary stream and exposes
    lazily-computed, cached views of the document: pages, layout objects,
    annotations, hyperlinks, structure tree, and resolved metadata.
    """

    # Include the lazily-built page list in the set of cached attributes
    # cleared by Container.flush_cache().
    cached_properties: List[str] = Container.cached_properties + ["_pages"]
|
|
|
|
def __init__(
|
|
self,
|
|
stream: Union[BufferedReader, BytesIO],
|
|
stream_is_external: bool = False,
|
|
path: Optional[pathlib.Path] = None,
|
|
pages: Optional[Union[List[int], Tuple[int]]] = None,
|
|
laparams: Optional[Dict[str, Any]] = None,
|
|
password: Optional[str] = None,
|
|
strict_metadata: bool = False,
|
|
unicode_norm: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
|
|
raise_unicode_errors: bool = True,
|
|
):
|
|
self.stream = stream
|
|
self.stream_is_external = stream_is_external
|
|
self.path = path
|
|
self.pages_to_parse = pages
|
|
self.laparams = None if laparams is None else LAParams(**laparams)
|
|
self.password = password
|
|
self.unicode_norm = unicode_norm
|
|
self.raise_unicode_errors = raise_unicode_errors
|
|
|
|
try:
|
|
self.doc = PDFDocument(PDFParser(stream), password=password or "")
|
|
except Exception as e:
|
|
raise PdfminerException(e)
|
|
self.rsrcmgr = PDFResourceManager()
|
|
self.metadata = {}
|
|
|
|
for info in self.doc.info:
|
|
self.metadata.update(info)
|
|
for k, v in self.metadata.items():
|
|
try:
|
|
self.metadata[k] = resolve_and_decode(v)
|
|
except Exception as e: # pragma: nocover
|
|
if strict_metadata:
|
|
# Raise an exception since unable to resolve the metadata value.
|
|
raise
|
|
# This metadata value could not be parsed. Instead of failing the PDF
|
|
# read, treat it as a warning only if `strict_metadata=False`.
|
|
logger.warning(
|
|
f'[WARNING] Metadata key "{k}" could not be parsed due to '
|
|
f"exception: {str(e)}"
|
|
)
|
|
|
|
@classmethod
|
|
def open(
|
|
cls,
|
|
path_or_fp: Union[str, pathlib.Path, BufferedReader, BytesIO],
|
|
pages: Optional[Union[List[int], Tuple[int]]] = None,
|
|
laparams: Optional[Dict[str, Any]] = None,
|
|
password: Optional[str] = None,
|
|
strict_metadata: bool = False,
|
|
unicode_norm: Optional[Literal["NFC", "NFKC", "NFD", "NFKD"]] = None,
|
|
repair: bool = False,
|
|
gs_path: Optional[Union[str, pathlib.Path]] = None,
|
|
repair_setting: T_repair_setting = "default",
|
|
raise_unicode_errors: bool = True,
|
|
) -> "PDF":
|
|
|
|
stream: Union[BufferedReader, BytesIO]
|
|
|
|
if repair:
|
|
stream = _repair(
|
|
path_or_fp, password=password, gs_path=gs_path, setting=repair_setting
|
|
)
|
|
stream_is_external = False
|
|
# Although the original file has a path,
|
|
# the repaired version does not
|
|
path = None
|
|
elif isinstance(path_or_fp, (str, pathlib.Path)):
|
|
stream = open(path_or_fp, "rb")
|
|
stream_is_external = False
|
|
path = pathlib.Path(path_or_fp)
|
|
else:
|
|
stream = path_or_fp
|
|
stream_is_external = True
|
|
path = None
|
|
|
|
try:
|
|
return cls(
|
|
stream,
|
|
path=path,
|
|
pages=pages,
|
|
laparams=laparams,
|
|
password=password,
|
|
strict_metadata=strict_metadata,
|
|
unicode_norm=unicode_norm,
|
|
stream_is_external=stream_is_external,
|
|
raise_unicode_errors=raise_unicode_errors,
|
|
)
|
|
|
|
except PdfminerException:
|
|
if not stream_is_external:
|
|
stream.close()
|
|
raise
|
|
|
|
def close(self) -> None:
|
|
self.flush_cache()
|
|
|
|
for page in self.pages:
|
|
page.close()
|
|
|
|
if not self.stream_is_external:
|
|
self.stream.close()
|
|
|
|
    def __enter__(self) -> "PDF":
        """Enable ``with PDF.open(...) as pdf:`` usage; returns the PDF itself."""
        return self
|
|
|
|
    def __exit__(
        self,
        t: Optional[Type[BaseException]],
        value: Optional[BaseException],
        traceback: Optional[TracebackType],
    ) -> None:
        """Close the document on context-manager exit; exceptions propagate
        (nothing is returned, so no exception is suppressed)."""
        self.close()
|
|
|
|
@property
|
|
def pages(self) -> List[Page]:
|
|
if hasattr(self, "_pages"):
|
|
return self._pages
|
|
|
|
doctop: T_num = 0
|
|
pp = self.pages_to_parse
|
|
self._pages: List[Page] = []
|
|
|
|
def iter_pages() -> Generator[PDFPage, None, None]:
|
|
gen = PDFPage.create_pages(self.doc)
|
|
while True:
|
|
try:
|
|
yield next(gen)
|
|
except StopIteration:
|
|
break
|
|
except Exception as e:
|
|
raise PdfminerException(e)
|
|
|
|
for i, page in enumerate(iter_pages()):
|
|
page_number = i + 1
|
|
if pp is not None and page_number not in pp:
|
|
continue
|
|
p = Page(self, page, page_number=page_number, initial_doctop=doctop)
|
|
self._pages.append(p)
|
|
doctop += p.height
|
|
return self._pages
|
|
|
|
@property
|
|
def objects(self) -> Dict[str, T_obj_list]:
|
|
if hasattr(self, "_objects"):
|
|
return self._objects
|
|
all_objects: Dict[str, T_obj_list] = {}
|
|
for p in self.pages:
|
|
for kind in p.objects.keys():
|
|
all_objects[kind] = all_objects.get(kind, []) + p.objects[kind]
|
|
self._objects: Dict[str, T_obj_list] = all_objects
|
|
return self._objects
|
|
|
|
@property
|
|
def annots(self) -> List[Dict[str, Any]]:
|
|
gen = (p.annots for p in self.pages)
|
|
return list(itertools.chain(*gen))
|
|
|
|
@property
|
|
def hyperlinks(self) -> List[Dict[str, Any]]:
|
|
gen = (p.hyperlinks for p in self.pages)
|
|
return list(itertools.chain(*gen))
|
|
|
|
@property
|
|
def structure_tree(self) -> List[Dict[str, Any]]:
|
|
"""Return the structure tree for the document."""
|
|
try:
|
|
return [elem.to_dict() for elem in PDFStructTree(self)]
|
|
except StructTreeMissing:
|
|
return []
|
|
|
|
def to_dict(self, object_types: Optional[List[str]] = None) -> Dict[str, Any]:
|
|
return {
|
|
"metadata": self.metadata,
|
|
"pages": [page.to_dict(object_types) for page in self.pages],
|
|
}
|