eptm_dashboard/.venv/lib/python3.12/site-packages/pdfplumber/cli.py

128 lines
3.8 KiB
Python

#!/usr/bin/env python
import argparse
import json
import sys
from collections import defaultdict, deque
from itertools import chain
from typing import Any, DefaultDict, Dict, List
from .pdf import PDF
# Invoked with no command-line arguments at all: act as if --help was given.
# This has to happen at import time, ahead of the definition of main(),
# because main()'s default argument list is sliced from sys.argv when that
# function is defined.
if len(sys.argv) == 1:
    sys.argv.extend(["--help"])
def parse_page_spec(p_str: str) -> List[int]:
    """Expand one ``--pages`` token into the page numbers it names.

    Args:
        p_str: Either a single integer (``"4"``) or an inclusive
            ``start-end`` range (``"2-5"``).

    Returns:
        ``[n]`` for a single number, or every integer from ``start``
        through ``end`` (both included) for a range.

    Raises:
        ValueError: If a component is not an integer, if the token does not
            split into exactly two parts around ``-``, or if the range runs
            backwards (``end < start``).
    """
    if "-" not in p_str:
        return [int(p_str)]

    # Unpacking enforces exactly two components; int() rejects non-numbers.
    start, end = map(int, p_str.split("-"))
    if end < start:
        # range() would quietly produce no pages for a reversed range, so a
        # typo would select nothing without any diagnostic. Raise the same
        # exception type that malformed numbers already produce so argparse
        # (which uses this function as a type= converter) reports it.
        raise ValueError(
            f"invalid page range {p_str!r}: end page precedes start page"
        )
    return list(range(start, end + 1))
def parse_args(args_raw: List[str]) -> argparse.Namespace:
    """Build the ``pdfplumber`` argument parser and parse *args_raw*.

    Args:
        args_raw: Command-line tokens, without the program name.

    Returns:
        The parsed namespace. When ``--pages`` was supplied, ``pages`` is a
        single flat list of ints combining every page spec given; otherwise
        it is ``None``.
    """
    cli = argparse.ArgumentParser("pdfplumber")
    cli.add_argument("infile", nargs="?", type=argparse.FileType("rb"))

    # --structure and --structure-text cannot be combined with each other.
    ignored_note = (
        "All other arguments except --pages, --laparams, and --indent "
        "will be ignored"
    )
    structure_flags = (
        ("--structure", "Write the structure tree as JSON. "),
        (
            "--structure-text",
            "Write the structure tree as JSON including text contents. ",
        ),
    )
    exclusive = cli.add_mutually_exclusive_group()
    for flag, summary in structure_flags:
        exclusive.add_argument(
            flag, help=summary + ignored_note, action="store_true"
        )

    cli.add_argument(
        "--format", choices=["csv", "json", "text"], default="csv"
    )
    cli.add_argument("--types", nargs="+")

    attr_filters = (
        ("--include-attrs", "Include *only* these object attributes in output."),
        ("--exclude-attrs", "Exclude these object attributes from output."),
    )
    for flag, description in attr_filters:
        cli.add_argument(flag, nargs="+", help=description)

    # --laparams takes a JSON document on the command line.
    cli.add_argument("--laparams", type=json.loads)
    cli.add_argument("--precision", type=int)
    cli.add_argument("--pages", nargs="+", type=parse_page_spec)
    cli.add_argument(
        "--indent", type=int, help="Indent level for JSON pretty-printing."
    )

    parsed = cli.parse_args(args_raw)
    if parsed.pages is not None:
        # Each --pages token was converted to its own list; merge them.
        parsed.pages = list(chain.from_iterable(parsed.pages))
    return parsed
def add_text_to_mcids(pdf: PDF, data: List[Dict[str, Any]]) -> None:
    """Attach a ``"text"`` list to structure elements that carry ``"mcids"``.

    The characters of every page in ``pdf.pages`` are grouped by their
    ``"mcid"`` value (characters without one are skipped) and concatenated
    in the order they appear in ``page.chars``. The tree in *data* is then
    walked breadth-first; each element that has both a ``"page_number"``
    and an ``"mcids"`` key receives ``el["text"]``, one string per mcid.
    An mcid with no characters on that page yields ``""``.

    Args:
        pdf: Object exposing ``pages``, each with ``page_number`` and
            ``chars`` (a sequence of dicts with ``"text"`` and, optionally,
            ``"mcid"``).
        data: Top-level structure elements; nested elements are reached
            through their ``"children"`` lists. Modified in place.
    """
    # page number -> mcid -> accumulated text
    text_by_page: DefaultDict[int, Any] = defaultdict(lambda: defaultdict(str))
    for page in pdf.pages:
        page_text = text_by_page[page.page_number]
        for char in page.chars:
            mcid = char.get("mcid")
            if mcid is not None:
                page_text[mcid] += char["text"]

    pending = deque(data)
    while pending:
        node = pending.popleft()
        # Queue descendants first so they are visited even when this node
        # has no page number of its own.
        pending.extend(node.get("children", ()))
        page_number = node.get("page_number")
        if page_number is not None and "mcids" in node:
            page_text = text_by_page[page_number]
            node["text"] = [page_text[mcid] for mcid in node["mcids"]]
def main(args_raw: List[str] = sys.argv[1:]) -> None:
    """Run the command-line interface.

    Parses *args_raw*, opens the input with ``PDF.open`` (restricted to the
    requested pages and using the given layout parameters), and emits one
    of the following, checked in this order:

    * ``--structure``: the structure tree as JSON on stdout;
    * ``--structure-text``: the structure tree with per-mcid text added,
      as JSON on stdout with non-ASCII characters left unescaped;
    * ``--format csv`` (the default): ``pdf.to_csv`` with ``sys.stdout``;
    * ``--format text``: each page's layout-preserving text, printed;
    * otherwise (``json``): ``pdf.to_json`` with ``sys.stdout``.

    Args:
        args_raw: Command-line tokens without the program name. The default
            is taken from ``sys.argv`` when this function is defined, not
            when it is called.
    """
    opts = parse_args(args_raw)

    # Keyword options common to the CSV and JSON exporters.
    export_options = {
        "precision": opts.precision,
        "include_attrs": opts.include_attrs,
        "exclude_attrs": opts.exclude_attrs,
    }

    with PDF.open(opts.infile, pages=opts.pages, laparams=opts.laparams) as pdf:
        if opts.structure:
            structure_json = json.dumps(pdf.structure_tree, indent=opts.indent)
            print(structure_json)
        elif opts.structure_text:
            structure = pdf.structure_tree
            add_text_to_mcids(pdf, structure)
            print(json.dumps(structure, indent=opts.indent, ensure_ascii=False))
        elif opts.format == "csv":
            pdf.to_csv(sys.stdout, opts.types, **export_options)
        elif opts.format == "text":
            for page in pdf.pages:
                page_text = page.extract_text(layout=True)
                print(page_text)
        else:
            pdf.to_json(
                sys.stdout, opts.types, indent=opts.indent, **export_options
            )
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()