eptm_dashboard/.venv/lib/python3.12/site-packages/pypdfium2_cli/extract_text.py

45 lines
1.3 KiB
Python

# SPDX-FileCopyrightText: 2026 geisserml <geisserml@gmail.com>
# SPDX-License-Identifier: Apache-2.0 OR BSD-3-Clause
from pypdfium2_cli._parsers import add_input, get_input
# Identifiers of the supported text extraction strategies. They double as the
# accepted values of the --strategy command-line option (see attach()), and
# main() maps them to get_text_range() and get_text_bounded() respectively.
EXTRACT_RANGE = "range"
EXTRACT_BOUNDED = "bounded"
# __main__.py hook
# Description text for this subcommand. NOTE(review): presumably consumed by
# __main__.py when building the subparser - the consumer is not visible here.
PARSER_DESC = """\
Note that PDFium outputs CRLF (\\r\\n) style line breaks.
This may be undesirable or confusing in some situations, e.g. when processing the output with an (unaware) parser on the command line.
If this is an issue, run e.g. `dos2unix` on the output, or use the Python API.\
"""
def attach(parser):
    """Register this subcommand's command-line arguments on *parser*.

    Calls ``add_input`` with ``pages=True`` for the shared input arguments,
    then adds the ``--strategy`` option, which is restricted to the two
    known extraction strategies and defaults to ``EXTRACT_RANGE``.
    """
    add_input(parser, pages=True)

    # Collect the option settings first so the add_argument call stays short.
    strategy_settings = {
        "default": EXTRACT_RANGE,
        "choices": (EXTRACT_RANGE, EXTRACT_BOUNDED),
        "help": "PDFium text extraction strategy (range, bounded).",
    }
    parser.add_argument("--strategy", **strategy_settings)
def main(args):
    """Print the extracted text of each selected page to stdout.

    For every index in ``args.pages``, a text page is obtained from the
    input document and its text is extracted with the method selected by
    ``args.strategy``. Each page is printed under a ``# Page N`` heading
    (N is the index plus one); every heading after the first is preceded
    by an extra newline.

    Args:
        args: Parsed command-line namespace. This function reads
            ``args.pages`` and ``args.strategy``, and passes ``args`` on to
            ``get_input`` to obtain the document.

    Raises:
        AssertionError: If ``args.strategy`` is not one of the known
            strategies.
    """
    pdf = get_input(args)
    sep = ""
    for i in args.pages:
        page = pdf[i]
        textpage = page.get_textpage()
        # TODO let caller pass in possible range/boundary parameters
        if args.strategy == EXTRACT_RANGE:
            text = textpage.get_text_range()
        elif args.strategy == EXTRACT_BOUNDED:
            text = textpage.get_text_bounded()
        else:
            # Raise explicitly instead of `assert False`: assert statements
            # are removed under `python -O`, which would leave `text` unbound
            # (or stale from the previous page) rather than failing loudly.
            raise AssertionError(
                f"Unhandled text extraction strategy: {args.strategy!r}"
            )
        print(sep + f"# Page {i+1}\n" + text)
        sep = "\n"