#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Convert PDF to Microsoft Word (.docx).

Uses the pdf2docx library (layout-aware; works best on text-based PDFs).
Scanned PDFs may come through mostly as images inside the Word file.

    pip install pdf2docx

Usage:
    python pdf_to_word.py report.pdf
    python pdf_to_word.py report.pdf -o out.docx
    python pdf_to_word.py *.pdf
"""

from __future__ import annotations

import argparse
import sys
from pathlib import Path


def convert_one(pdf_path: Path | str, output_path: Path | str | None) -> Path:
    """Convert a single PDF file to a .docx file.

    Args:
        pdf_path: Path of the input PDF. Resolved to an absolute path.
        output_path: Destination path, or ``None`` to write next to the
            input using the input's name with a ``.docx`` suffix. Any other
            suffix on an explicit destination is replaced with ``.docx``.

    Returns:
        The absolute path of the written .docx file.

    Raises:
        FileNotFoundError: If ``pdf_path`` is not an existing regular file.
        ValueError: If the output path would be the input file itself.
        SystemExit: If the ``pdf2docx`` package is not installed.
    """
    pdf_path = Path(pdf_path).resolve()
    if not pdf_path.is_file():
        raise FileNotFoundError(f"Input PDF not found: {pdf_path}")

    if output_path is None:
        out = pdf_path.with_suffix(".docx")
    else:
        out = Path(output_path).resolve()
        if out.suffix.lower() != ".docx":
            out = out.with_suffix(".docx")

    # An input that already ends in ".docx" would otherwise be overwritten
    # by its own conversion output.
    if out == pdf_path:
        raise ValueError(f"Output path is the same as the input file: {out}")

    # Imported lazily so argument and path errors are reported even when the
    # optional dependency is missing.
    try:
        from pdf2docx import Converter
    except ImportError as e:
        raise SystemExit(
            "Missing dependency. Run: pip install pdf2docx\n" + str(e)
        ) from e

    print(f"Converting: {pdf_path.name} -> {out.name}")
    cv = Converter(str(pdf_path))
    try:
        cv.convert(str(out))
    finally:
        # Always release the converter, even if the conversion fails.
        cv.close()
    print(f"  OK: {out}")
    return out


def main(argv: list[str] | None = None) -> None:
    """Parse command-line arguments and convert every given PDF.

    Args:
        argv: Argument list to parse; ``None`` (the default) makes argparse
            read ``sys.argv[1:]``.

    Exits with status 2 when ``-o/--output`` is combined with several inputs,
    and with status 1 when at least one conversion failed. All inputs are
    attempted before exiting, so one bad file does not abort a batch.
    """
    ap = argparse.ArgumentParser(
        description="Convert PDF file(s) to Word (.docx).",
    )
    ap.add_argument(
        "pdf",
        nargs="+",
        type=Path,
        help="Input PDF file(s)",
    )
    ap.add_argument(
        "-o",
        "--output",
        type=Path,
        default=None,
        help="Output .docx (only when converting a single input PDF)",
    )
    args = ap.parse_args(argv)

    inputs = args.pdf
    if len(inputs) > 1 and args.output is not None:
        print(
            "ERROR: -o / --output can only be used with a single PDF.",
            file=sys.stderr,
        )
        sys.exit(2)

    # Only pass an explicit output for the single-file case.
    explicit_out = args.output if len(inputs) == 1 else None

    failures = 0
    for pdf in inputs:
        try:
            convert_one(pdf, explicit_out)
        except FileNotFoundError as e:
            print(f"ERROR: {e}", file=sys.stderr)
            failures += 1
        except Exception as e:
            # Top-level boundary: report the failure and move on to the
            # next input instead of abandoning the rest of the batch.
            print(f"ERROR ({pdf}): {e}", file=sys.stderr)
            failures += 1

    if failures:
        print(
            f"Finished with errors: {failures} of {len(inputs)} file(s) failed.",
            file=sys.stderr,
        )
        sys.exit(1)

    print("Done.")


if __name__ == "__main__":
    main()