SXXXXXXX_MarkdownConverter/tools/convert_test_pdf.py
2025-12-03 10:07:33 +01:00

45 lines
1.4 KiB
Python

import sys
from pathlib import Path
# Ensure project root is on sys.path so package imports work when running this script directly
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
from markdownconverter.core.core import convert_pdf_to_markdown
# Sample document used by this manual conversion check (absolute path on the
# developer's Windows machine).
pdf = Path(
    r"c:\src\____GitProjects\SXXXXXXX_MarkdownConverter\doc\ICD_DECD_FTH - GRIFO-F_TH, Data Exchange Control Document for, rev -A, Draft 2.pdf"
)

# Both outputs sit next to the PDF: the Markdown file swaps the extension and
# the image folder is named "<pdf stem>_images".
md_out = pdf.with_suffix(".md")
img_folder = pdf.with_name(f"{pdf.stem}_images")

# Echo the resolved locations before any conversion work starts.
for _label, _location in (
    ("Input PDF:", pdf),
    ("Output MD:", md_out),
    ("Image folder:", img_folder),
):
    print(_label, _location)
# Helper imports are kept local to this section of the script, next to the
# code that uses them.
from itertools import islice
import traceback

# Manual smoke test: convert the sample PDF, print the beginning of the
# generated Markdown, and report how many entries ended up in the image folder.
try:
    # For testing, limit to first 5 pages to speed up debugging
    result = convert_pdf_to_markdown(
        str(pdf),
        str(md_out),
        extract_images=True,
        image_folder=str(img_folder),
        page_limit=5,
    )
    # The returned value is used below as the path of the Markdown file.
    print("\nConversion completed. Markdown saved to:", result)

    # Print preview. islice stops after 200 lines, so a large output file is
    # never loaded into memory in full just to show its beginning.
    with open(result, "r", encoding="utf-8") as f:
        preview = "".join(islice(f, 200))
    print("\n--- Markdown preview (first 200 lines) ---\n")
    print(preview)

    # Count images: every entry directly inside the image folder is counted;
    # a folder that was never created is reported as 0.
    if img_folder.exists():
        img_count = sum(1 for _ in img_folder.glob("*"))
    else:
        img_count = 0
    print(f"\nImages extracted: {img_count}")
except Exception as e:
    # Top-level boundary of a debugging script: report the failure with its
    # full traceback instead of letting the interpreter abort.
    print("Conversion failed:", e)
    traceback.print_exc()