"""Debug helper: convert a sample PDF to Markdown and print a preview of the result."""

import sys
import traceback
from itertools import islice
from pathlib import Path

# Ensure project root is on sys.path so package imports work when running this script directly
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from markdownconverter.core.core import convert_pdf_to_markdown


# Sample document converted when no PDF path is given on the command line.
DEFAULT_PDF = Path(
    r"c:\src\____GitProjects\SXXXXXXX_MarkdownConverter\doc\ICD_DECD_FTH - GRIFO-F_TH, Data Exchange Control Document for, rev -A, Draft 2.pdf"
)

# For testing, limit to the first pages to speed up debugging.
PAGE_LIMIT = 5

# Number of Markdown lines echoed to the console after a conversion.
PREVIEW_LINES = 200


def resolve_pdf(args):
    """Return the PDF to convert.

    Args:
        args: Command-line arguments without the program name.

    Returns:
        ``Path(args[0])`` when at least one argument is given, otherwise
        ``DEFAULT_PDF``.
    """
    return Path(args[0]) if args else DEFAULT_PDF


def read_preview(markdown_path, max_lines=PREVIEW_LINES):
    """Return the first *max_lines* lines of a UTF-8 text file as one string.

    Only the requested lines are read, so a large Markdown file is not loaded
    into memory just to show its beginning.
    """
    with open(markdown_path, "r", encoding="utf-8") as f:
        return "".join(islice(f, max_lines))


def count_images(folder):
    """Return the number of entries directly inside *folder* (0 if it is missing).

    Every entry matched by ``glob("*")`` is counted, not only image files.
    """
    folder = Path(folder)
    if not folder.exists():
        return 0
    return sum(1 for _ in folder.glob("*"))


def main(argv=None):
    """Convert one PDF to Markdown, print a preview, and report extracted images.

    The Markdown file is written next to the PDF (same name, ``.md`` suffix)
    and images go to a sibling ``<pdf stem>_images`` folder.

    Args:
        argv: Command-line arguments without the program name. ``None`` means
            ``sys.argv[1:]``. The first argument, when present, is the PDF to
            convert; otherwise ``DEFAULT_PDF`` is used.

    Returns:
        0 on success, 1 if any step raised. On failure the error message and
        traceback are printed before returning.
    """
    pdf = resolve_pdf(sys.argv[1:] if argv is None else list(argv))
    md_out = pdf.with_suffix(".md")
    img_folder = pdf.parent / f"{pdf.stem}_images"

    print("Input PDF:", pdf)
    print("Output MD:", md_out)
    print("Image folder:", img_folder)

    try:
        result = convert_pdf_to_markdown(
            str(pdf),
            str(md_out),
            extract_images=True,
            image_folder=str(img_folder),
            page_limit=PAGE_LIMIT,
        )
        print("\nConversion completed. Markdown saved to:", result)

        # Print preview.
        # NOTE(review): assumes the converter returns the path of the written
        # Markdown file — confirm against convert_pdf_to_markdown.
        preview = read_preview(result, PREVIEW_LINES)
        print(f"\n--- Markdown preview (first {PREVIEW_LINES} lines) ---\n")
        print(preview)

        # Count images
        print(f"\nImages extracted: {count_images(img_folder)}")
    except Exception as e:
        # Top-level boundary of a debug script: report everything, then signal
        # the failure through the exit status instead of exiting with 0.
        print("Conversion failed:", e)
        traceback.print_exc()
        return 1
    return 0


if __name__ == "__main__":
    sys.exit(main())