SXXXXXXX_MarkdownConverter/markdownconverter/gui/pdf_to_markdown.py
2025-12-03 10:07:33 +01:00

372 lines
13 KiB
Python
Raw Blame History

This file contains invisible Unicode characters

This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

# markdownconverter/gui/pdf_to_markdown.py
"""
PDF to Markdown converter tab.
Allows users to convert PDF files to Markdown format with optional image extraction.
"""
import os
import sys
import subprocess
import threading
import tkinter as tk
from pathlib import Path
from tkinter import filedialog, messagebox, StringVar, BooleanVar
import ttkbootstrap as tb
from tkinter.scrolledtext import ScrolledText
from ttkbootstrap.constants import *
from ..core.core import convert_pdf_to_markdown
from ..utils.logger import get_logger
from ..utils.error_handler import handle_conversion_error
# Module-level logger obtained from the project's logger helper, keyed by
# this module's import name.
log = get_logger(__name__)
class PdfToMarkdownTab(tb.Frame):
    """
    Tab for converting a PDF file to Markdown.

    The tab lets the user pick a source PDF, a destination Markdown file and,
    optionally, a folder for extracted images. The conversion itself is
    delegated to ``convert_pdf_to_markdown`` and runs in a daemon thread so the
    GUI stays responsive; every widget update triggered by the worker is
    marshalled back to the Tk main loop through ``after()``.
    """

    # Horizontal rule written to the log area around each conversion run.
    _SEPARATOR = "=" * 60

    def __init__(self, parent):
        """Create the tab inside *parent* and build all of its widgets."""
        super().__init__(parent, padding=10)
        self.pdf_path = StringVar()
        self.output_path = StringVar()
        self.extract_images = BooleanVar(value=True)
        self.image_folder = StringVar()
        self._build_ui()

    def _build_ui(self):
        """Build the user interface of the PDF→Markdown tab."""
        # Source PDF selection.
        input_frame = tb.Labelframe(self, text="File PDF Sorgente", padding=10)
        input_frame.pack(fill=tk.X, pady=(0, 10))
        input_frame.columnconfigure(1, weight=1)
        tb.Label(input_frame, text="File PDF:").grid(
            row=0, column=0, padx=5, pady=5, sticky="w"
        )
        tb.Entry(input_frame, textvariable=self.pdf_path).grid(
            row=0, column=1, padx=5, pady=5, sticky="ew"
        )
        tb.Button(
            input_frame,
            text="Sfoglia... 📁",
            command=self._choose_pdf,
            bootstyle=PRIMARY,
        ).grid(row=0, column=2, padx=5, pady=5)

        # Destination Markdown file.
        output_frame = tb.Labelframe(
            self, text="File Markdown di Destinazione", padding=10
        )
        output_frame.pack(fill=tk.X, pady=(0, 10))
        output_frame.columnconfigure(1, weight=1)
        tb.Label(output_frame, text="File Markdown:").grid(
            row=0, column=0, padx=5, pady=5, sticky="w"
        )
        tb.Entry(output_frame, textvariable=self.output_path).grid(
            row=0, column=1, padx=5, pady=5, sticky="ew"
        )
        tb.Button(
            output_frame,
            text="Specifica... 🖊️",
            command=self._choose_output,
            bootstyle=SECONDARY,
        ).grid(row=0, column=2, padx=5, pady=5)
        tb.Button(
            output_frame,
            text="Open Folder 📂",
            command=self._open_output_folder,
            bootstyle=INFO,
        ).grid(row=0, column=3, padx=5, pady=5)

        # Conversion options.
        options_frame = tb.Labelframe(self, text="Opzioni di Conversione", padding=10)
        options_frame.pack(fill=tk.X, pady=(0, 10))
        options_frame.columnconfigure(1, weight=1)

        # Toggle for image extraction; it also drives the folder controls below.
        tb.Checkbutton(
            options_frame,
            text="Estrai immagini dal PDF 🖼️",
            variable=self.extract_images,
            command=self._toggle_image_options,
            bootstyle="primary-round-toggle",
        ).grid(row=0, column=0, padx=5, pady=5, sticky="w")

        # Optional image folder.
        tb.Label(options_frame, text="Cartella immagini:").grid(
            row=1, column=0, padx=5, pady=5, sticky="w"
        )
        self.image_folder_entry = tb.Entry(
            options_frame, textvariable=self.image_folder
        )
        self.image_folder_entry.grid(row=1, column=1, padx=5, pady=5, sticky="ew")
        self.image_folder_button = tb.Button(
            options_frame, text="Seleziona... 📁", command=self._choose_image_folder
        )
        self.image_folder_button.grid(row=1, column=2, padx=5, pady=5)
        tb.Label(
            options_frame,
            text="(Lascia vuoto per usare la cartella di default)",
            bootstyle=SECONDARY,
            font=("TkDefaultFont", 8),
        ).grid(row=2, column=1, padx=5, pady=(0, 5), sticky="w")

        # Informational note.
        info_frame = tb.Frame(self)
        info_frame.pack(fill=tk.X, pady=(0, 10))
        info_text = (
            " Nota: La conversione PDF→Markdown preserva la struttura del testo, "
            "i titoli e la formattazione di base (grassetto, corsivo). "
            "La qualità dipende dalla struttura del PDF originale."
        )
        tb.Label(
            info_frame, text=info_text, wraplength=800, bootstyle=INFO, padding=10
        ).pack(fill=tk.X)

        # Convert button; kept as an attribute so it can be disabled while a
        # conversion is running (prevents starting two conversions at once).
        self.convert_button = tb.Button(
            self,
            text="Converti PDF → Markdown 🔁",
            command=self._convert,
            bootstyle=SUCCESS,
            width=30,
        )
        self.convert_button.pack(pady=10)

        # Progress bar.
        progress_frame = tb.Frame(self)
        progress_frame.pack(fill=tk.X, pady=(0, 10))
        self.progress_var = tk.IntVar(value=0)
        self.progress = tb.Progressbar(
            progress_frame, bootstyle="info", variable=self.progress_var, length=400
        )
        self.progress.pack(fill=tk.X, padx=10)

        # Log area (read-only; only _log() writes to it).
        log_frame = tb.Labelframe(self, text="Log Conversione", padding=10)
        log_frame.pack(fill=tk.BOTH, expand=True)
        self.log_box = ScrolledText(
            log_frame, height=12, state="disabled", wrap=tk.WORD
        )
        self.log_box.pack(fill=tk.BOTH, expand=True)

        # Bring the image-folder controls in line with the checkbox default.
        self._toggle_image_options()

    def _log(self, text):
        """Append *text* as a new line to the log area and scroll to the end."""
        # The widget is kept read-only; unlock it only for the insertion.
        self.log_box.configure(state="normal")
        self.log_box.insert(tk.END, text + "\n")
        self.log_box.configure(state="disabled")
        self.log_box.see(tk.END)
        self.update_idletasks()

    def _set_ui_state(self, enabled: bool):
        """
        Switch the tab between its idle and busy states.

        Args:
            enabled: ``True`` to re-enable the controls after a conversion,
                ``False`` to disable them while a conversion is running.
        """
        try:
            self.convert_button.configure(state="normal" if enabled else "disabled")
            if enabled:
                # Restore the image controls according to the checkbox rather
                # than unconditionally enabling them.
                self._toggle_image_options()
            else:
                self.image_folder_entry.configure(state="disabled")
                self.image_folder_button.configure(state="disabled")
        except tk.TclError:
            # Widgets may already be destroyed (e.g. window closed mid-run).
            log.error("Impossibile aggiornare lo stato della UI", exc_info=True)
        # The log box is intentionally not touched: it must stay read-only.

    def _run_on_ui_thread(self, callback):
        """
        Schedule *callback* on the Tk main loop.

        Used by the worker thread. If scheduling fails (typically because the
        window no longer exists) the callback is dropped and the failure is
        logged: calling Tk from the worker thread directly is not safe.
        """
        try:
            self.after(0, callback)
        except (RuntimeError, tk.TclError):
            log.error("Impossibile pianificare l'aggiornamento della UI", exc_info=True)

    def _on_progress(self, percent: int, message: str = ""):
        """
        Progress callback passed to the converter.

        Args:
            percent: Completion percentage to show in the progress bar.
            message: Optional status text; when non-empty it is logged as
                ``"<percent>% - <message>"``.
        """

        def _update():
            try:
                self.progress_var.set(int(percent))
            except (TypeError, ValueError, tk.TclError):
                # A malformed percent must not prevent the message from
                # being logged.
                pass
            if message:
                self._log(f"{percent}% - {message}")

        # Called from the background thread: marshal to the main thread.
        self._run_on_ui_thread(_update)

    def _choose_pdf(self):
        """Open the file dialog for selecting the source PDF."""
        file = filedialog.askopenfilename(
            title="Seleziona il file PDF",
            filetypes=[("PDF Files", "*.pdf"), ("All Files", "*.*")],
        )
        if not file:
            return
        self.pdf_path.set(file)
        self._log(f"PDF selezionato: {os.path.basename(file)}")
        # Suggest an output path next to the PDF, unless one is already set.
        if not self.output_path.get():
            self.output_path.set(str(Path(file).with_suffix(".md")))

    def _choose_output(self):
        """Open the save dialog for choosing the output Markdown file."""
        file = filedialog.asksaveasfilename(
            title="Specifica il file Markdown di output",
            defaultextension=".md",
            filetypes=[("Markdown Files", "*.md"), ("All Files", "*.*")],
        )
        if file:
            self.output_path.set(file)
            self._log(f"Output impostato: {os.path.basename(file)}")

    def _toggle_image_options(self):
        """Enable or disable the image-folder controls to match the checkbox."""
        state = "normal" if self.extract_images.get() else "disabled"
        self.image_folder_entry.configure(state=state)
        self.image_folder_button.configure(state=state)

    def _choose_image_folder(self):
        """Open the directory dialog for selecting the image folder."""
        folder = filedialog.askdirectory(title="Seleziona la cartella per le immagini")
        if folder:
            self.image_folder.set(folder)
            self._log(f"Cartella immagini: {folder}")

    def _convert(self):
        """
        Validate the inputs and start the PDF→Markdown conversion.

        Shows a warning/error dialog and returns early when the inputs are
        incomplete, the PDF does not exist, or the user declines to overwrite
        an existing output file. Otherwise the controls are disabled and the
        conversion runs in a daemon thread; completion or failure is reported
        back on the Tk main loop.
        """
        pdf_file = self.pdf_path.get().strip()
        output_file = self.output_path.get().strip()
        extract_imgs = self.extract_images.get()
        img_folder = self.image_folder.get().strip() or None

        # Input validation.
        if not pdf_file:
            messagebox.showwarning("Attenzione", "Seleziona un file PDF da convertire.")
            return
        if not output_file:
            messagebox.showwarning(
                "Attenzione", "Specifica il percorso del file Markdown di output."
            )
            return
        if not os.path.exists(pdf_file):
            messagebox.showerror("Errore", f"Il file PDF non esiste:\n{pdf_file}")
            return
        # Ask before overwriting an existing file.
        if os.path.exists(output_file) and not messagebox.askyesno(
            "Conferma Sovrascrittura",
            f"Il file esiste già:\n{output_file}\n\nSovrascrivere?",
        ):
            return

        self._log(f"\n{self._SEPARATOR}")
        self._log("INIZIO CONVERSIONE PDF → MARKDOWN")
        self._log(self._SEPARATOR)
        self._log(f"File PDF: {os.path.basename(pdf_file)}")
        self._log(f"Output: {os.path.basename(output_file)}")
        self._log(f"Estrazione immagini: {'Sì' if extract_imgs else 'No'}")

        # We are on the main thread here, so the busy state can be applied
        # directly before the worker starts.
        self._set_ui_state(False)

        def _run_conversion():
            try:
                result_path = convert_pdf_to_markdown(
                    pdf_file,
                    output_file,
                    extract_images=extract_imgs,
                    image_folder=img_folder,
                    progress_callback=self._on_progress,
                )
            except Exception as exc:
                # Python unbinds 'exc' when this clause ends, but the handler
                # runs later on the main loop; keep the exception alive under
                # a separate name so the deferred callback can still use it.
                error = exc
                self._run_on_ui_thread(lambda: self._on_conversion_error(error))
            else:
                self._run_on_ui_thread(
                    lambda: self._on_conversion_success(
                        result_path, output_file, extract_imgs, img_folder
                    )
                )

        threading.Thread(target=_run_conversion, daemon=True).start()

    def _on_conversion_success(self, result_path, output_file, extract_imgs, img_folder):
        """Log and announce a finished conversion, then restore the idle UI."""
        try:
            self._log("\n✅ Conversione completata con successo!")
            self._log(f"File Markdown: {os.path.basename(result_path)}")
            if extract_imgs:
                if img_folder:
                    img_path = Path(img_folder)
                else:
                    # Presumably mirrors the converter's default image folder
                    # ("<output stem>_images" next to the output file).
                    out = Path(output_file)
                    img_path = out.parent / f"{out.stem}_images"
                if img_path.exists():
                    img_count = sum(1 for _ in img_path.glob("*.*"))
                    self._log(f"Immagini estratte: {img_count}")
            self._log(self._SEPARATOR)
            self._log("CONVERSIONE COMPLETATA")
            self._log(f"{self._SEPARATOR}\n")
            try:
                messagebox.showinfo(
                    "Successo",
                    f"Conversione completata!\n\nFile Markdown creato:\n{os.path.basename(result_path)}",
                )
            except tk.TclError:
                # The dialog is informational only; the log already has the result.
                log.error("Impossibile mostrare il messaggio di successo", exc_info=True)
        finally:
            self._reset_after_conversion()

    def _on_conversion_error(self, error):
        """Report a failed conversion through the shared error handler."""
        try:
            handle_conversion_error(error, log_callback=self._log, show_dialog=True)
        finally:
            # Always give the controls back, even if reporting itself fails.
            self._reset_after_conversion()

    def _reset_after_conversion(self):
        """Re-enable the controls and reset the progress bar to zero."""
        self._set_ui_state(True)
        try:
            self.progress_var.set(0)
        except tk.TclError:
            pass

    def _open_output_folder(self):
        """Open the folder containing the output Markdown file in the system file manager.

        Shows a warning when no output path is set and an error dialog when
        the folder does not exist or cannot be opened.
        """
        out = self.output_path.get().strip()
        if not out:
            messagebox.showwarning("Attenzione", "Nessun percorso di output impostato.")
            return
        folder = Path(out).parent
        if not folder.is_dir():
            # Without this check "open"/"xdg-open" fail silently (check=False).
            messagebox.showerror(
                "Errore", f"La cartella di output non esiste:\n{folder}"
            )
            return
        try:
            if sys.platform == "win32":
                os.startfile(folder)
            elif sys.platform == "darwin":
                subprocess.run(["open", str(folder)], check=False)
            else:
                subprocess.run(["xdg-open", str(folder)], check=False)
        except Exception as e:
            log.error(f"Impossibile aprire la cartella di output: {e}", exc_info=True)
            messagebox.showerror(
                "Errore", f"Impossibile aprire la cartella di output:\n{e}"
            )