SXXXXXXX_MarkdownConverter/markdownconverter/core/core.py
2025-06-17 15:18:44 +02:00

197 lines
7.7 KiB
Python

# markdownconverter/core/core.py
import html
import os
import re
import tempfile
from datetime import date

import docx
import markdown
import pdfkit
import pypandoc

from ..utils.logger import get_logger
log = get_logger(__name__)
# --- PDFKit Configuration ---
# Resolved once at import time. ``config`` ends up holding a pdfkit
# configuration object, or None when no wkhtmltopdf executable was located.
try:
    config = pdfkit.configuration()
    log.info("pdfkit configured using wkhtmltopdf from system PATH.")
except OSError:
    # The default lookup failed: try a hard-coded Windows install path.
    WKHTMLTOPDF_PATH = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"
    if not os.path.exists(WKHTMLTOPDF_PATH):
        config = None
        log.warning("wkhtmltopdf not found. PDF conversion may fail.")
    else:
        config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)
        log.info(f"pdfkit configured using fallback path: {WKHTMLTOPDF_PATH}")
# --- Helper Functions ---
def _get_document_title(markdown_text):
match = re.search(r"^\s*#\s+(.+)", markdown_text, re.MULTILINE)
return match.group(1).strip() if match else "Untitled Document"
def _split_markdown_by_revision_history(markdown_text, separator_heading="## Revision Record"):
pattern = re.compile(f"({re.escape(separator_heading)}.*?)(?=\n#+)", re.DOTALL | re.S)
match = pattern.search(markdown_text)
if not match:
log.warning(f"'{separator_heading}' section not found. No revision history will be added.")
return "", markdown_text
rev_history_md = match.group(0).strip()
main_content_md = markdown_text.replace(rev_history_md, "").strip()
return rev_history_md, main_content_md
def _replace_text_in_paragraph(paragraph, placeholders):
full_text = "".join(run.text for run in paragraph.runs)
if not any(key in full_text for key in placeholders):
return
for key, value in placeholders.items():
if key in full_text:
full_text = full_text.replace(key, str(value))
style = paragraph.runs[0].style if paragraph.runs else None
font = paragraph.runs[0].font if paragraph.runs else None
for run in reversed(paragraph.runs):
p = paragraph._p
p.remove(run._r)
new_run = paragraph.add_run(full_text)
if style:
new_run.style = style
if font:
new_run.font.name = font.name
new_run.font.size = font.size
new_run.font.bold = font.bold
new_run.font.italic = font.italic
new_run.font.underline = font.underline
if font.color and font.color.rgb:
new_run.font.color.rgb = font.color.rgb
def _replace_text_in_element(element, placeholders):
for p in element.paragraphs:
_replace_text_in_paragraph(p, placeholders)
for table in element.tables:
for row in table.rows:
for cell in row.cells:
_replace_text_in_element(cell, placeholders)
def _replace_text_placeholders(doc, placeholders):
    """Replace placeholders in the body, headers and footers of *doc*.

    The document body is processed first, followed by the ``header`` and then
    the ``footer`` of every section in ``doc.sections``.
    """
    log.info(f"Replacing text placeholders: {list(placeholders.keys())}")
    _replace_text_in_element(doc, placeholders)
    for section in doc.sections:
        for part_name in ("header", "footer"):
            _replace_text_in_element(getattr(section, part_name), placeholders)
def _find_placeholder_paragraph(doc, placeholder):
for p in doc.paragraphs:
if placeholder in "".join(run.text for run in p.runs):
return p
return None
def _insert_docx_at_paragraph(paragraph, source_docx_path):
    """Replace *paragraph* with the body content of another DOCX file.

    The block-level elements of the document at *source_docx_path* are moved,
    in order, into the parent of *paragraph* at the paragraph's position; the
    placeholder paragraph itself is then removed.

    Args:
        paragraph: Placeholder paragraph in the target document.
        source_docx_path: Path of the DOCX file whose body is inserted.
    """
    # Body-level section properties of the source document. They describe the
    # source's own page layout and are only valid as the last child of a
    # body, so they must not be copied into the middle of the target.
    sect_pr_tag = "{http://schemas.openxmlformats.org/wordprocessingml/2006/main}sectPr"

    parent = paragraph._p.getparent()
    index = parent.index(paragraph._p)
    source_doc = docx.Document(source_docx_path)

    # Snapshot the children: inserting an element into ``parent`` moves it out
    # of the source body, which would otherwise be mutated while iterated.
    # NOTE(review): only the XML elements are moved. Anything they reference
    # from the source package (images, hyperlinks, numbering, styles) does not
    # appear to be carried over - confirm against the templates in use.
    for element in list(source_doc.element.body):
        if element.tag == sect_pr_tag:
            continue
        parent.insert(index, element)
        index += 1
    parent.remove(paragraph._p)
# --- Main Conversion Function ---
def _pandoc_to_temp_docx(markdown_md, temp_files, extra_args=()):
    """Convert *markdown_md* to a temporary DOCX via pandoc and return its path."""
    handle = tempfile.NamedTemporaryFile(delete=False, suffix=".docx")
    # Close our handle before pandoc writes to the path, and register the file
    # for cleanup *before* converting so it is removed even if pandoc fails.
    handle.close()
    temp_files.append(handle.name)
    pypandoc.convert_text(
        markdown_md, 'docx', format='md', extra_args=list(extra_args), outputfile=handle.name
    )
    return handle.name


def _convert_to_pdf(markdown_text, output_file, add_toc):
    """Render *markdown_text* to HTML and write it as a PDF to *output_file*."""
    log.info("Starting PDF conversion using pdfkit.")
    if config is None:
        raise FileNotFoundError("wkhtmltopdf not found.")
    md_converter = markdown.Markdown(extensions=['toc', 'fenced_code', 'tables'])
    # The title is raw Markdown text; escape it before embedding it in HTML so
    # characters such as ``&`` or ``<`` cannot break the generated page.
    title = html.escape(_get_document_title(markdown_text))
    html_body = md_converter.convert(markdown_text)
    toc_html = ""
    if add_toc and getattr(md_converter, 'toc', None):
        log.info("Generating Table of Contents for PDF.")
        # The TOC goes right after the main title, followed by a page break.
        toc_html = f"<div style='page-break-after: always;'><h2>Table of Contents</h2>{md_converter.toc}</div>"
    # The extracted title is used both in <title> and as the leading <h1>.
    # NOTE(review): html_body also contains the rendered top-level heading when
    # the source has one, so the title may appear twice - confirm intended.
    full_html = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{title}</title>
</head>
<body>
<h1>{title}</h1>
{toc_html}
{html_body}
</body>
</html>
"""
    pdfkit.from_string(full_html, output_file, configuration=config, options={'encoding': "UTF-8"})
    log.info(f"PDF successfully generated: {output_file}")


def _convert_to_docx(markdown_text, output_file, add_toc, template_path, metadata):
    """Fill the DOCX template with metadata and converted Markdown content."""
    if not template_path:
        raise FileNotFoundError("A DOCX template file is required.")
    doc = docx.Document(template_path)
    if metadata:
        # Work on a copy so the caller's mapping is not modified.
        fields = dict(metadata)
        fields['DOC_PROJECT'] = fields.get('DOC_PROJECT') or _get_document_title(markdown_text)
        placeholders = {f"%%{key}%%": value for key, value in fields.items() if value}
        placeholders["%%DOC_DATE%%"] = date.today().strftime("%d/%m/%Y")
        _replace_text_placeholders(doc, placeholders)
    rev_history_md, main_content_md = _split_markdown_by_revision_history(markdown_text)
    temp_files = []
    try:
        rev_placeholder_p = _find_placeholder_paragraph(doc, "%%REVISION_RECORD%%")
        if rev_history_md and rev_placeholder_p is not None:
            rev_docx = _pandoc_to_temp_docx(rev_history_md, temp_files)
            _insert_docx_at_paragraph(rev_placeholder_p, rev_docx)
        elif rev_history_md:
            # Previously this content was dropped without any trace in the log.
            log.warning("Placeholder '%%REVISION_RECORD%%' not found in template; revision history was not inserted.")

        content_placeholder_p = _find_placeholder_paragraph(doc, "%%DOC_CONTENT%%")
        if main_content_md and content_placeholder_p is not None:
            pandoc_args = ["--toc"] if add_toc else []
            content_docx = _pandoc_to_temp_docx(main_content_md, temp_files, pandoc_args)
            # Start the main content on a fresh page.
            content_placeholder_p.insert_paragraph_before().add_run().add_break(docx.enum.text.WD_BREAK.PAGE)
            _insert_docx_at_paragraph(content_placeholder_p, content_docx)
        elif main_content_md:
            log.warning("Placeholder '%%DOC_CONTENT%%' not found in template; document content was not inserted.")

        doc.save(output_file)
        log.info(f"Document successfully created at {output_file}")
    finally:
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)


def convert_markdown(input_file, output_format, add_toc=False, template_path=None, metadata=None):
    """Convert a Markdown file to PDF or DOCX and return the output path.

    The output is written next to *input_file*, with the extension replaced by
    ``.pdf`` or ``.docx``.

    Args:
        input_file: Path of the UTF-8 encoded Markdown source.
        output_format: ``"PDF"`` or ``"DOCX"``.
        add_toc: When true, a table of contents is generated.
        template_path: DOCX template holding the ``%%...%%`` placeholders;
            required for DOCX output.
        metadata: Optional mapping of placeholder names (without the ``%%``
            delimiters) to values, used for DOCX output only. It is not
            modified.

    Returns:
        The path of the generated file.

    Raises:
        FileNotFoundError: If *input_file* does not exist, if wkhtmltopdf is
            not configured (PDF), or if no template was given (DOCX).
        ValueError: If *output_format* is not supported.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")
    # Reject unknown formats before doing any further work.
    if output_format not in ("PDF", "DOCX"):
        raise ValueError(f"Unsupported output format: {output_format}")
    log.info(f"Starting conversion of '{os.path.basename(input_file)}' to {output_format}.")
    with open(input_file, 'r', encoding='utf-8') as f:
        markdown_text = f.read()

    base_path = os.path.splitext(input_file)[0]
    if output_format == "PDF":
        output_file = base_path + ".pdf"
        _convert_to_pdf(markdown_text, output_file, add_toc)
    else:
        output_file = base_path + ".docx"
        _convert_to_docx(markdown_text, output_file, add_toc, template_path, metadata)
    return output_file