# SXXXXXXX_MarkdownConverter/markdownconverter/core/core.py

# markdownconverter/core/core.py

import os
import re
import tempfile
from datetime import date
import docx
import pypandoc
import pdfkit
import markdown
from ..utils.logger import get_logger

log = get_logger(__name__)

# --- PDFKit Configuration ---
# The wkhtmltopdf binary is looked up once, at import time. `config` is left
# as None when nothing usable is found, so the rest of the module can still
# be imported; the PDF branch of convert_markdown() checks for that value.
try:
    config = pdfkit.configuration()
    log.info("pdfkit configured using wkhtmltopdf from system PATH.")
except OSError:
    # Default lookup failed: fall back to a hard-coded Windows install path.
    WKHTMLTOPDF_PATH = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"
    if not os.path.exists(WKHTMLTOPDF_PATH):
        config = None
        log.warning("wkhtmltopdf not found. PDF conversion may fail.")
    else:
        config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)
        log.info(f"pdfkit configured using fallback path: {WKHTMLTOPDF_PATH}")

# --- Helper Functions ---

def _get_document_title(markdown_text):
    match = re.search(r"^\s*#\s+(.+)", markdown_text, re.MULTILINE)
    return match.group(1).strip() if match else "Untitled Document"

def _split_markdown_by_revision_history(markdown_text, separator_heading="## Revision Record"):
    pattern = re.compile(f"({re.escape(separator_heading)}.*?)(?=\n#+)", re.DOTALL | re.S)
    match = pattern.search(markdown_text)
    if not match:
        log.warning(f"'{separator_heading}' section not found. No revision history will be added.")
        return "", markdown_text
    rev_history_md = match.group(0).strip()
    main_content_md = markdown_text.replace(rev_history_md, "").strip()
    return rev_history_md, main_content_md

def _replace_text_in_paragraph(paragraph, placeholders):
    """Substitute placeholder tokens inside a single paragraph.

    Matching is done on the concatenation of all run texts, so a token that
    is split across several runs is still found. When at least one token is
    present, every run of the paragraph is removed and replaced by one new
    run holding the substituted text; that run receives the style and a
    subset of the font settings (name, size, bold, italic, underline, RGB
    colour) of the former first run.

    Args:
        paragraph: Paragraph object exposing ``runs``, ``_p`` and ``add_run``.
        placeholders: Mapping of token -> replacement (converted with ``str``).
    """
    existing_runs = paragraph.runs
    merged_text = "".join(piece.text for piece in existing_runs)
    if all(token not in merged_text for token in placeholders):
        return

    for token, replacement in placeholders.items():
        if token in merged_text:
            merged_text = merged_text.replace(token, str(replacement))

    # Remember the look of the first run before the runs are detached.
    first_run = existing_runs[0] if existing_runs else None
    style = first_run.style if first_run else None
    font = first_run.font if first_run else None

    xml_paragraph = paragraph._p
    for stale_run in reversed(existing_runs):
        xml_paragraph.remove(stale_run._r)

    replacement_run = paragraph.add_run(merged_text)
    if style:
        replacement_run.style = style
    if font:
        for attribute in ("name", "size", "bold", "italic", "underline"):
            setattr(replacement_run.font, attribute, getattr(font, attribute))
        source_color = font.color
        if source_color and source_color.rgb:
            replacement_run.font.color.rgb = font.color.rgb

def _replace_text_in_element(element, placeholders):
    """Apply placeholder substitution to an element and its nested tables.

    The element's own paragraphs are processed first; then every cell of
    every table is handled recursively, so tables nested inside cells are
    covered as well.

    Args:
        element: Object exposing ``paragraphs`` and ``tables``.
        placeholders: Mapping of token -> replacement value.
    """
    for paragraph in element.paragraphs:
        _replace_text_in_paragraph(paragraph, placeholders)

    table_cells = (
        cell
        for table in element.tables
        for row in table.rows
        for cell in row.cells
    )
    for cell in table_cells:
        _replace_text_in_element(cell, placeholders)

def _replace_text_placeholders(doc, placeholders):
    """Replace placeholder tokens throughout a document.

    Covers the document body plus the ``header`` and ``footer`` of each
    section in ``doc.sections``.

    Args:
        doc: Document object exposing ``paragraphs``, ``tables`` and ``sections``.
        placeholders: Mapping of token -> replacement value.
    """
    log.info(f"Replacing text placeholders: {list(placeholders.keys())}")
    _replace_text_in_element(doc, placeholders)
    for section in doc.sections:
        for region_name in ("header", "footer"):
            _replace_text_in_element(getattr(section, region_name), placeholders)

def _find_placeholder_paragraph(doc, placeholder):
    for p in doc.paragraphs:
        if placeholder in "".join(run.text for run in p.runs):
            return p
    return None

def _insert_docx_at_paragraph(paragraph, source_docx_path):
    """Replace *paragraph* with the body content of another DOCX file.

    The block-level elements of the source document are moved, in order, to
    the position of *paragraph*, which is then removed from its parent.

    The source body's section properties (``w:sectPr``) are not copied: that
    element is only valid as the last child of a body, so inserting it in
    the middle of the target would produce malformed document XML.

    Args:
        paragraph: Placeholder paragraph in the target document.
        source_docx_path: Path of the DOCX file whose content is inserted.
    """
    anchor = paragraph._p
    parent = anchor.getparent()
    index = parent.index(anchor)
    source_doc = docx.Document(source_docx_path)

    # Iterate over a snapshot: inserting an element into `parent` moves it
    # out of the source body, which would otherwise be mutated mid-iteration.
    for element in list(source_doc.element.body):
        tag = element.tag
        if isinstance(tag, str) and tag.endswith("}sectPr"):
            continue
        parent.insert(index, element)
        index += 1

    parent.remove(anchor)

# --- Main Conversion Function ---
def _convert_to_pdf(markdown_text, output_file, add_toc):
    """Render *markdown_text* to a PDF at *output_file* via HTML and pdfkit."""
    from html import escape  # local import: only needed for the PDF branch

    log.info("Starting PDF conversion using pdfkit.")
    if config is None:
        raise FileNotFoundError("wkhtmltopdf not found.")

    md_converter = markdown.Markdown(extensions=['toc', 'fenced_code', 'tables'])

    # The title is raw Markdown text, so it must be escaped before being
    # placed in the HTML skeleton ("R&D <draft>" would otherwise break it).
    title = escape(_get_document_title(markdown_text))

    # Convert the document body.
    html_body = md_converter.convert(markdown_text)

    toc_html = ""
    # Build the TOC only when requested and when the converter produced one.
    if add_toc and hasattr(md_converter, 'toc') and md_converter.toc:
        log.info("Generating Table of Contents for PDF.")
        # The TOC goes right after the main title, followed by a page break.
        toc_html = f"<div style='page-break-after: always;'><h2>Table of Contents</h2>{md_converter.toc}</div>"

    # Assemble the final HTML; the extracted title is used both in <title>
    # and as the leading <h1>.
    full_html = f"""
        <!DOCTYPE html>
        <html lang="en">
        <head>
            <meta charset="UTF-8">
            <title>{title}</title>
        </head>
        <body>
            <h1>{title}</h1>
            {toc_html}
            {html_body}
        </body>
        </html>
        """

    pdfkit.from_string(full_html, output_file, configuration=config, options={'encoding': "UTF-8"})
    log.info(f"PDF successfully generated: {output_file}")


def _pandoc_to_temp_docx(markdown_text, temp_files, extra_args=()):
    """Convert *markdown_text* to a temporary DOCX file and return its path.

    The path is appended to *temp_files* before pandoc runs, so the caller's
    cleanup removes the file even when the conversion raises.
    """
    handle = tempfile.NamedTemporaryFile(delete=False, suffix=".docx")
    temp_files.append(handle.name)
    # Close our handle first so pandoc writes to a file nobody else holds open.
    handle.close()
    pypandoc.convert_text(markdown_text, 'docx', format='md', extra_args=extra_args, outputfile=handle.name)
    return handle.name


def _convert_to_docx(markdown_text, output_file, add_toc, template_path, metadata):
    """Fill the DOCX template with metadata and converted Markdown content."""
    if not template_path:
        raise FileNotFoundError("A DOCX template file is required.")

    doc = docx.Document(template_path)

    if metadata:
        # Work on a copy: writing DOC_PROJECT into the caller's dict would
        # make the first document's title stick on later calls that reuse it.
        fields = dict(metadata)
        fields['DOC_PROJECT'] = fields.get('DOC_PROJECT') or _get_document_title(markdown_text)
        placeholders = {f"%%{key}%%": value for key, value in fields.items() if value}
        placeholders["%%DOC_DATE%%"] = date.today().strftime("%d/%m/%Y")
        _replace_text_placeholders(doc, placeholders)

    rev_history_md, main_content_md = _split_markdown_by_revision_history(markdown_text)

    temp_files = []
    try:
        rev_placeholder_p = _find_placeholder_paragraph(doc, "%%REVISION_RECORD%%")
        if rev_history_md and rev_placeholder_p:
            rev_docx = _pandoc_to_temp_docx(rev_history_md, temp_files)
            _insert_docx_at_paragraph(rev_placeholder_p, rev_docx)

        content_placeholder_p = _find_placeholder_paragraph(doc, "%%DOC_CONTENT%%")
        if main_content_md and content_placeholder_p:
            pandoc_args = ["--toc"] if add_toc else []
            content_docx = _pandoc_to_temp_docx(main_content_md, temp_files, pandoc_args)

            # Start the main content on a fresh page.
            content_placeholder_p.insert_paragraph_before().add_run().add_break(docx.enum.text.WD_BREAK.PAGE)
            _insert_docx_at_paragraph(content_placeholder_p, content_docx)

        doc.save(output_file)
        log.info(f"Document successfully created at {output_file}")

    finally:
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)


def convert_markdown(input_file, output_format, add_toc=False, template_path=None, metadata=None):
    """Convert a Markdown file to PDF or DOCX next to the input file.

    Args:
        input_file: Path of the Markdown file (read as UTF-8).
        output_format: ``"PDF"`` or ``"DOCX"``.
        add_toc: When true, include a table of contents in the output.
        template_path: DOCX template containing the ``%%...%%`` placeholders;
            required for DOCX output, unused for PDF.
        metadata: Optional mapping of placeholder names (without the ``%%``
            delimiters) to values for DOCX output. It is not modified.

    Returns:
        Path of the generated file: the input path with its extension
        replaced by ``.pdf`` or ``.docx``.

    Raises:
        FileNotFoundError: If *input_file* does not exist, if wkhtmltopdf is
            unavailable for PDF output, or if no template is given for DOCX.
        ValueError: If *output_format* is neither ``"PDF"`` nor ``"DOCX"``.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    log.info(f"Starting conversion of '{os.path.basename(input_file)}' to {output_format}.")

    with open(input_file, 'r', encoding='utf-8') as f:
        markdown_text = f.read()

    base_path = os.path.splitext(input_file)[0]

    if output_format == "PDF":
        output_file = base_path + ".pdf"
        _convert_to_pdf(markdown_text, output_file, add_toc)
    elif output_format == "DOCX":
        output_file = base_path + ".docx"
        _convert_to_docx(markdown_text, output_file, add_toc, template_path, metadata)
    else:
        raise ValueError(f"Unsupported output format: {output_format}")

    return output_file