# SXXXXXXX_MarkdownConverter/markdownconverter/core/core.py

import html
import os
import re

import markdown
import pdfkit
import pypandoc

# Path to the wkhtmltopdf executable (default Windows install location).
WKHTMLTOPDF_PATH = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"

# pdfkit.configuration() verifies that the executable exists and raises
# OSError when it does not. Letting that propagate at import time would make
# the whole module unusable (including DOCX conversion, which never needs
# wkhtmltopdf) on any machine without this exact install path. Falling back to
# None defers the lookup to conversion time, where pdfkit builds its default
# configuration and searches for wkhtmltopdf itself.
try:
    config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)
except OSError:
    config = None

# Default path for the DOCX template, resolved relative to this module:
# <package>/templates/default_template.docx
TEMPLATE_DOCX_PATH = os.path.join(
    os.path.dirname(__file__), "..", "templates", "default_template.docx"
)


def _get_document_title(markdown_text):
    """Extracts the first header (any level) from markdown text to use as a title."""
    for line in markdown_text.splitlines():
        if re.match(r'^#+\s', line.strip()):
            return re.sub(r'^#+\s*', '', line.strip())
    return "Document"


def _extract_title_and_separate_content(markdown_text):
    """
    Extracts the first header (any level) and returns it with the rest of the content.
    """
    lines = markdown_text.splitlines()
    title = "Document"
    content_lines = []
    title_found = False

    for line in lines:
        if not title_found and re.match(r'^#+\s', line.strip()):
            title = re.sub(r'^#+\s*', '', line.strip())
            title_found = True
        else:
            content_lines.append(line)

    content_without_title = "\n".join(content_lines)
    return title, content_without_title


def _build_pdf_html(markdown_text, add_toc=False, font=None):
    """
    Renders Markdown text into the complete HTML document passed to pdfkit.

    When add_toc is true, the first header is pulled out of the body and shown
    on a leading page together with the generated table of contents, followed
    by a page break. Otherwise the body is rendered as-is and the first header
    is only used for the <title> element.
    """
    extensions = ['toc'] if add_toc else []
    md = markdown.Markdown(extensions=extensions)

    if add_toc:
        # The title is printed on the TOC page, so remove it from the body to
        # avoid rendering it twice (and to keep it out of the TOC itself).
        title, body_markdown = _extract_title_and_separate_content(markdown_text)
    else:
        title = _get_document_title(markdown_text)
        body_markdown = markdown_text

    html_body = md.convert(body_markdown)

    # The title is raw header text, not HTML. Escape it so characters such as
    # '&', '<' and '>' cannot produce malformed markup in <title> or <h1>.
    safe_title = html.escape(title, quote=False)

    page_break_rule = ".page-break { page-break-after: always; }"
    if font:
        style = f"<style>body {{ font-family: '{font}'; }} {page_break_rule}</style>"
    else:
        style = f"<style>{page_break_rule}</style>"

    toc_html = ""
    if add_toc and hasattr(md, 'toc'):
        toc_html = f"""
            <h1>{safe_title}</h1>
            <h2>Table of Contents</h2>
            {md.toc}
            <div class="page-break"></div>
            """

    return f"""
        <!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">{style}<title>{safe_title}</title></head><body>{toc_html}{html_body}</body></html>
        """


def _build_docx_args(add_toc=False, template_path=None):
    """Assembles the extra command-line arguments passed to pandoc for DOCX output."""
    args = ["--standalone"]

    if add_toc:
        # Let pandoc handle title detection and TOC generation automatically.
        args.append("--toc")

    # Prefer the caller's template; fall back to the bundled default. If
    # neither file exists, no --reference-doc is passed at all.
    if template_path and os.path.exists(template_path):
        args.extend(["--reference-doc", template_path])
    elif os.path.exists(TEMPLATE_DOCX_PATH):
        args.extend(["--reference-doc", TEMPLATE_DOCX_PATH])

    return args


def convert_markdown(input_file, output_format, add_toc=False, font=None, template_path=None):
    """
    Converts a Markdown file to the specified output format (PDF or DOCX).

    The output is written next to the input file, using the same base name
    with a ".pdf" or ".docx" extension.

    Args:
        input_file: Path to the Markdown file to convert (read as UTF-8).
        output_format: Either "PDF" or "DOCX" (case-sensitive).
        add_toc: When true, a table of contents is added to the output.
        font: Optional font family applied to the document body. Only used
            for PDF output.
        template_path: Optional DOCX reference document. Only used for DOCX
            output; if it is missing or does not exist, the default template
            is used when present.

    Returns:
        The path of the generated output file.

    Raises:
        FileNotFoundError: If input_file does not exist.
        ValueError: If output_format is neither "PDF" nor "DOCX".
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    output_file = os.path.splitext(input_file)[0] + (
        ".pdf" if output_format == "PDF" else ".docx"
    )

    if output_format == "PDF":
        with open(input_file, 'r', encoding='utf-8') as f:
            markdown_text = f.read()

        full_html = _build_pdf_html(markdown_text, add_toc=add_toc, font=font)
        options = {'encoding': "UTF-8"}
        pdfkit.from_string(full_html, output_file, configuration=config, options=options)

    elif output_format == "DOCX":
        # pandoc reads the original, unmodified input file directly.
        pypandoc.convert_file(
            input_file,
            'docx',
            outputfile=output_file,
            extra_args=_build_docx_args(add_toc=add_toc, template_path=template_path),
            encoding='utf-8'
        )

    else:
        raise ValueError("Unsupported format")

    return output_file