import html
import os
import re

import markdown
import pdfkit
import pypandoc

# Path to the wkhtmltopdf executable
WKHTMLTOPDF_PATH = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"
config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)

# Default path for the DOCX template
TEMPLATE_DOCX_PATH = os.path.join(
    os.path.dirname(__file__), "..", "templates", "default_template.docx"
)

# Title used when the markdown text contains no header at all.
_DEFAULT_TITLE = "Document"

# A header line (after stripping): one or more '#', whitespace, then the text.
_HEADING_RE = re.compile(r'^#+\s+(.*)$')

# Markers that open/close a fenced code block.
_FENCE_MARKERS = ("```", "~~~")


def _find_first_heading(lines):
    """Locate the first markdown header that is not inside a fenced code block.

    Args:
        lines: The markdown document, already split into lines.

    Returns:
        A ``(index, title)`` tuple where ``index`` is the position of the
        header line in ``lines`` and ``title`` is its text without the leading
        ``#`` characters. When no header exists, ``(None, "Document")``.
    """
    open_fence = None
    for index, raw_line in enumerate(lines):
        stripped = raw_line.strip()
        if open_fence is not None:
            # Inside a code fence: '# ...' here is code (e.g. a shell comment),
            # not a header. Only look for the matching closing marker.
            if stripped.startswith(open_fence):
                open_fence = None
            continue
        if stripped.startswith(_FENCE_MARKERS):
            open_fence = stripped[:3]
            continue
        heading = _HEADING_RE.match(stripped)
        if heading:
            return index, heading.group(1)
    return None, _DEFAULT_TITLE


def _get_document_title(markdown_text):
    """Extracts the first header (any level) from markdown text to use as a title.

    Lines inside fenced code blocks are ignored. Returns ``"Document"`` when
    the text has no header.
    """
    _, title = _find_first_heading(markdown_text.splitlines())
    return title


def _extract_title_and_separate_content(markdown_text):
    """Extracts the first header (any level) and returns it with the rest of the content.

    Args:
        markdown_text: The full markdown document as a string.

    Returns:
        A ``(title, content_without_title)`` tuple. The header line is removed
        from the content; all other lines are kept in order and re-joined with
        ``"\\n"``. When there is no header the title is ``"Document"`` and the
        content keeps every line.
    """
    lines = markdown_text.splitlines()
    title_index, title = _find_first_heading(lines)
    if title_index is not None:
        del lines[title_index]
    return title, "\n".join(lines)


def _build_font_style(font):
    """Return a ``<style>`` element selecting ``font`` for the body, or ``""``.

    Characters other than word characters, spaces and hyphens are dropped so
    the value cannot terminate the quoted CSS string it is placed in.
    """
    if not font:
        return ""
    safe_font = re.sub(r"[^\w \-]", "", font).strip()
    if not safe_font:
        return ""
    return f"<style>body {{ font-family: '{safe_font}'; }}</style>"


def _convert_to_pdf(input_file, output_file, add_toc, font):
    """Render ``input_file`` to HTML with python-markdown and write a PDF via pdfkit."""
    with open(input_file, 'r', encoding='utf-8') as f:
        markdown_text = f.read()

    extensions = ['toc'] if add_toc else []
    md = markdown.Markdown(extensions=extensions)

    if add_toc:
        # The title is shown above the table of contents, so take it out of
        # the body to avoid rendering it twice.
        title, body_markdown = _extract_title_and_separate_content(markdown_text)
    else:
        title = _get_document_title(markdown_text)
        body_markdown = markdown_text

    html_body = md.convert(body_markdown)
    # The title comes from the document text; escape it before embedding.
    safe_title = html.escape(title)
    style = _build_font_style(font)

    toc_html = ""
    # md.toc is only consulted after convert(), and only if the attribute exists.
    if add_toc and hasattr(md, 'toc'):
        toc_html = f"<h1>{safe_title}</h1>\n{md.toc}\n"

    full_html = (
        "<!DOCTYPE html>\n"
        "<html>\n"
        "<head>\n"
        '<meta charset="utf-8">\n'
        f"{style}\n"
        f"<title>{safe_title}</title>\n"
        "</head>\n"
        "<body>\n"
        f"{toc_html}{html_body}\n"
        "</body>\n"
        "</html>\n"
    )

    options = {'encoding': "UTF-8"}
    pdfkit.from_string(full_html, output_file, configuration=config, options=options)


def _convert_to_docx(input_file, output_file, add_toc, template_path):
    """Convert ``input_file`` to DOCX by handing the unmodified file to pandoc."""
    args = ["--standalone"]
    if add_toc:
        # pandoc generates the table of contents itself.
        args.append("--toc")

    # Prefer the caller's template; fall back to the bundled default if present.
    if template_path and os.path.exists(template_path):
        args.extend(["--reference-doc", template_path])
    elif os.path.exists(TEMPLATE_DOCX_PATH):
        args.extend(["--reference-doc", TEMPLATE_DOCX_PATH])

    pypandoc.convert_file(
        input_file,
        'docx',
        outputfile=output_file,
        extra_args=args,
        encoding='utf-8'
    )


def convert_markdown(input_file, output_format, add_toc=False, font=None, template_path=None):
    """
    Converts a Markdown file to the specified output format (PDF or DOCX).

    Args:
        input_file: Path of the markdown file to convert.
        output_format: ``"PDF"`` or ``"DOCX"`` (case-sensitive).
        add_toc: When true, include a table of contents in the output.
        font: Optional font family applied to the PDF body. Not used for DOCX.
        template_path: Optional DOCX reference document. Used only for DOCX
            and only if the path exists; otherwise the default template is
            used when it exists.

    Returns:
        The path of the written file: ``input_file`` with its extension
        replaced by ``.pdf`` or ``.docx``.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        ValueError: If ``output_format`` is neither ``"PDF"`` nor ``"DOCX"``.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    output_file = os.path.splitext(input_file)[0] + (
        ".pdf" if output_format == "PDF" else ".docx"
    )

    if output_format == "PDF":
        _convert_to_pdf(input_file, output_file, add_toc, font)
    elif output_format == "DOCX":
        _convert_to_docx(input_file, output_file, add_toc, template_path)
    else:
        raise ValueError("Unsupported format")

    return output_file

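# NOTE(review): stray template text found after the end of the module; kept
# here as a comment so it is not parsed as Python. Confirm whether it belongs
# in the PDF table-of-contents template.
# {title}
#
# Table of Contents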