SXXXXXXX_MarkdownConverter/markdownconverter/core/core.py
2025-06-11 09:12:34 +02:00

113 lines
3.7 KiB
Python

import html
import os
import re

import markdown
import pdfkit
import pypandoc
# Path to the wkhtmltopdf executable (Windows install location).
WKHTMLTOPDF_PATH = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"

# pdfkit.configuration() raises OSError when the executable cannot be opened.
# Left unguarded at import time, that would make the whole module unusable
# (DOCX conversion included) on any machine without wkhtmltopdf at the path
# above. Falling back to None lets pdfkit search for the executable itself
# at the moment a PDF is actually requested.
try:
    config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)
except OSError:
    config = None

# Default path for the DOCX template, resolved relative to this module.
TEMPLATE_DOCX_PATH = os.path.join(
    os.path.dirname(__file__), "..", "templates", "default_template.docx"
)
def _get_document_title(markdown_text):
    """Return the text of the first header (any level) to use as a title.

    Lines inside fenced code blocks (``` or ~~~) are skipped, so a ``#``
    comment in a code sample is never mistaken for a header.

    Args:
        markdown_text: The Markdown document as a single string.

    Returns:
        The header text without its leading ``#`` markers, or ``"Document"``
        when the text contains no header.
    """
    fence = None  # marker run that opened the code fence we are inside, if any
    for raw_line in markdown_text.splitlines():
        line = raw_line.strip()
        marker = re.match(r'(`{3,}|~{3,})(.*)$', line)
        if fence:
            # A closing fence uses the same character, is at least as long as
            # the opening one, and has nothing after it.
            if marker and marker.group(1).startswith(fence) and not marker.group(2):
                fence = None
            continue
        # A backtick fence may not have backticks in its info string; such a
        # line is an inline code span, not the start of a block.
        if marker and not (line[0] == '`' and '`' in marker.group(2)):
            fence = marker.group(1)
            continue
        if re.match(r'^#+\s', line):
            return re.sub(r'^#+\s*', '', line)
    return "Document"
def _extract_title_and_separate_content(markdown_text):
    """Split the first header (any level) off from the rest of the document.

    Lines inside fenced code blocks (``` or ~~~) are never treated as
    headers, so a ``#`` comment in a code sample is neither taken as the
    title nor removed from the sample.

    Args:
        markdown_text: The Markdown document as a single string.

    Returns:
        A ``(title, content_without_title)`` tuple. ``title`` is the header
        text without its leading ``#`` markers, or ``"Document"`` when no
        header exists. ``content_without_title`` is every other line joined
        with ``"\\n"``.
    """
    title = "Document"
    title_found = False
    body_lines = []
    fence = None  # marker run that opened the code fence we are inside, if any
    for raw_line in markdown_text.splitlines():
        line = raw_line.strip()
        marker = re.match(r'(`{3,}|~{3,})(.*)$', line)
        if fence:
            # A closing fence uses the same character, is at least as long as
            # the opening one, and has nothing after it.
            if marker and marker.group(1).startswith(fence) and not marker.group(2):
                fence = None
        elif marker and not (line[0] == '`' and '`' in marker.group(2)):
            # Backticks after a backtick run mean an inline code span, not a
            # fence, hence the extra check above.
            fence = marker.group(1)
        elif not title_found and re.match(r'^#+\s', line):
            title = re.sub(r'^#+\s*', '', line)
            title_found = True
            continue  # the title line itself is dropped from the content
        body_lines.append(raw_line)
    return title, "\n".join(body_lines)
def _render_pdf(markdown_text, output_file, add_toc, font):
    """Render Markdown text to a PDF file via an HTML page and wkhtmltopdf."""
    md = markdown.Markdown(extensions=['toc'] if add_toc else [])
    if add_toc:
        # The title is printed on its own above the table of contents, so it
        # is taken out of the body to avoid rendering it twice.
        title, body_markdown = _extract_title_and_separate_content(markdown_text)
    else:
        title = _get_document_title(markdown_text)
        body_markdown = markdown_text
    html_body = md.convert(body_markdown)

    # The title is raw Markdown text; escape it so characters such as "<" or
    # "&" cannot break the generated HTML.
    safe_title = html.escape(title)

    page_break_css = ".page-break { page-break-after: always; }"
    if font:
        style = f"<style>body {{ font-family: '{font}'; }} {page_break_css}</style>"
    else:
        style = f"<style>{page_break_css}</style>"

    toc_html = ""
    # md.toc is only populated by the 'toc' extension after convert() has run.
    if add_toc and hasattr(md, 'toc'):
        toc_html = (
            f"\n<h1>{safe_title}</h1>\n"
            "<h2>Table of Contents</h2>\n"
            f"{md.toc}\n"
            '<div class="page-break"></div>\n'
        )

    full_html = (
        '\n<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">'
        f"{style}<title>{safe_title}</title></head>"
        f"<body>{toc_html}{html_body}</body></html>\n"
    )
    options = {'encoding': "UTF-8"}
    pdfkit.from_string(full_html, output_file, configuration=config, options=options)


def _render_docx(input_file, output_file, add_toc, template_path):
    """Convert a Markdown file to DOCX by handing the original file to pandoc."""
    args = ["--standalone"]
    if add_toc:
        # Let pandoc handle title detection and TOC generation automatically.
        args.append("--toc")

    # Prefer the caller's template; otherwise use the bundled default if present.
    if template_path and os.path.exists(template_path):
        reference_doc = template_path
    elif os.path.exists(TEMPLATE_DOCX_PATH):
        reference_doc = TEMPLATE_DOCX_PATH
    else:
        reference_doc = None
    if reference_doc:
        args.extend(["--reference-doc", reference_doc])

    pypandoc.convert_file(
        input_file,
        'docx',
        outputfile=output_file,
        extra_args=args,
        encoding='utf-8',
    )


def convert_markdown(input_file, output_format, add_toc=False, font=None, template_path=None):
    """
    Converts a Markdown file to the specified output format (PDF or DOCX).

    Args:
        input_file: Path of the Markdown file to convert.
        output_format: Either "PDF" or "DOCX" (compared case-sensitively).
        add_toc: When true, a table of contents is added to the output.
        font: Optional font family for the PDF body; unused for DOCX.
        template_path: Optional DOCX reference document; unused for PDF. If
            it is missing or does not exist, the bundled default template is
            used when available.

    Returns:
        The path of the generated file: the input path with its extension
        replaced by ".pdf" or ".docx".

    Raises:
        FileNotFoundError: If input_file does not exist.
        ValueError: If output_format is neither "PDF" nor "DOCX".
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    base_path = os.path.splitext(input_file)[0]
    if output_format == "PDF":
        output_file = base_path + ".pdf"
        with open(input_file, 'r', encoding='utf-8') as f:
            markdown_text = f.read()
        _render_pdf(markdown_text, output_file, add_toc, font)
    elif output_format == "DOCX":
        output_file = base_path + ".docx"
        _render_docx(input_file, output_file, add_toc, template_path)
    else:
        raise ValueError("Unsupported format")
    return output_file