113 lines
3.7 KiB
Python
113 lines
3.7 KiB
Python
import os
|
|
import re
|
|
import markdown
|
|
import pdfkit
|
|
import pypandoc
|
|
|
|
# Path to the wkhtmltopdf executable
WKHTMLTOPDF_PATH = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"

# pdfkit.configuration() checks that the executable exists and raises
# IOError/OSError when it does not. Failing here would make the whole module
# un-importable (including the DOCX path, which never uses wkhtmltopdf), so
# fall back to None: pdfkit then performs its own executable lookup when a
# PDF is actually requested.
try:
    config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)
except OSError:
    config = None

# Default path for the DOCX template
TEMPLATE_DOCX_PATH = os.path.join(
    os.path.dirname(__file__), "..", "templates", "default_template.docx"
)
|
|
|
|
|
|
def _get_document_title(markdown_text):
    """Return the text of the first ATX header (any level) in the Markdown.

    A line counts as a header when, after stripping surrounding whitespace,
    it starts with one or more ``#`` characters followed by whitespace.
    An optional closing sequence (``# Title ##``) is not part of the title
    and is removed.

    Args:
        markdown_text: The Markdown source as a single string.

    Returns:
        The header text, or ``"Document"`` when the text has no header.
    """
    # group(1) is the header text; the trailing optional group swallows a
    # closing "##" run, which must be separated from the text by whitespace
    # (so a title such as "C#" is kept intact).
    header_pattern = re.compile(r'^#+\s+(.*?)(?:\s+#+)?$')

    for line in markdown_text.splitlines():
        header = header_pattern.match(line.strip())
        if header is not None:
            return header.group(1)
    return "Document"
|
|
|
|
|
|
def _extract_title_and_separate_content(markdown_text):
    """Split Markdown text into its title and the remaining content.

    The first ATX header (any level) becomes the title and its line is
    dropped from the content; every other line, including later headers,
    is kept in order. An optional closing sequence (``# Title ##``) is
    removed from the title.

    Args:
        markdown_text: The Markdown source as a single string.

    Returns:
        A ``(title, content_without_title)`` tuple. ``title`` is
        ``"Document"`` when no header exists, in which case the content is
        all input lines re-joined with ``"\\n"``.
    """
    # group(1) is the header text; the trailing optional group swallows a
    # closing "##" run that is separated from the text by whitespace.
    header_pattern = re.compile(r'^#+\s+(.*?)(?:\s+#+)?$')

    title = "Document"
    content_lines = []
    title_found = False

    for line in markdown_text.splitlines():
        # Only the first header is consumed; later ones stay in the content.
        header = None if title_found else header_pattern.match(line.strip())
        if header is not None:
            title = header.group(1)
            title_found = True
        else:
            content_lines.append(line)

    return title, "\n".join(content_lines)
|
|
|
|
|
|
def _build_pdf_html(markdown_text, add_toc, font):
    """Render Markdown text into the complete HTML document used for the PDF."""
    import html  # local import: only needed to escape the title below

    md = markdown.Markdown(extensions=['toc'] if add_toc else [])

    if add_toc:
        # The title is printed on the contents page, so it is removed from
        # the body to avoid rendering it twice.
        title, body_markdown = _extract_title_and_separate_content(markdown_text)
    else:
        title = _get_document_title(markdown_text)
        body_markdown = markdown_text

    html_body = md.convert(body_markdown)

    # The title is plain text taken from the Markdown source; escape it so
    # characters such as "&" or "<" cannot break the generated HTML.
    safe_title = html.escape(title)

    page_break_rule = ".page-break { page-break-after: always; }"
    if font:
        style = f"<style>body {{ font-family: '{font}'; }} {page_break_rule}</style>"
    else:
        style = f"<style>{page_break_rule}</style>"

    toc_html = ""
    if add_toc and hasattr(md, 'toc'):
        # md.toc is populated by the 'toc' extension during md.convert().
        toc_html = (
            f"\n<h1>{safe_title}</h1>\n"
            "<h2>Table of Contents</h2>\n"
            f"{md.toc}\n"
            '<div class="page-break"></div>\n'
        )

    return (
        '\n<!DOCTYPE html><html lang="en"><head><meta charset="UTF-8">'
        f"{style}<title>{safe_title}</title></head>"
        f"<body>{toc_html}{html_body}</body></html>\n"
    )


def _build_pandoc_docx_args(add_toc, template_path):
    """Assemble the extra pandoc command-line arguments for DOCX output."""
    args = ["--standalone"]

    if add_toc:
        # Let pandoc handle title detection and TOC generation itself.
        args.append("--toc")

    # Prefer the caller's template; fall back to the bundled default when
    # the caller gave none or the given path does not exist.
    if template_path and os.path.exists(template_path):
        args.extend(["--reference-doc", template_path])
    elif os.path.exists(TEMPLATE_DOCX_PATH):
        args.extend(["--reference-doc", TEMPLATE_DOCX_PATH])

    return args


def convert_markdown(input_file, output_format, add_toc=False, font=None, template_path=None):
    """Convert a Markdown file to PDF or DOCX and return the output path.

    The output is written next to ``input_file``, with the extension
    replaced by ``.pdf`` or ``.docx``.

    Args:
        input_file: Path to the Markdown source file (read as UTF-8 for PDF).
        output_format: ``"PDF"`` or ``"DOCX"`` (compared case-sensitively).
        add_toc: When true, add a table of contents to the output.
        font: Optional CSS font family for the PDF body; unused for DOCX.
        template_path: Optional DOCX reference document; when missing or
            not given, ``TEMPLATE_DOCX_PATH`` is used if it exists. Unused
            for PDF.

    Returns:
        The path of the generated file.

    Raises:
        FileNotFoundError: If ``input_file`` does not exist.
        ValueError: If ``output_format`` is neither ``"PDF"`` nor ``"DOCX"``.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    base_path = os.path.splitext(input_file)[0]

    if output_format == "PDF":
        output_file = base_path + ".pdf"
        with open(input_file, 'r', encoding='utf-8') as f:
            markdown_text = f.read()

        # The PDF is assembled manually: Markdown -> HTML -> wkhtmltopdf.
        full_html = _build_pdf_html(markdown_text, add_toc, font)
        options = {'encoding': "UTF-8"}
        pdfkit.from_string(full_html, output_file, configuration=config, options=options)

    elif output_format == "DOCX":
        output_file = base_path + ".docx"
        # pandoc converts the original, unmodified input file directly.
        pypandoc.convert_file(
            input_file,
            'docx',
            outputfile=output_file,
            extra_args=_build_pandoc_docx_args(add_toc, template_path),
            encoding='utf-8'
        )

    else:
        raise ValueError("Unsupported format")

    return output_file