197 lines
7.7 KiB
Python
197 lines
7.7 KiB
Python
# markdownconverter/core/core.py
|
|
|
|
import os
|
|
import re
|
|
import tempfile
|
|
from datetime import date
|
|
import docx
|
|
import pypandoc
|
|
import pdfkit
|
|
import markdown
|
|
from ..utils.logger import get_logger
|
|
|
|
log = get_logger(__name__)

# --- PDFKit Configuration ---
# Build the module-level ``config`` used by the PDF conversion path.
# First try pdfkit's own lookup; if that raises OSError, try a hard-coded
# Windows path. ``config`` is left as None when neither attempt succeeds.
try:
    config = pdfkit.configuration()
    log.info("pdfkit configured using wkhtmltopdf from system PATH.")
except OSError:
    WKHTMLTOPDF_PATH = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"
    if not os.path.exists(WKHTMLTOPDF_PATH):
        # No executable available: PDF conversion checks for this None later.
        config = None
        log.warning("wkhtmltopdf not found. PDF conversion may fail.")
    else:
        config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)
        log.info(f"pdfkit configured using fallback path: {WKHTMLTOPDF_PATH}")
|
|
|
|
# --- Helper Functions ---
|
|
|
|
def _get_document_title(markdown_text):
    """Return the text of the first level-1 ATX heading in *markdown_text*.

    A heading is a line made of optional leading spaces/tabs, a single ``#``,
    at least one space or tab, and some non-blank text. Surrounding
    whitespace is stripped from the returned title.

    Args:
        markdown_text: The Markdown source to scan.

    Returns:
        The heading text, or ``"Untitled Document"`` when no such heading
        exists.

    Note:
        Lines inside fenced code blocks are not skipped, so a ``# comment``
        line in a code sample can still be picked up as the title.
    """
    # Only spaces/tabs are allowed around the marker: ``\s`` would also match
    # a newline, letting an empty "#" line swallow the *next* line as the
    # title. ``\S`` makes sure the heading actually carries some text.
    match = re.search(r"^[ \t]*#[ \t]+(\S.*)", markdown_text, re.MULTILINE)
    return match.group(1).strip() if match else "Untitled Document"
|
|
|
|
def _split_markdown_by_revision_history(markdown_text, separator_heading="## Revision Record"):
    """Separate the revision-history section from the rest of the document.

    The section starts at the first occurrence of *separator_heading* and
    runs up to (not including) the next line that begins with ``#``, or to
    the end of the text when no further heading follows.

    Args:
        markdown_text: Full Markdown source.
        separator_heading: Literal heading text that opens the section.

    Returns:
        A ``(rev_history_md, main_content_md)`` tuple, both stripped of
        surrounding whitespace. When the heading is absent, returns
        ``("", markdown_text)`` with the text unchanged.
    """
    # ``\Z`` lets the section be the last one in the document; without it a
    # trailing revision record was reported as "not found".
    pattern = re.compile(
        re.escape(separator_heading) + r".*?(?=\n#|\Z)",
        re.DOTALL,
    )
    match = pattern.search(markdown_text)
    if not match:
        log.warning(f"'{separator_heading}' section not found. No revision history will be added.")
        return "", markdown_text

    rev_history_md = match.group(0).strip()
    # Cut out exactly the matched span rather than str.replace(), which would
    # also delete any other identical text elsewhere in the document.
    remainder = markdown_text[:match.start()] + markdown_text[match.end():]
    main_content_md = remainder.strip()
    return rev_history_md, main_content_md
|
|
|
|
def _replace_text_in_paragraph(paragraph, placeholders):
    """Substitute placeholder keys found in a paragraph's combined run text.

    The text of all runs is joined so that a placeholder split across
    several runs is still detected. When at least one key is present, every
    run is removed and a single new run holding the substituted text is
    added, carrying over the first original run's style and basic font
    attributes (name, size, bold, italic, underline, RGB colour).

    Args:
        paragraph: Paragraph-like object exposing ``runs``, ``_p`` and
            ``add_run``.
        placeholders: Mapping of placeholder key to replacement value;
            values are converted with ``str()``.
    """
    runs = paragraph.runs
    merged = "".join(piece.text for piece in runs)
    if all(token not in merged for token in placeholders):
        return

    for token, replacement in placeholders.items():
        if token in merged:
            merged = merged.replace(token, str(replacement))

    # Remember the formatting of the first run before the runs are dropped.
    first_run = runs[0] if runs else None
    style = first_run.style if first_run is not None else None
    font = first_run.font if first_run is not None else None

    p_element = paragraph._p
    for stale in reversed(list(runs)):
        p_element.remove(stale._r)

    new_run = paragraph.add_run(merged)
    if style:
        new_run.style = style
    if font:
        for attr in ("name", "size", "bold", "italic", "underline"):
            setattr(new_run.font, attr, getattr(font, attr))
        source_color = font.color
        if source_color and source_color.rgb:
            new_run.font.color.rgb = source_color.rgb
|
|
|
|
def _replace_text_in_element(element, placeholders):
    """Replace placeholders in an element's paragraphs and nested tables.

    Paragraphs directly under *element* are processed first; then every
    cell of every table is processed recursively, so tables nested inside
    cells are covered as well.

    Args:
        element: Object exposing ``paragraphs`` and ``tables``.
        placeholders: Mapping of placeholder key to replacement value.
    """
    for paragraph in element.paragraphs:
        _replace_text_in_paragraph(paragraph, placeholders)

    # Flatten tables -> rows -> cells lazily, preserving document order.
    nested_cells = (
        cell
        for table in element.tables
        for row in table.rows
        for cell in row.cells
    )
    for cell in nested_cells:
        _replace_text_in_element(cell, placeholders)
|
|
|
|
def _replace_text_placeholders(doc, placeholders):
    """Run placeholder substitution over a whole document.

    Covers the document body and, for each section, its header and footer.

    Args:
        doc: Document object exposing ``paragraphs``, ``tables`` and
            ``sections``.
        placeholders: Mapping of placeholder key to replacement value.
    """
    log.info(f"Replacing text placeholders: {list(placeholders.keys())}")

    # Body first, then the header followed by the footer of each section.
    _replace_text_in_element(doc, placeholders)
    for section in doc.sections:
        for part_name in ("header", "footer"):
            _replace_text_in_element(getattr(section, part_name), placeholders)
|
|
|
|
def _find_placeholder_paragraph(doc, placeholder):
    """Return the first body paragraph whose joined run text contains *placeholder*.

    Only ``doc.paragraphs`` is searched. Run texts are concatenated so a
    placeholder split across runs is still found.

    Args:
        doc: Document object exposing ``paragraphs``.
        placeholder: Literal text to look for.

    Returns:
        The matching paragraph, or ``None`` when there is none.
    """
    def _contains_placeholder(paragraph):
        return placeholder in "".join(run.text for run in paragraph.runs)

    return next(filter(_contains_placeholder, doc.paragraphs), None)
|
|
|
|
def _insert_docx_at_paragraph(paragraph, source_docx_path):
    """Replace *paragraph* with the body content of another DOCX file.

    The block-level children of the source document's body are inserted, in
    order, at the position of *paragraph*, which is then removed from its
    parent.

    Args:
        paragraph: Placeholder paragraph in the target document.
        source_docx_path: Path of the DOCX file whose body is inserted.

    Note:
        Only the body XML is transferred. NOTE(review): resources the source
        refers to through relationships (images, hyperlinks, numbering
        definitions) are presumably not carried over - confirm if such
        content is expected.
    """
    parent = paragraph._p.getparent()
    index = parent.index(paragraph._p)
    source_doc = docx.Document(source_docx_path)

    # Snapshot the children first: inserting an element under `parent`
    # moves it out of the source body, so the body must not be iterated
    # live while it is being emptied.
    for element in list(source_doc.element.body):
        tag = element.tag
        # The source body ends with its own section properties (w:sectPr).
        # Copying that into the middle of the target body would leave a
        # stray section definition there, so it is skipped.
        if isinstance(tag, str) and tag.endswith("}sectPr"):
            continue
        parent.insert(index, element)
        index += 1

    parent.remove(paragraph._p)
|
|
|
|
# --- Main Conversion Function ---
|
|
def _render_pdf(markdown_text, output_file, add_toc):
    """Convert Markdown text to HTML and write it to *output_file* as a PDF."""
    from html import escape

    log.info("Starting PDF conversion using pdfkit.")
    if config is None:
        raise FileNotFoundError("wkhtmltopdf not found.")

    md_converter = markdown.Markdown(extensions=['toc', 'fenced_code', 'tables'])

    # The title is raw heading text, so escape it before embedding it in
    # HTML; characters such as "&" or "<" would otherwise corrupt the markup.
    title = escape(_get_document_title(markdown_text))

    html_body = md_converter.convert(markdown_text)

    toc_html = ""
    if add_toc and getattr(md_converter, 'toc', None):
        log.info("Generating Table of Contents for PDF.")
        # The TOC sits right after the main title, followed by a page break.
        toc_html = f"<div style='page-break-after: always;'><h2>Table of Contents</h2>{md_converter.toc}</div>"

    # NOTE(review): html_body still contains the document's own H1, so the
    # title appears both here and in the body - confirm this is intended.
    full_html = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <title>{title}</title>
    </head>
    <body>
        <h1>{title}</h1>
        {toc_html}
        {html_body}
    </body>
    </html>
    """

    pdfkit.from_string(full_html, output_file, configuration=config, options={'encoding': "UTF-8"})
    log.info(f"PDF successfully generated: {output_file}")


def _markdown_to_temp_docx(markdown_source, temp_files, extra_args=()):
    """Convert Markdown to a temporary DOCX file via pandoc and return its path."""
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as handle:
        temp_path = handle.name
    # Register the file before converting so it is cleaned up even when
    # pandoc fails; the handle is already closed so pandoc can write freely.
    temp_files.append(temp_path)
    pypandoc.convert_text(
        markdown_source, 'docx', format='md',
        extra_args=list(extra_args), outputfile=temp_path,
    )
    return temp_path


def _render_docx(markdown_text, output_file, add_toc, template_path, metadata):
    """Fill a DOCX template with metadata and converted Markdown content."""
    from docx.enum.text import WD_BREAK

    if not template_path:
        raise FileNotFoundError("A DOCX template file is required.")

    doc = docx.Document(template_path)

    if metadata:
        # Work on a copy so the caller's dictionary is not modified.
        fields = dict(metadata)
        fields['DOC_PROJECT'] = fields.get('DOC_PROJECT') or _get_document_title(markdown_text)
        placeholders = {f"%%{key}%%": value for key, value in fields.items() if value}
        placeholders["%%DOC_DATE%%"] = date.today().strftime("%d/%m/%Y")
        _replace_text_placeholders(doc, placeholders)

    rev_history_md, main_content_md = _split_markdown_by_revision_history(markdown_text)

    temp_files = []
    try:
        rev_placeholder_p = _find_placeholder_paragraph(doc, "%%REVISION_RECORD%%")
        if rev_history_md and rev_placeholder_p is not None:
            rev_docx = _markdown_to_temp_docx(rev_history_md, temp_files)
            _insert_docx_at_paragraph(rev_placeholder_p, rev_docx)
        elif rev_history_md:
            log.warning("Template has no %%REVISION_RECORD%% placeholder; revision history was not inserted.")

        content_placeholder_p = _find_placeholder_paragraph(doc, "%%DOC_CONTENT%%")
        if main_content_md and content_placeholder_p is not None:
            pandoc_args = ["--toc"] if add_toc else []
            content_docx = _markdown_to_temp_docx(main_content_md, temp_files, pandoc_args)
            # Start the main content on a fresh page.
            content_placeholder_p.insert_paragraph_before().add_run().add_break(WD_BREAK.PAGE)
            _insert_docx_at_paragraph(content_placeholder_p, content_docx)
        elif main_content_md:
            log.warning("Template has no %%DOC_CONTENT%% placeholder; main content was not inserted.")

        doc.save(output_file)
        log.info(f"Document successfully created at {output_file}")
    finally:
        for temp_file in temp_files:
            if os.path.exists(temp_file):
                os.remove(temp_file)


def convert_markdown(input_file, output_format, add_toc=False, template_path=None, metadata=None):
    """Convert a Markdown file to PDF or DOCX next to the input file.

    The output path is the input path with its extension replaced by
    ``.pdf`` or ``.docx``.

    Args:
        input_file: Path of the UTF-8 Markdown file to convert.
        output_format: ``"PDF"`` or ``"DOCX"`` (case-sensitive).
        add_toc: When true, add a table of contents (an HTML block for PDF,
            pandoc's ``--toc`` option for DOCX).
        template_path: DOCX template containing ``%%...%%`` placeholders;
            required for DOCX output, unused for PDF.
        metadata: Optional mapping of placeholder names (without the ``%%``
            delimiters) to values for DOCX output. It is not modified.

    Returns:
        The path of the generated file.

    Raises:
        FileNotFoundError: If *input_file* does not exist, if wkhtmltopdf is
            unavailable for PDF output, or if no template is given for DOCX
            output.
        ValueError: If *output_format* is not supported.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")
    # Reject unknown formats before doing any work on the file.
    if output_format not in ("PDF", "DOCX"):
        raise ValueError(f"Unsupported output format: {output_format}")

    log.info(f"Starting conversion of '{os.path.basename(input_file)}' to {output_format}.")

    with open(input_file, 'r', encoding='utf-8') as f:
        markdown_text = f.read()

    base_path = os.path.splitext(input_file)[0]
    if output_format == "PDF":
        output_file = base_path + ".pdf"
        _render_pdf(markdown_text, output_file, add_toc)
    else:
        output_file = base_path + ".docx"
        _render_docx(markdown_text, output_file, add_toc, template_path, metadata)

    return output_file