# markdownconverter/core/core.py
"""Convert a Markdown file to PDF (pdfkit/wkhtmltopdf) or DOCX (pandoc + template)."""

import html
import os
import re
import tempfile
from datetime import date

import docx
import markdown
import pdfkit
import pypandoc
from docx.enum.text import WD_BREAK

from ..utils.logger import get_logger

log = get_logger(__name__)

# --- PDFKit Configuration ---
# Resolved once at import time; `config` stays None when no wkhtmltopdf
# executable can be located, and the PDF branch refuses to run in that case.
try:
    config = pdfkit.configuration()
    log.info("pdfkit configured using wkhtmltopdf from system PATH.")
except OSError:
    WKHTMLTOPDF_PATH = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"
    if os.path.exists(WKHTMLTOPDF_PATH):
        config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)
        log.info(f"pdfkit configured using fallback path: {WKHTMLTOPDF_PATH}")
    else:
        config = None
        log.warning("wkhtmltopdf not found. PDF conversion may fail.")


# --- Helper Functions ---
def _get_document_title(markdown_text):
    """Return the text of the first level-1 heading, or "Untitled Document"."""
    match = re.search(r"^\s*#\s+(.+)", markdown_text, re.MULTILINE)
    return match.group(1).strip() if match else "Untitled Document"


def _split_markdown_by_revision_history(markdown_text, separator_heading="## Revision Record"):
    """Split the revision-history section out of the Markdown text.

    The section starts at ``separator_heading`` and runs up to the next line
    that begins with ``#``, or to the end of the text when it is the last
    section.

    Returns:
        A ``(rev_history_md, main_content_md)`` tuple. When the heading is
        absent the first item is ``""`` and the second is the input unchanged.
    """
    # `\Z` lets the section match even when no heading follows it.
    pattern = re.compile(rf"{re.escape(separator_heading)}.*?(?=\n#+|\Z)", re.DOTALL)
    match = pattern.search(markdown_text)
    if not match:
        log.warning(f"'{separator_heading}' section not found. No revision history will be added.")
        return "", markdown_text

    rev_history_md = match.group(0).strip()
    # Cut out exactly the matched span instead of str.replace(), which would
    # also drop any other occurrence of the same text.
    main_content_md = (markdown_text[:match.start()] + markdown_text[match.end():]).strip()
    return rev_history_md, main_content_md


def _replace_text_in_paragraph(paragraph, placeholders):
    """Substitute placeholder keys in a paragraph, merging its runs into one.

    The text is joined across runs so a placeholder split over several runs is
    still found. The merged text is written into the first run (keeping that
    run's formatting and position) and the remaining runs are removed.
    """
    runs = paragraph.runs
    if not runs:
        return

    original_text = "".join(run.text for run in runs)
    new_text = original_text
    for key, value in placeholders.items():
        new_text = new_text.replace(key, str(value))
    if new_text == original_text:
        return

    first_run, *other_runs = runs
    first_run.text = new_text
    for run in other_runs:
        run._r.getparent().remove(run._r)


def _replace_text_in_element(element, placeholders):
    """Apply placeholder substitution to an element's paragraphs and, recursively, its table cells."""
    for paragraph in element.paragraphs:
        _replace_text_in_paragraph(paragraph, placeholders)
    for table in element.tables:
        for row in table.rows:
            for cell in row.cells:
                _replace_text_in_element(cell, placeholders)


def _replace_text_placeholders(doc, placeholders):
    """Substitute placeholders in the document body and in every section's header and footer."""
    log.info(f"Replacing text placeholders: {list(placeholders.keys())}")
    _replace_text_in_element(doc, placeholders)
    for section in doc.sections:
        _replace_text_in_element(section.header, placeholders)
        _replace_text_in_element(section.footer, placeholders)


def _find_placeholder_paragraph(doc, placeholder):
    """Return the first paragraph in ``doc.paragraphs`` containing ``placeholder``, else None.

    Only ``doc.paragraphs`` is searched; paragraphs inside tables, headers or
    footers are not considered.
    """
    for paragraph in doc.paragraphs:
        if placeholder in "".join(run.text for run in paragraph.runs):
            return paragraph
    return None


def _insert_docx_at_paragraph(paragraph, source_docx_path):
    """Replace ``paragraph`` with the body content of the DOCX at ``source_docx_path``.

    The source body elements are moved (not copied) into the target, in order,
    at the position of ``paragraph``, which is then removed.
    """
    parent = paragraph._p.getparent()
    index = parent.index(paragraph._p)
    source_doc = docx.Document(source_docx_path)
    # Snapshot the children first: inserting an element elsewhere detaches it
    # from the source body.
    for element in list(source_doc.element.body):
        # The body's section properties describe the source document's page
        # layout and must not end up in the middle of the target body.
        if element.tag.endswith("}sectPr"):
            continue
        parent.insert(index, element)
        index += 1
    parent.remove(paragraph._p)


def _build_pdf_html(markdown_text, add_toc):
    """Render Markdown to a complete HTML document for wkhtmltopdf.

    NOTE(review): the HTML tags of this template were missing from the source
    under review; the markup below is a reconstruction (title in <title> and
    <h1>, optional TOC followed by a page break). Confirm against the
    intended layout.
    """
    md_converter = markdown.Markdown(extensions=['toc', 'fenced_code', 'tables'])
    # Escape the title: it is raw Markdown text interpolated into HTML.
    title = html.escape(_get_document_title(markdown_text))
    html_body = md_converter.convert(markdown_text)

    toc_html = ""
    if add_toc and hasattr(md_converter, 'toc') and md_converter.toc:
        log.info("Generating Table of Contents for PDF.")
        # The TOC goes after the main title and is followed by a page break.
        toc_html = (
            '<div style="page-break-after: always;">'
            f"<h2>Table of Contents</h2>{md_converter.toc}"
            "</div>"
        )

    # NOTE(review): if the Markdown itself starts with a level-1 heading, the
    # title appears both here and in html_body — confirm this is intended.
    return f"""<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<title>{title}</title>
</head>
<body>
<h1>{title}</h1>
{toc_html}
{html_body}
</body>
</html>
"""


def _convert_to_pdf(markdown_text, output_file, add_toc):
    """Write ``markdown_text`` as a PDF to ``output_file`` using pdfkit."""
    log.info("Starting PDF conversion using pdfkit.")
    if config is None:
        raise FileNotFoundError("wkhtmltopdf not found.")
    full_html = _build_pdf_html(markdown_text, add_toc)
    pdfkit.from_string(full_html, output_file, configuration=config, options={'encoding': "UTF-8"})
    log.info(f"PDF successfully generated: {output_file}")


def _markdown_to_temp_docx(markdown_source, temp_files, extra_args=None):
    """Convert Markdown to a temporary DOCX via pandoc and return its path.

    The path is appended to ``temp_files`` *before* pandoc runs so the caller's
    cleanup removes the file even when the conversion raises.
    """
    with tempfile.NamedTemporaryFile(delete=False, suffix=".docx") as temp_file:
        temp_path = temp_file.name
    temp_files.append(temp_path)
    pypandoc.convert_text(
        markdown_source,
        'docx',
        format='md',
        extra_args=extra_args or [],
        outputfile=temp_path,
    )
    return temp_path


def _convert_to_docx(markdown_text, output_file, add_toc, template_path, metadata):
    """Fill the DOCX template with metadata, revision history and main content."""
    if not template_path:
        raise FileNotFoundError("A DOCX template file is required.")
    doc = docx.Document(template_path)

    if metadata:
        # Work on a copy so the caller's dict is not modified.
        values = dict(metadata)
        values['DOC_PROJECT'] = values.get('DOC_PROJECT') or _get_document_title(markdown_text)
        placeholders = {f"%%{key}%%": value for key, value in values.items() if value}
        placeholders["%%DOC_DATE%%"] = date.today().strftime("%d/%m/%Y")
        _replace_text_placeholders(doc, placeholders)

    rev_history_md, main_content_md = _split_markdown_by_revision_history(markdown_text)
    temp_files = []
    try:
        rev_placeholder_p = _find_placeholder_paragraph(doc, "%%REVISION_RECORD%%")
        if rev_history_md and rev_placeholder_p is not None:
            rev_docx = _markdown_to_temp_docx(rev_history_md, temp_files)
            _insert_docx_at_paragraph(rev_placeholder_p, rev_docx)

        content_placeholder_p = _find_placeholder_paragraph(doc, "%%DOC_CONTENT%%")
        if main_content_md and content_placeholder_p is not None:
            pandoc_args = ["--toc"] if add_toc else []
            content_docx = _markdown_to_temp_docx(main_content_md, temp_files, pandoc_args)
            # Start the main content on a fresh page.
            content_placeholder_p.insert_paragraph_before().add_run().add_break(WD_BREAK.PAGE)
            _insert_docx_at_paragraph(content_placeholder_p, content_docx)

        doc.save(output_file)
        log.info(f"Document successfully created at {output_file}")
    finally:
        for temp_path in temp_files:
            if os.path.exists(temp_path):
                os.remove(temp_path)


# --- Main Conversion Function ---
def convert_markdown(input_file, output_format, add_toc=False, template_path=None, metadata=None):
    """Convert a Markdown file to PDF or DOCX next to the input file.

    Args:
        input_file: Path to the UTF-8 Markdown file.
        output_format: ``"PDF"`` or ``"DOCX"``.
        add_toc: Include a table of contents in the output.
        template_path: DOCX template; required for ``"DOCX"``.
        metadata: Optional mapping of placeholder names to values; each
            non-empty entry replaces ``%%NAME%%`` in the template.

    Returns:
        The path of the generated file (input path with the new extension).

    Raises:
        FileNotFoundError: The input file is missing, wkhtmltopdf is not
            available (PDF), or no template was given (DOCX).
        ValueError: ``output_format`` is not supported.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    log.info(f"Starting conversion of '{os.path.basename(input_file)}' to {output_format}.")
    with open(input_file, 'r', encoding='utf-8') as f:
        markdown_text = f.read()

    base_path = os.path.splitext(input_file)[0]
    if output_format == "PDF":
        output_file = base_path + ".pdf"
        _convert_to_pdf(markdown_text, output_file, add_toc)
    elif output_format == "DOCX":
        output_file = base_path + ".docx"
        _convert_to_docx(markdown_text, output_file, add_toc, template_path, metadata)
    else:
        raise ValueError(f"Unsupported output format: {output_format}")

    return output_file