# markdownconverter/core/core.py
import os
import re
import tempfile
from datetime import date

import docx
import pypandoc
import pdfkit
import markdown

from ..utils.logger import get_logger

log = get_logger(__name__)

# --- PDFKit Configuration ---
# Resolved once at import time. `config` is left as None when neither the
# default lookup nor the hard-coded Windows fallback path yields a
# wkhtmltopdf binary, so callers must check it before converting to PDF.
try:
    config = pdfkit.configuration()
    log.info("pdfkit configured using wkhtmltopdf from system PATH.")
except OSError:
    WKHTMLTOPDF_PATH = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"
    if os.path.exists(WKHTMLTOPDF_PATH):
        config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)
        log.info(f"pdfkit configured using fallback path: {WKHTMLTOPDF_PATH}")
    else:
        config = None
        log.warning("wkhtmltopdf not found. PDF conversion may fail.")


# --- Helper Functions ---
def _get_document_title(markdown_text):
    """Return the text of the first ``# `` heading in *markdown_text*.

    Falls back to ``"Untitled Document"`` when no such heading is found.
    """
    match = re.search(r"^\s*#\s+(.+)", markdown_text, re.MULTILINE)
    return match.group(1).strip() if match else "Untitled Document"


def _split_markdown_by_revision_history(markdown_text, separator_heading="## Revision Record"):
    """Split *markdown_text* into its revision-history section and the rest.

    The section starts at the first occurrence of *separator_heading* and
    runs up to (not including) the next newline that is followed by ``#``,
    or to the end of the text when no such line follows.

    Returns:
        A ``(rev_history_md, main_content_md)`` tuple of stripped strings.
        When the heading is absent, returns ``("", markdown_text)`` with the
        text unchanged.
    """
    # `\Z` lets the section be the last one in the document; without it a
    # trailing revision section (no heading after it) was never matched.
    pattern = re.compile(
        re.escape(separator_heading) + r".*?(?=\n#+|\Z)", re.DOTALL
    )
    match = pattern.search(markdown_text)
    if not match:
        log.warning(f"'{separator_heading}' section not found. No revision history will be added.")
        return "", markdown_text

    rev_history_md = match.group(0).strip()
    # Cut out exactly the matched span instead of str.replace(), which would
    # also delete any other identical text elsewhere in the document.
    remainder = markdown_text[:match.start()] + markdown_text[match.end():]
    main_content_md = remainder.strip()
    return rev_history_md, main_content_md


def _replace_text_in_paragraph(paragraph, placeholders):
    """Substitute placeholder keys in *paragraph* with their values.

    The text of all runs is joined, every key of *placeholders* is replaced
    by ``str(value)``, and the paragraph's runs are then replaced by a single
    new run. The new run receives the style of the original first run and a
    copy of its name, size, bold, italic, underline and RGB colour font
    settings. Paragraphs containing none of the keys are left untouched.
    """
    full_text = "".join(run.text for run in paragraph.runs)
    if not any(key in full_text for key in placeholders):
        return

    for key, value in placeholders.items():
        # str.replace is a no-op when the key is absent, so no extra check.
        full_text = full_text.replace(key, str(value))

    # Capture formatting from the first run before the runs are removed.
    style = paragraph.runs[0].style if paragraph.runs else None
    font = paragraph.runs[0].font if paragraph.runs else None

    p = paragraph._p
    for run in reversed(paragraph.runs):
        p.remove(run._r)

    new_run = paragraph.add_run(full_text)
    if style:
        new_run.style = style
    if font:
        new_run.font.name = font.name
        new_run.font.size = font.size
        new_run.font.bold = font.bold
        new_run.font.italic = font.italic
        new_run.font.underline = font.underline
        if font.color and font.color.rgb:
            new_run.font.color.rgb = font.color.rgb


def _replace_text_in_element(element, placeholders):
    """Apply placeholder replacement to *element* and, recursively, its tables.

    *element* must expose ``paragraphs`` and ``tables``; every cell of every
    table row is processed the same way.
    """
    for p in element.paragraphs:
        _replace_text_in_paragraph(p, placeholders)
    for table in element.tables:
        for row in table.rows:
            for cell in row.cells:
                _replace_text_in_element(cell, placeholders)


def _replace_text_placeholders(doc, placeholders):
    """Replace placeholders in the body of *doc* and in each section's header and footer."""
    log.info(f"Replacing text placeholders: {list(placeholders.keys())}")
    _replace_text_in_element(doc, placeholders)
    for section in doc.sections:
        _replace_text_in_element(section.header, placeholders)
        _replace_text_in_element(section.footer, placeholders)


def _find_placeholder_paragraph(doc, placeholder):
    """Return the first paragraph in ``doc.paragraphs`` whose joined run text contains *placeholder*.

    Returns ``None`` when no paragraph matches.
    """
    for p in doc.paragraphs:
        if placeholder in "".join(run.text for run in p.runs):
            return p
    return None


def _insert_docx_at_paragraph(paragraph, source_docx_path):
    """Replace *paragraph* with the body content of the document at *source_docx_path*.

    The source document's body elements are moved, in order, into the parent
    of *paragraph* at the paragraph's position; the paragraph itself is then
    removed.
    """
    parent = paragraph._p.getparent()
    index = parent.index(paragraph._p)
    source_doc = docx.Document(source_docx_path)
    # Snapshot the children first: inserting an element elsewhere detaches it
    # from the source body, so the body must not be iterated while it shrinks.
    for element in list(source_doc.element.body):
        tag = element.tag
        # Skip the source's body-level section properties: <w:sectPr> is only
        # valid as the last child of a body, so moving it into the middle of
        # the target would leave a stray section definition there.
        if isinstance(tag, str) and tag.endswith("}sectPr"):
            continue
        parent.insert(index, element)
        index += 1
    parent.remove(paragraph._p)


# --- Main
Conversion Function --- def convert_markdown(input_file, output_format, add_toc=False, template_path=None, metadata=None): if not os.path.exists(input_file): raise FileNotFoundError(f"Input file not found: {input_file}") log.info(f"Starting conversion of '{os.path.basename(input_file)}' to {output_format}.") with open(input_file, 'r', encoding='utf-8') as f: markdown_text = f.read() # --- CORREZIONE LOGICA PDF --- if output_format == "PDF": output_file = os.path.splitext(input_file)[0] + ".pdf" log.info("Starting PDF conversion using pdfkit.") if config is None: raise FileNotFoundError("wkhtmltopdf not found.") md_converter = markdown.Markdown(extensions=['toc', 'fenced_code', 'tables']) # Estrai il titolo dal testo markdown title = _get_document_title(markdown_text) # Converti il corpo del testo html_body = md_converter.convert(markdown_text) toc_html = "" # Genera il TOC se richiesto if add_toc and hasattr(md_converter, 'toc') and md_converter.toc: log.info("Generating Table of Contents for PDF.") # Mettiamo il TOC dopo il titolo principale, con un page-break toc_html = f"