# markdownconverter/core/core.py import os import re import sys import tempfile from datetime import date import docx import pypandoc import pdfkit import markdown import subprocess from docx.enum.text import WD_BREAK from docx2pdf import convert as convert_word from ..utils.logger import get_logger log = get_logger(__name__) # --- Custom Exceptions --- class TemplatePlaceholderError(ValueError): pass class ConverterNotFoundError(Exception): pass # --- PDFKit Configuration --- try: config = pdfkit.configuration() log.info("pdfkit configured using wkhtmltopdf from system PATH.") except OSError: WKHTMLTOPDF_PATH = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe" if os.path.exists(WKHTMLTOPDF_PATH): config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH) else: config = None log.warning("wkhtmltopdf not found. PDF conversion may fail.") def scan_template_for_placeholders(template_path: str) -> tuple[list[str], list[str]]: if not os.path.exists(template_path): raise FileNotFoundError(f"Template file not found: {template_path}") log.info(f"Scanning template '{os.path.basename(template_path)}' for placeholders.") structural_keys = {"%%REVISION_RECORD%%", "%%DOC_TOC%%", "%%DOC_CONTENT%%"} placeholder_pattern = re.compile(r"%%([A-Z0-9_]+)%%") found_placeholders = set() doc = docx.Document(template_path) def find_in_element(element): if hasattr(element, "paragraphs"): for p in element.paragraphs: for match in placeholder_pattern.finditer( "".join(run.text for run in p.runs) ): found_placeholders.add(match.group(0)) if hasattr(element, "tables"): for table in element.tables: for row in table.rows: for cell in row.cells: find_in_element(cell) find_in_element(doc) for section in doc.sections: find_in_element(section.header) find_in_element(section.footer) dynamic = sorted([p for p in found_placeholders if p not in structural_keys]) structural = sorted([p for p in found_placeholders if p in structural_keys]) log.info( f"Scan complete. Found {len(dynamic)} dynamic and {len(structural)} structural placeholders." ) return dynamic, structural def _get_document_title(markdown_text: str) -> str: match = re.search(r"^\s*#+\s+(.+)", markdown_text, re.MULTILINE) return match.group(1).strip() if match else "Untitled Document" def _split_markdown_by_revision_history( markdown_text: str, separator_heading="## Revision Record" ) -> tuple[str, str]: pattern = re.compile( f"({re.escape(separator_heading)}.*?)(?=\n#+)", re.DOTALL | re.S ) match = pattern.search(markdown_text) if not match: log.warning( f"'{separator_heading}' section not found. No revision history will be added." ) return "", markdown_text rev_history_md = match.group(0).strip() main_content_md = markdown_text.replace(rev_history_md, "", 1).strip() return rev_history_md, main_content_md def _replace_text_in_paragraph(paragraph, placeholders: dict[str, str]): full_text = "".join(run.text for run in paragraph.runs) if not any(key in full_text for key in placeholders): return for key, value in placeholders.items(): if key in full_text: full_text = full_text.replace(key, str(value)) style = paragraph.runs[0].style if paragraph.runs else None font = paragraph.runs[0].font if paragraph.runs else None for run in reversed(paragraph.runs): p = paragraph._p p.remove(run._r) new_run = paragraph.add_run(full_text) if style: new_run.style = style if font: new_run.font.name = font.name new_run.font.size = font.size new_run.font.bold = font.bold new_run.font.italic = font.italic new_run.font.underline = font.underline if font.color and font.color.rgb: new_run.font.color.rgb = font.color.rgb def _replace_text_in_element(element, placeholders: dict[str, str]): if hasattr(element, "paragraphs"): for p in element.paragraphs: _replace_text_in_paragraph(p, placeholders) if hasattr(element, "tables"): for table in element.tables: for row in table.rows: for cell in row.cells: _replace_text_in_element(cell, placeholders) def _replace_metadata_placeholders(doc: docx.Document, placeholders: dict[str, str]): log.info(f"Replacing metadata placeholders: {list(placeholders.keys())}") _replace_text_in_element(doc, placeholders) for section in doc.sections: _replace_text_in_element(section.header, placeholders) _replace_text_in_element(section.footer, placeholders) def _find_placeholder_paragraph(doc: docx.Document, placeholder: str): for p in doc.paragraphs: if placeholder in "".join(run.text for run in p.runs): return p return None def _insert_docx_at_paragraph(paragraph, source_docx_path: str): parent = paragraph._p.getparent() index = parent.index(paragraph._p) source_doc = docx.Document(source_docx_path) for element in source_doc.element.body: parent.insert(index, element) index += 1 parent.remove(paragraph._p) def _remove_paragraph(paragraph): if paragraph is None: return parent = paragraph._p.getparent() parent.remove(paragraph._p) def _add_revision_table(doc: docx.Document, rev_history_md: str): placeholder_p = _find_placeholder_paragraph(doc, "%%REVISION_RECORD%%") if not placeholder_p: log.warning("Revision record placeholder not found in template. Skipping.") return if not rev_history_md: log.info("No revision history content found. Removing placeholder.") _remove_paragraph(placeholder_p) return lines = [line.strip() for line in rev_history_md.strip().split("\n")] table_lines = [ line for line in lines if line.startswith("|") and not line.startswith("|:--") ] if not table_lines: log.warning( "Could not parse a markdown table from the revision history section." ) _remove_paragraph(placeholder_p) return table_data = [ [cell.strip() for cell in line.split("|")][1:-1] for line in table_lines ] if not table_data or len(table_data) < 1: log.warning("Revision history table is empty.") _remove_paragraph(placeholder_p) return log.info(f"Adding revision history table with {len(table_data)} rows.") table = doc.add_table(rows=1, cols=len(table_data[0])) table.style = "Table Grid" hdr_cells = table.rows[0].cells for i, header_text in enumerate(table_data[0]): hdr_cells[i].text = header_text for row_data in table_data[1:]: row_cells = table.add_row().cells for i, cell_text in enumerate(row_data): row_cells[i].text = cell_text parent = placeholder_p._p.getparent() parent.insert(parent.index(placeholder_p._p), table._tbl) _remove_paragraph(placeholder_p) def _convert_to_pdf(markdown_text: str, output_file: str, add_toc: bool): log.info("Starting PDF conversion using pdfkit.") if config is None: raise FileNotFoundError("wkhtmltopdf executable not found. Cannot create PDF.") title = _get_document_title(markdown_text) content_without_title = markdown_text match = re.search(r"^\s*#+\s+(.+)\n?", markdown_text, re.MULTILINE) if match: content_without_title = markdown_text[match.end() :] md_converter = markdown.Markdown(extensions=["toc", "fenced_code", "tables"]) html_body = md_converter.convert(content_without_title) toc_html = "" if add_toc and hasattr(md_converter, "toc") and md_converter.toc: log.info("Generating Table of Contents for PDF.") toc_html = f"