# markdownconverter/core/core.py
"""Core conversion routines of the Markdown converter.

Public entry points:

* ``scan_template_for_placeholders`` - list the ``%%NAME%%`` placeholders
  used by a DOCX template.
* ``convert_markdown`` - convert a Markdown file to PDF (through
  ``markdown`` + ``pdfkit``) or to DOCX (through ``pypandoc`` and a
  ``python-docx`` template).
* ``convert_docx_to_pdf`` - convert a DOCX file to PDF using MS Word
  (``docx2pdf``) with a LibreOffice fallback.
"""

import html
import os
import re
import shutil
import subprocess
import sys
import tempfile
from datetime import date
from typing import Optional

import docx
import markdown
import pdfkit
import pypandoc
from docx.enum.text import WD_BREAK
from docx2pdf import convert as convert_word

from ..utils.logger import get_logger

log = get_logger(__name__)


# --- Custom Exceptions ---
class TemplatePlaceholderError(ValueError):
    """Raised when a DOCX template lacks a required structural placeholder."""


class ConverterNotFoundError(Exception):
    """Raised when no DOCX-to-PDF converter (MS Word / LibreOffice) is usable."""


# --- Shared constants ---
# Placeholders that mark *where* generated content goes (as opposed to the
# "dynamic" ones, which are replaced by metadata values).
_STRUCTURAL_PLACEHOLDERS = frozenset(
    {"%%REVISION_RECORD%%", "%%DOC_TOC%%", "%%DOC_CONTENT%%"}
)
_PLACEHOLDER_PATTERN = re.compile(r"%%([A-Z0-9_]+)%%")
# A Markdown table delimiter row such as "|---|:--:|" or "| --- | --- |".
_TABLE_SEPARATOR_ROW = re.compile(r"^\|?\s*:?-+:?\s*(\|\s*:?-+:?\s*)*\|?$")
# Default install location probed before falling back to the PATH lookup.
_LIBREOFFICE_WINDOWS_PATH = r"C:\Program Files\LibreOffice\program\soffice.exe"

# --- PDFKit Configuration ---
WKHTMLTOPDF_PATH = r"C:\Program Files\wkhtmltopdf\bin\wkhtmltopdf.exe"
try:
    config = pdfkit.configuration()
    log.info("pdfkit configured using wkhtmltopdf from system PATH.")
except OSError:
    # Not on PATH: probe the default Windows install location.
    if os.path.exists(WKHTMLTOPDF_PATH):
        config = pdfkit.configuration(wkhtmltopdf=WKHTMLTOPDF_PATH)
    else:
        config = None
        log.warning("wkhtmltopdf not found. PDF conversion may fail.")


def _paragraph_text(paragraph) -> str:
    """Return the concatenated text of all runs of *paragraph*."""
    return "".join(run.text for run in paragraph.runs)


def _cut_span(text: str, start: int, end: int) -> str:
    """Return *text* without ``text[start:end]``.

    When nothing but whitespace precedes the span, the result is exactly
    ``text[end:]``. Otherwise the text before and after the span is kept and
    separated by a blank line so the two parts stay distinct Markdown blocks.
    """
    head, tail = text[:start], text[end:]
    if not head.strip():
        return tail
    return head.rstrip() + "\n\n" + tail.lstrip("\n")


def scan_template_for_placeholders(template_path: str) -> tuple[list[str], list[str]]:
    """Collect the ``%%NAME%%`` placeholders used in a DOCX template.

    The document body, tables (recursively, cell by cell) and the header and
    footer of every section are scanned.

    Args:
        template_path: Path of the DOCX template.

    Returns:
        ``(dynamic, structural)`` - two sorted lists of placeholder strings
        (including the surrounding ``%%``). ``structural`` holds the
        placeholders that mark insertion points for generated content;
        ``dynamic`` holds every other placeholder found.

    Raises:
        FileNotFoundError: If *template_path* does not exist.
    """
    if not os.path.exists(template_path):
        raise FileNotFoundError(f"Template file not found: {template_path}")

    log.info(f"Scanning template '{os.path.basename(template_path)}' for placeholders.")
    found_placeholders: set[str] = set()
    doc = docx.Document(template_path)

    def find_in_element(element) -> None:
        # Works for documents, headers/footers and table cells alike: each
        # may expose ``paragraphs`` and/or ``tables``.
        for paragraph in getattr(element, "paragraphs", ()):
            found_placeholders.update(
                match.group(0)
                for match in _PLACEHOLDER_PATTERN.finditer(_paragraph_text(paragraph))
            )
        for table in getattr(element, "tables", ()):
            for row in table.rows:
                for cell in row.cells:
                    find_in_element(cell)

    find_in_element(doc)
    for section in doc.sections:
        find_in_element(section.header)
        find_in_element(section.footer)

    dynamic = sorted(p for p in found_placeholders if p not in _STRUCTURAL_PLACEHOLDERS)
    structural = sorted(p for p in found_placeholders if p in _STRUCTURAL_PLACEHOLDERS)
    log.info(
        f"Scan complete. Found {len(dynamic)} dynamic and {len(structural)} structural placeholders."
    )
    return dynamic, structural


def _get_document_title(markdown_text: str) -> str:
    """Return the text of the first Markdown heading, or a default title."""
    match = re.search(r"^\s*#+\s+(.+)", markdown_text, re.MULTILINE)
    return match.group(1).strip() if match else "Untitled Document"


def _split_markdown_by_revision_history(
    markdown_text: str, separator_heading="## Revision Record"
) -> tuple[str, str]:
    """Split the revision-history section off the rest of the document.

    The section starts at *separator_heading* and runs up to the next line
    that starts with ``#`` or, when it is the last section, to the end of the
    text.

    Returns:
        ``(revision_history_md, main_content_md)``. The first item is ``""``
        (and the second the unmodified input) when the heading is not found.
    """
    # ``|\Z`` lets the section be the last one in the file; without it a
    # trailing revision record was never detected.
    pattern = re.compile(
        f"({re.escape(separator_heading)}.*?)(?=\n#+|\\Z)", re.DOTALL
    )
    match = pattern.search(markdown_text)
    if not match:
        log.warning(
            f"'{separator_heading}' section not found. No revision history will be added."
        )
        return "", markdown_text

    rev_history_md = match.group(0).strip()
    main_content_md = _cut_span(markdown_text, match.start(), match.end()).strip()
    return rev_history_md, main_content_md


def _replace_text_in_paragraph(paragraph, placeholders: dict[str, str]):
    """Replace placeholder keys in *paragraph* with their values.

    Placeholders may be split across several runs, so the text of all runs is
    joined, substituted, and written back as a single new run that copies the
    style and basic font attributes of the first original run. Paragraphs
    that contain no placeholder are left untouched.
    """
    runs = paragraph.runs
    full_text = "".join(run.text for run in runs)
    # An empty key would match everywhere and corrupt the text; skip it.
    keys_present = [key for key in placeholders if key and key in full_text]
    if not keys_present:
        return

    for key, value in placeholders.items():
        if key and key in full_text:
            full_text = full_text.replace(key, str(value))

    # Capture the formatting of the first run before the runs are removed.
    first_run = runs[0] if runs else None
    style = first_run.style if first_run is not None else None
    font = first_run.font if first_run is not None else None

    for run in reversed(runs):
        paragraph._p.remove(run._r)

    new_run = paragraph.add_run(full_text)
    if style:
        new_run.style = style
    if font:
        new_run.font.name = font.name
        new_run.font.size = font.size
        new_run.font.bold = font.bold
        new_run.font.italic = font.italic
        new_run.font.underline = font.underline
        if font.color and font.color.rgb:
            new_run.font.color.rgb = font.color.rgb


def _replace_text_in_element(element, placeholders: dict[str, str]):
    """Apply placeholder replacement to every paragraph in *element*.

    Tables are walked recursively, cell by cell.
    """
    for paragraph in getattr(element, "paragraphs", ()):
        _replace_text_in_paragraph(paragraph, placeholders)
    for table in getattr(element, "tables", ()):
        for row in table.rows:
            for cell in row.cells:
                _replace_text_in_element(cell, placeholders)


def _replace_metadata_placeholders(doc: docx.Document, placeholders: dict[str, str]):
    """Replace placeholders in the body and in all section headers/footers."""
    log.info(f"Replacing metadata placeholders: {list(placeholders.keys())}")
    _replace_text_in_element(doc, placeholders)
    for section in doc.sections:
        _replace_text_in_element(section.header, placeholders)
        _replace_text_in_element(section.footer, placeholders)


def _find_placeholder_paragraph(doc: docx.Document, placeholder: str):
    """Return the first body paragraph containing *placeholder*, else ``None``.

    Only ``doc.paragraphs`` is searched, i.e. paragraphs inside tables,
    headers or footers are not considered.
    """
    for paragraph in doc.paragraphs:
        if placeholder in _paragraph_text(paragraph):
            return paragraph
    return None


def _insert_docx_at_paragraph(paragraph, source_docx_path: str):
    """Replace *paragraph* with the body content of another DOCX file.

    The body elements of the source document are moved, in order, to the
    position of *paragraph*, which is then removed.
    """
    parent = paragraph._p.getparent()
    index = parent.index(paragraph._p)
    source_doc = docx.Document(source_docx_path)

    # Iterate over a snapshot: inserting an element elsewhere detaches it
    # from the source body.
    for element in list(source_doc.element.body):
        tag = element.tag
        # The source body ends with its section properties (w:sectPr). That
        # element is only valid as the last child of a body, so it must not
        # be copied into the middle of the target document.
        if isinstance(tag, str) and tag.endswith("}sectPr"):
            continue
        parent.insert(index, element)
        index += 1

    parent.remove(paragraph._p)


def _remove_paragraph(paragraph):
    """Detach *paragraph* from its document; ``None`` is silently ignored."""
    if paragraph is None:
        return
    parent = paragraph._p.getparent()
    if parent is not None:  # already detached -> nothing to do
        parent.remove(paragraph._p)


def _parse_table_row(line: str) -> list[str]:
    """Split one ``| a | b |`` Markdown table line into stripped cell texts."""
    inner = line[1:] if line.startswith("|") else line
    if inner.endswith("|"):
        inner = inner[:-1]
    return [cell.strip() for cell in inner.split("|")]


def _add_revision_table(doc: docx.Document, rev_history_md: str):
    """Replace the ``%%REVISION_RECORD%%`` placeholder with a table.

    The table is parsed from the Markdown pipe table found in
    *rev_history_md*; its first row is used as the header row. When no
    content or no table is available the placeholder paragraph is removed.
    """
    placeholder_p = _find_placeholder_paragraph(doc, "%%REVISION_RECORD%%")
    if placeholder_p is None:
        log.warning("Revision record placeholder not found in template. Skipping.")
        return

    if not rev_history_md:
        log.info("No revision history content found. Removing placeholder.")
        _remove_paragraph(placeholder_p)
        return

    lines = [line.strip() for line in rev_history_md.strip().split("\n")]
    # Keep table rows, drop the header delimiter row in any of its spellings
    # ("|:--|", "|---|", "| --- | --- |", ...).
    table_lines = [
        line
        for line in lines
        if line.startswith("|")
        and not line.startswith("|:--")
        and not _TABLE_SEPARATOR_ROW.match(line)
    ]
    if not table_lines:
        log.warning(
            "Could not parse a markdown table from the revision history section."
        )
        _remove_paragraph(placeholder_p)
        return

    table_data = [_parse_table_row(line) for line in table_lines]
    header_row, body_rows = table_data[0], table_data[1:]

    log.info(f"Adding revision history table with {len(table_data)} rows.")
    table = doc.add_table(rows=1, cols=len(header_row))
    table.style = "Table Grid"
    for cell, header_text in zip(table.rows[0].cells, header_row):
        cell.text = header_text
    for row_data in body_rows:
        # zip() drops surplus cells of malformed rows instead of raising.
        for cell, cell_text in zip(table.add_row().cells, row_data):
            cell.text = cell_text

    # add_table() appended the table at the end of the document; move it to
    # the placeholder position.
    parent = placeholder_p._p.getparent()
    parent.insert(parent.index(placeholder_p._p), table._tbl)
    _remove_paragraph(placeholder_p)


def _convert_to_pdf(markdown_text: str, output_file: str, add_toc: bool):
    """Render *markdown_text* to a PDF file using ``markdown`` and ``pdfkit``.

    The first heading becomes the document title and is removed from the
    body. When *add_toc* is true, a table of contents is placed before the
    body.

    Raises:
        FileNotFoundError: If no wkhtmltopdf executable was found at import
            time.
    """
    log.info("Starting PDF conversion using pdfkit.")
    if config is None:
        raise FileNotFoundError("wkhtmltopdf executable not found. Cannot create PDF.")

    title = _get_document_title(markdown_text)
    content_without_title = markdown_text
    match = re.search(r"^\s*#+\s+(.+)\n?", markdown_text, re.MULTILINE)
    if match:
        # Cut only the heading itself; text preceding it is preserved.
        content_without_title = _cut_span(markdown_text, match.start(), match.end())

    # 'nl2br' turns single newlines into line breaks.
    md_converter = markdown.Markdown(
        extensions=["toc", "fenced_code", "tables", "nl2br"]
    )
    html_body = md_converter.convert(content_without_title)

    # NOTE(review): the HTML markup of the two templates below was lost in
    # the copy of this file that was reviewed and has been reconstructed -
    # confirm tags/styling against the original source.
    toc_html = ""
    if add_toc and hasattr(md_converter, "toc") and md_converter.toc:
        log.info("Generating Table of Contents for PDF.")
        toc_html = f"<h2>Table of Contents</h2>{md_converter.toc}<hr>"

    # The title is plain text taken from the Markdown source; escape it so
    # characters such as '<' or '&' cannot break the generated page.
    safe_title = html.escape(title)
    full_html = (
        '<!DOCTYPE html><html><head><meta charset="UTF-8">'
        f"<title>{safe_title}</title></head>"
        f"<body><h1>{safe_title}</h1>{toc_html}{html_body}</body></html>"
    )

    pdf_options = {"encoding": "UTF-8", "enable-local-file-access": None}
    pdfkit.from_string(
        full_html, output_file, configuration=config, options=pdf_options
    )
    log.info(f"PDF successfully generated: {output_file}")


def _convert_to_docx(
    markdown_text: str,
    output_file: str,
    template_path: str,
    metadata: dict,
    add_toc: bool,
):
    """Render *markdown_text* into a DOCX file based on a template.

    Steps: validate the template placeholders, fill in metadata, insert the
    revision table, convert the remaining Markdown with pandoc and insert the
    result at ``%%DOC_TOC%%`` (when *add_toc* is true; pandoc then emits the
    TOC followed by the content) or at ``%%DOC_CONTENT%%``. The unused
    placeholder is removed.

    Args:
        markdown_text: Full Markdown source.
        output_file: Path of the DOCX file to write.
        template_path: Path of the DOCX template.
        metadata: Mapping of placeholder -> replacement value. It is not
            modified; derived values are added to a private copy.
        add_toc: Whether pandoc should generate a table of contents.

    Raises:
        FileNotFoundError: If the template does not exist.
        TemplatePlaceholderError: If a structural placeholder is missing.
    """
    log.info("Starting DOCX conversion.")
    dynamic_placeholders, structural_placeholders = scan_template_for_placeholders(
        template_path
    )
    missing = _STRUCTURAL_PLACEHOLDERS - set(structural_placeholders)
    if missing:
        raise TemplatePlaceholderError(
            f"Template is missing required structural placeholders: {', '.join(sorted(missing))}"
        )

    doc = docx.Document(template_path)

    # Work on a copy so the caller's dictionary is not changed behind its back.
    metadata = dict(metadata or {})
    if "%%DOC_PROJECT%%" in dynamic_placeholders and not metadata.get(
        "%%DOC_PROJECT%%"
    ):
        metadata["%%DOC_PROJECT%%"] = _get_document_title(markdown_text)
    if "%%DOC_DATE%%" in dynamic_placeholders:
        metadata["%%DOC_DATE%%"] = date.today().strftime("%d/%m/%Y")
    _replace_metadata_placeholders(doc, metadata)

    rev_history_md, main_content_md = _split_markdown_by_revision_history(markdown_text)
    _add_revision_table(doc, rev_history_md)

    temp_path = None
    pandoc_format = "markdown+hard_line_breaks"
    try:
        if main_content_md:
            content_for_pandoc = main_content_md

            # Step 1: Remove the main H1 document title from the content to be
            # processed. It's used for metadata, not for the main body's
            # numbering. Only the heading is cut; preceding text is kept.
            match = re.search(r"^\s*#\s+(.+)\n?", content_for_pandoc, re.MULTILINE)
            if match:
                log.info("Removing main H1 title from content body.")
                content_for_pandoc = _cut_span(
                    content_for_pandoc, match.start(), match.end()
                )

            # Step 2: Strip any existing manual numbering from headings
            # (e.g., "## 1. Title") to prevent double numbering when automatic
            # numbering is applied.
            log.info("Stripping manual numbering from headings for auto-numbering.")
            content_for_pandoc = re.sub(
                r"^(\s*#+)\s+[0-9\.]+\s+",
                r"\1 ",
                content_for_pandoc,
                flags=re.MULTILINE,
            )

            # Step 3: Configure Pandoc arguments for correct hierarchical
            # numbering.
            pandoc_args = [
                # Enable automatic section numbering (e.g., 1, 1.1, 1.1.1).
                "--number-sections",
                # Shift heading levels up by one. This maps:
                # ##  (H2 in MD) -> Heading 1 in DOCX (numbered as 1, 2, ...)
                # ### (H3 in MD) -> Heading 2 in DOCX (numbered as 1.1, 1.2, ...)
                "--shift-heading-level-by=-1",
                # Keep text left-aligned.
                "--variable=justify:false",
            ]

            # With a TOC, pandoc's output (TOC + content) goes to the TOC
            # placeholder; without one, the content goes to its own
            # placeholder. The other placeholder is simply removed.
            if add_toc:
                pandoc_args.append("--toc")
                log.info("Adding page break before Table of Contents.")
                target_key, unused_key = "%%DOC_TOC%%", "%%DOC_CONTENT%%"
            else:
                log.info("Adding page break before main content.")
                target_key, unused_key = "%%DOC_CONTENT%%", "%%DOC_TOC%%"

            target_p = _find_placeholder_paragraph(doc, target_key)
            if target_p is not None:
                target_p.insert_paragraph_before().add_run().add_break(WD_BREAK.PAGE)

            # Create the temp file, close our handle and register it for
            # cleanup *before* pandoc writes to it, so it is removed even when
            # the conversion fails.
            fd, temp_path = tempfile.mkstemp(suffix=".docx")
            os.close(fd)
            pypandoc.convert_text(
                content_for_pandoc,
                "docx",
                format=pandoc_format,
                extra_args=pandoc_args,
                outputfile=temp_path,
            )

            if target_p is not None:
                _insert_docx_at_paragraph(target_p, temp_path)
            else:
                log.warning(
                    f"Placeholder {target_key} not found in a body paragraph. "
                    "Converted content was not inserted."
                )
            _remove_paragraph(_find_placeholder_paragraph(doc, unused_key))
        else:
            # If there is no main content, remove both placeholders.
            _remove_paragraph(_find_placeholder_paragraph(doc, "%%DOC_TOC%%"))
            _remove_paragraph(_find_placeholder_paragraph(doc, "%%DOC_CONTENT%%"))

        doc.save(output_file)
        log.info(f"Document successfully created at {output_file}")
    finally:
        if temp_path and os.path.exists(temp_path):
            os.remove(temp_path)


def _find_libreoffice() -> Optional[str]:
    """Return the path of a LibreOffice executable, or ``None``.

    The default Windows install location is tried first, then the PATH.
    """
    if os.path.exists(_LIBREOFFICE_WINDOWS_PATH):
        return _LIBREOFFICE_WINDOWS_PATH
    return shutil.which("soffice") or shutil.which("libreoffice")


def convert_docx_to_pdf(input_docx_path: str, output_pdf_path: str) -> str:
    """Convert a DOCX file to PDF.

    MS Word (through ``docx2pdf``) is tried first; if that fails for any
    reason, LibreOffice is run in headless mode.

    Args:
        input_docx_path: Existing DOCX file.
        output_pdf_path: Path of the PDF to create.

    Returns:
        *output_pdf_path*.

    Raises:
        FileNotFoundError: If the input file is missing, or LibreOffice
            finished without producing the expected output file.
        ConverterNotFoundError: If MS Word failed and LibreOffice is not
            installed.
        subprocess.TimeoutExpired: If LibreOffice runs longer than 60 s.
        subprocess.CalledProcessError: If LibreOffice exits with an error.
    """
    if not os.path.exists(input_docx_path):
        raise FileNotFoundError(f"Input DOCX file not found: {input_docx_path}")

    try:
        log.info("Attempting DOCX to PDF conversion using MS Word.")
        convert_word(input_docx_path, output_pdf_path)
    except Exception as e:  # any failure here means "try the fallback"
        log.warning(f"MS Word conversion failed. It might not be installed. Error: {e}")
    else:
        log.info(f"Successfully converted using MS Word: {output_pdf_path}")
        return output_pdf_path

    log.info("Falling back to LibreOffice conversion.")
    libreoffice_path = _find_libreoffice()
    if libreoffice_path is None:
        log.error("LibreOffice executable not found. Cannot convert DOCX to PDF.")
        raise ConverterNotFoundError(
            "Neither MS Word nor LibreOffice could be used. Please install one to use this feature."
        )

    if sys.platform == "win32":
        # NOTE(review): this force-kills *every* running LibreOffice process,
        # including documents the user may have open. Kept as in the original
        # implementation; consider a dedicated user profile instead.
        try:
            log.debug(
                "Attempting to terminate existing LibreOffice processes on Windows."
            )
            for image_name in ("soffice.exe", "soffice.bin"):
                subprocess.run(
                    ["taskkill", "/f", "/im", image_name],
                    check=False,
                    capture_output=True,
                )
            log.debug("Termination commands sent.")
        except OSError as kill_e:
            log.warning(f"Could not terminate existing LibreOffice processes: {kill_e}")

    # Use absolute, normalised paths: a bare file name would otherwise yield
    # an empty --outdir and make the "same file?" comparison below unreliable.
    target_pdf = os.path.abspath(output_pdf_path)
    output_dir = os.path.dirname(target_pdf)
    log.info(f"Attempting conversion using LibreOffice at: {libreoffice_path}")
    try:
        # LibreOffice names its output after the input file.
        input_stem = os.path.splitext(os.path.basename(input_docx_path))[0]
        expected_lo_output = os.path.join(output_dir, input_stem + ".pdf")
        command = [
            libreoffice_path,
            "--headless",
            "--convert-to",
            "pdf",
            "--outdir",
            output_dir,
            input_docx_path,
        ]
        process = subprocess.run(
            command,
            check=True,
            capture_output=True,
            text=True,
            encoding="utf-8",
            errors="ignore",
            timeout=60,
        )
        log.debug(f"LibreOffice stdout: {process.stdout}")
        log.debug(f"LibreOffice stderr: {process.stderr}")

        if not os.path.exists(expected_lo_output):
            raise FileNotFoundError(
                f"LibreOffice conversion process finished, but the output file was not found at the expected path: {expected_lo_output}"
            )
        if os.path.normcase(expected_lo_output) != os.path.normcase(target_pdf):
            # os.replace overwrites an existing target on every platform.
            os.replace(expected_lo_output, target_pdf)

        log.info(f"Successfully converted using LibreOffice: {output_pdf_path}")
        return output_pdf_path
    except subprocess.TimeoutExpired:
        log.error("LibreOffice conversion timed out after 60 seconds.")
        raise
    except (subprocess.CalledProcessError, FileNotFoundError) as e:
        log.error(f"LibreOffice conversion failed. Error: {e}", exc_info=True)
        raise


def convert_markdown(
    input_file: str,
    output_path: str,
    output_format: str,
    add_toc: bool = False,
    template_path: Optional[str] = None,
    metadata: Optional[dict] = None,
):
    """Convert a Markdown file to PDF or DOCX.

    Args:
        input_file: Path of the UTF-8 encoded Markdown file.
        output_path: Path of the file to create.
        output_format: ``"PDF"`` or ``"DOCX"`` (case-insensitive).
        add_toc: Whether to generate a table of contents.
        template_path: DOCX template; required for DOCX output.
        metadata: Placeholder -> value mapping for the DOCX template.

    Returns:
        *output_path*.

    Raises:
        FileNotFoundError: If *input_file* (or the template) does not exist.
        ValueError: If the format is unsupported, or DOCX output is requested
            without a template.
    """
    if not os.path.exists(input_file):
        raise FileNotFoundError(f"Input file not found: {input_file}")

    log.info(
        f"Starting conversion of '{os.path.basename(input_file)}' to {output_format}."
    )
    # 'utf-8-sig' also accepts plain UTF-8 and strips a leading BOM, which
    # would otherwise prevent the title heading from being recognised.
    with open(input_file, "r", encoding="utf-8-sig") as f:
        markdown_text = f.read()

    normalized_format = (
        output_format.upper() if isinstance(output_format, str) else output_format
    )
    if normalized_format == "PDF":
        _convert_to_pdf(markdown_text, output_path, add_toc)
    elif normalized_format == "DOCX":
        if template_path is None:
            raise ValueError("A template_path is required for DOCX output.")
        if metadata is None:
            metadata = {}
        _convert_to_docx(markdown_text, output_path, template_path, metadata, add_toc)
    else:
        raise ValueError(f"Unsupported output format: {output_format}")

    return output_path