# MCP_CyberArk/docs/md_to_docx.py

"""
Convert the three MCP documentation Markdown files to Word (.docx) format.

Handles:
  - Heading levels 1–4
  - Bold (**text**) and inline code (`text`)
  - Fenced code blocks (``` ... ```)
  - Tables (| col | col |)
  - Unordered lists (- item, * item)
  - Ordered lists (1. item)
  - Horizontal rules (---)
  - Blank lines → paragraph spacing

Run:
    python docs/md_to_docx.py
Produces:
    docs/HLD.docx
    docs/LLD.docx
    docs/MANUAL.docx
"""

from __future__ import annotations

import re
from pathlib import Path

from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
from docx.shared import Inches, Pt, RGBColor


# ── Colour palette ────────────────────────────────────────────────────────────
# RGB colours shared by the helpers below, kept together so the document
# styling can be adjusted in one place.
DARK_BLUE   = RGBColor(0x1F, 0x49, 0x7D)   # heading 1 text
MID_BLUE    = RGBColor(0x2E, 0x74, 0xB5)   # heading 2 text
STEEL_BLUE  = RGBColor(0x1F, 0x78, 0xB4)   # heading 3 text
DARK_GREY   = RGBColor(0x40, 0x40, 0x40)   # body text and deeper heading levels
CODE_BG     = RGBColor(0xF2, 0xF2, 0xF2)   # code block shading
TABLE_HEAD  = RGBColor(0x1F, 0x49, 0x7D)   # table header row background
TABLE_EVEN  = RGBColor(0xEA, 0xF2, 0xFF)   # background of even-indexed body rows


# ── Helpers ───────────────────────────────────────────────────────────────────

def _shade_cell(cell, colour: RGBColor) -> None:
    """Fill a table cell's background with a solid colour.

    A ``w:shd`` element whose fill is the colour written as an ``RRGGBB``
    hex string is appended to the cell's properties element.
    """
    cell_props = cell._tc.get_or_add_tcPr()
    shading = OxmlElement("w:shd")
    red, green, blue = colour[0], colour[1], colour[2]
    attributes = (
        ("w:val", "clear"),
        ("w:color", "auto"),
        ("w:fill", f"{red:02X}{green:02X}{blue:02X}"),
    )
    for attr_name, attr_value in attributes:
        shading.set(qn(attr_name), attr_value)
    cell_props.append(shading)


def _set_cell_border(cell, **kwargs) -> None:
    """Attach border definitions to a table cell.

    Each recognised keyword (``top``, ``left``, ``bottom``, ``right``,
    ``insideH``, ``insideV``) maps to a dict of attribute name → value that
    is written onto that side's border element. Other keywords are ignored.
    """
    cell_props = cell._tc.get_or_add_tcPr()
    borders = OxmlElement("w:tcBorders")
    known_sides = ("top", "left", "bottom", "right", "insideH", "insideV")
    # Iterate the fixed tuple (not kwargs) so elements are emitted in a
    # stable order regardless of how the caller ordered its keywords.
    for side in (name for name in known_sides if name in kwargs):
        edge = OxmlElement(f"w:{side}")
        for attr_name, attr_value in kwargs[side].items():
            edge.set(qn(f"w:{attr_name}"), attr_value)
        borders.append(edge)
    cell_props.append(borders)


def _apply_inline(run, text: str) -> None:
    """Set the run's text to *text* exactly as given.

    No Markdown markers are detected or stripped here; ``_parse_inline`` is
    the helper that splits bold and inline-code spans into separate runs.
    """
    run.text = text


def _parse_inline(para, text: str) -> None:
    """
    Parse a line of text for inline Markdown:
      **bold**  →  bold run
      `code`    →  monospace run
      plain     →  normal run
    Adds runs to the given paragraph.
    """
    pattern = re.compile(r'(\*\*[^*]+\*\*|`[^`]+`)')
    parts = pattern.split(text)
    for part in parts:
        if not part:
            continue
        if part.startswith("**") and part.endswith("**"):
            run = para.add_run(part[2:-2])
            run.bold = True
        elif part.startswith("`") and part.endswith("`"):
            run = para.add_run(part[1:-1])
            run.font.name = "Courier New"
            run.font.size = Pt(9)
            run.font.color.rgb = RGBColor(0xC0, 0x39, 0x2B)
        else:
            run = para.add_run(part)


def _add_heading(doc: Document, text: str, level: int) -> None:
    """Append a heading of the given level and colour/size its first run.

    Leading ``#`` markers and ``{#anchor}`` suffixes are removed from *text*
    before it is added. Levels 1–3 get their own colour and size; any other
    level is rendered dark grey, 11 pt and bold.
    """
    heading_text = re.sub(r"^#+\s*", "", text).strip()
    # Remove anchor links like {#section-name}
    heading_text = re.sub(r"\s*\{#[^}]+\}", "", heading_text)

    heading = doc.add_heading(heading_text, level=level)
    first_run = heading.runs[0] if heading.runs else heading.add_run(heading_text)

    per_level = {
        1: (DARK_BLUE, Pt(20)),
        2: (MID_BLUE, Pt(15)),
        3: (STEEL_BLUE, Pt(12)),
    }
    colour, size = per_level.get(level, (DARK_GREY, Pt(11)))
    first_run.font.color.rgb = colour
    first_run.font.size = size
    if level not in per_level:
        # Only the fallback level is forced to bold.
        first_run.bold = True

def _add_code_block(doc: Document, lines: list[str]) -> None:
    """Add a shaded monospace code block.

    Args:
        doc: Document the paragraph is appended to.
        lines: Code lines without the surrounding fences. They are joined
            with newlines into a single run of one paragraph.
    """
    para = doc.add_paragraph()

    # Add the shading before touching paragraph_format: in the pPr schema
    # sequence w:shd comes before w:spacing and w:ind, and python-docx
    # inserts those two after whatever is already present. Appending w:shd
    # afterwards would leave the children out of order.
    pPr = para._p.get_or_add_pPr()
    shd = OxmlElement("w:shd")
    shd.set(qn("w:val"), "clear")
    shd.set(qn("w:color"), "auto")
    # Use the palette constant so the shade is defined in one place.
    shd.set(qn("w:fill"), f"{CODE_BG[0]:02X}{CODE_BG[1]:02X}{CODE_BG[2]:02X}")
    pPr.append(shd)

    fmt = para.paragraph_format
    fmt.left_indent = Inches(0.3)
    fmt.space_before = Pt(4)
    fmt.space_after = Pt(4)

    run = para.add_run("\n".join(lines))
    run.font.name = "Courier New"
    run.font.size = Pt(8.5)
    run.font.color.rgb = RGBColor(0x1A, 0x1A, 0x1A)


def _add_table(doc: Document, rows: list[list[str]]) -> None:
    """Add a formatted table. First row is treated as the header.

    Args:
        doc: Document the table is appended to.
        rows: Cell texts per row (raw Markdown). Short rows are padded with
            empty cells to the width of the longest row.

    Header cells are shaded with ``TABLE_HEAD`` and rendered bold white;
    even-indexed body rows are shaded with ``TABLE_EVEN``. All cell text is
    9 pt. An empty paragraph is added after the table for spacing. Nothing
    is added when *rows* is empty or contains no cells.
    """
    if not rows:
        return
    col_count = max(len(r) for r in rows)
    if col_count == 0:
        return
    # Normalise row lengths
    rows = [r + [""] * (col_count - len(r)) for r in rows]

    table = doc.add_table(rows=len(rows), cols=col_count)
    table.style = "Table Grid"

    for row_idx, row_data in enumerate(rows):
        # Look the cells up once per row instead of once per cell.
        cells = table.rows[row_idx].cells
        is_header = row_idx == 0
        for col_idx, cell_text in enumerate(row_data):
            cell = cells[col_idx]
            # Only trim whitespace: stripping backticks here would unbalance
            # inline-code spans (e.g. "`a` and `b`") before they are parsed.
            content = cell_text.strip()
            para = cell.paragraphs[0]
            para.paragraph_format.space_before = Pt(2)
            para.paragraph_format.space_after = Pt(2)

            if is_header:
                _shade_cell(cell, TABLE_HEAD)
            elif row_idx % 2 == 0:
                _shade_cell(cell, TABLE_EVEN)

            _parse_inline(para, content)
            for run in para.runs:
                run.font.size = Pt(9)
                if is_header:
                    # Header text is uniformly bold white, overriding any
                    # colour the inline parser set.
                    run.bold = True
                    run.font.color.rgb = RGBColor(0xFF, 0xFF, 0xFF)

    doc.add_paragraph()  # spacing after table


def _add_list_item(doc: Document, text: str, level: int, ordered: bool,
                   counter: int) -> None:
    """Append one list entry, numbered when *ordered* and bulleted otherwise.

    Nested entries (``level`` above zero) are indented a further quarter
    inch per level. *counter* is accepted but not used by this function.
    """
    list_style = "List Number" if ordered else "List Bullet"
    item = doc.add_paragraph(style=list_style)
    if level > 0:
        item.paragraph_format.left_indent = Inches(0.25 * (level + 1))
    _parse_inline(item, text)
    for piece in item.runs:
        piece.font.size = Pt(10)


def _parse_md_table(raw_rows: list[str]) -> list[list[str]]:
    """Convert raw Markdown table lines to a list of cell lists."""
    result = []
    for line in raw_rows:
        # Skip separator rows (---|---)
        if re.match(r"^\s*\|?[\s\-:]+\|[\s\-:|]+\s*$", line):
            continue
        cells = [c.strip() for c in line.strip().strip("|").split("|")]
        if cells:
            result.append(cells)
    return result


# ── Main converter ────────────────────────────────────────────────────────────

def _add_horizontal_rule(doc: Document) -> None:
    """Add an empty paragraph with a bottom border, drawn as a rule."""
    para = doc.add_paragraph()
    pPr = para._p.get_or_add_pPr()
    pBdr = OxmlElement("w:pBdr")
    bottom = OxmlElement("w:bottom")
    bottom.set(qn("w:val"), "single")
    bottom.set(qn("w:sz"), "6")
    bottom.set(qn("w:space"), "1")
    bottom.set(qn("w:color"), "2E74B5")
    pBdr.append(bottom)
    pPr.append(pBdr)


def convert(md_path: Path, docx_path: Path) -> None:
    """Convert one Markdown file to a Word document.

    The source is processed line by line. Each line is classified as a code
    fence, table row, heading (levels 1–4), horizontal rule, list item,
    blank line or plain paragraph, and the matching element is appended to
    the document. The result is saved to *docx_path* and a one-line summary
    with the file size is printed.

    Args:
        md_path: Markdown file to read (UTF-8, with or without a BOM).
        docx_path: Destination ``.docx`` path; overwritten if it exists.
    """
    doc = Document()

    # Page margins
    for section in doc.sections:
        section.top_margin    = Inches(1.0)
        section.bottom_margin = Inches(1.0)
        section.left_margin   = Inches(1.2)
        section.right_margin  = Inches(1.2)

    # Default body style
    style = doc.styles["Normal"]
    style.font.name = "Calibri"
    style.font.size = Pt(10.5)
    style.font.color.rgb = DARK_GREY

    # "utf-8-sig" drops a leading BOM, which would otherwise stop a heading
    # on the first line from matching; BOM-less files decode identically.
    lines = md_path.read_text(encoding="utf-8-sig").splitlines()

    in_code_block = False
    code_lines: list[str] = []
    table_rows: list[str] = []

    def flush_table() -> None:
        """Emit the table rows collected so far, if any, and reset them."""
        if not table_rows:
            return
        parsed = _parse_md_table(table_rows)
        if parsed:
            _add_table(doc, parsed)
        table_rows.clear()

    for line in lines:
        stripped = line.strip()

        # ── Fenced code block ──────────────────────────────────────────────
        if stripped.startswith("```"):
            if in_code_block:
                in_code_block = False
                _add_code_block(doc, code_lines)
            else:
                # A table directly above the fence must be written first,
                # otherwise it would end up after the code block.
                flush_table()
                in_code_block = True
                code_lines = []
            continue

        if in_code_block:
            code_lines.append(line)
            continue

        # ── Table detection ────────────────────────────────────────────────
        if stripped.startswith("|"):
            table_rows.append(line)
            continue
        flush_table()

        # ── Headings ────────────────────────────────────────────────────────
        m = re.match(r"^(#{1,4})\s+(.+)$", line)
        if m:
            _add_heading(doc, m.group(2), len(m.group(1)))
            continue

        # ── Horizontal rule ─────────────────────────────────────────────────
        if re.match(r"^[-*_]{3,}\s*$", stripped):
            _add_horizontal_rule(doc)
            continue

        # ── Unordered list ──────────────────────────────────────────────────
        m = re.match(r"^(\s*)[-*]\s+(.+)$", line)
        if m:
            # Two leading whitespace characters count as one nesting level.
            indent = len(m.group(1)) // 2
            _add_list_item(doc, m.group(2), indent, ordered=False, counter=0)
            continue

        # ── Ordered list ────────────────────────────────────────────────────
        m = re.match(r"^(\s*)\d+\.\s+(.+)$", line)
        if m:
            indent = len(m.group(1)) // 2
            _add_list_item(doc, m.group(2), indent, ordered=True, counter=0)
            continue

        # ── Blank line ──────────────────────────────────────────────────────
        if not stripped:
            continue

        # ── Plain paragraph ─────────────────────────────────────────────────
        para = doc.add_paragraph()
        para.paragraph_format.space_after = Pt(4)
        _parse_inline(para, line)
        for run in para.runs:
            run.font.size = Pt(10.5)

    # An unterminated fence still has content worth keeping.
    if in_code_block and code_lines:
        _add_code_block(doc, code_lines)

    # Flush any remaining table
    flush_table()

    doc.save(str(docx_path))
    print(f"  Written: {docx_path}  ({docx_path.stat().st_size // 1024} KB)")


# ── Entry point ───────────────────────────────────────────────────────────────

if __name__ == "__main__":
    # Sources and outputs both live in the directory containing this script.
    here = Path(__file__).parent

    print("Converting Markdown → Word (.docx) ...")
    for stem in ("HLD", "LLD", "MANUAL"):
        source_name = f"{stem}.md"
        print(f"  Processing {source_name} ...")
        convert(here / source_name, here / f"{stem}.docx")

    print("Done.")