Files
MCP_CyberArk/docs/md_to_docx.py
2026-03-29 19:51:51 +02:00

344 lines
12 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Convert the three MCP documentation Markdown files to Word (.docx) format.
Handles:
- Heading levels 1-4
- Bold (**text**) and inline code (`text`)
- Fenced code blocks (``` ... ```)
- Tables (| col | col |)
- Unordered lists (- item, * item)
- Ordered lists (1. item)
- Horizontal rules (---)
- Blank lines → paragraph spacing
Run:
python docs/md_to_docx.py
Produces:
docs/HLD.docx
docs/LLD.docx
docs/MANUAL.docx
"""
from __future__ import annotations
import re
from pathlib import Path
from docx import Document
from docx.enum.text import WD_ALIGN_PARAGRAPH
from docx.oxml.ns import qn
from docx.oxml import OxmlElement
from docx.shared import Inches, Pt, RGBColor
# ── Colour palette ────────────────────────────────────────────────────────────
# RGB colours shared by the styling helpers in this module.
DARK_BLUE = RGBColor(0x1F, 0x49, 0x7D)  # level-1 heading text
MID_BLUE = RGBColor(0x2E, 0x74, 0xB5)  # level-2 heading text
STEEL_BLUE = RGBColor(0x1F, 0x78, 0xB4)  # level-3 heading text
DARK_GREY = RGBColor(0x40, 0x40, 0x40)  # body text; also headings of any other level
CODE_BG = RGBColor(0xF2, 0xF2, 0xF2)  # code block shading
TABLE_HEAD = RGBColor(0x1F, 0x49, 0x7D)  # table header row background
TABLE_EVEN = RGBColor(0xEA, 0xF2, 0xFF)  # background of even-indexed body rows
# ── Helpers ───────────────────────────────────────────────────────────────────
def _shade_cell(cell, colour: RGBColor) -> None:
    """Give a table cell a solid background fill.

    A ``w:shd`` element is appended to the cell's properties; its ``w:fill``
    is the colour's three components written as six upper-case hex digits.
    """
    fill_hex = f"{colour[0]:02X}{colour[1]:02X}{colour[2]:02X}"
    shading = OxmlElement("w:shd")
    for attr_name, attr_value in (
        ("w:val", "clear"),
        ("w:color", "auto"),
        ("w:fill", fill_hex),
    ):
        shading.set(qn(attr_name), attr_value)
    cell._tc.get_or_add_tcPr().append(shading)
def _set_cell_border(cell, **kwargs) -> None:
    """Attach a ``w:tcBorders`` element describing the cell's borders.

    Each keyword names a side (``top``, ``left``, ``bottom``, ``right``,
    ``insideH`` or ``insideV``) and maps to a dict of attribute name → value
    that is copied onto that side's border element.  Keywords naming anything
    else are ignored.
    """
    known_sides = ("top", "left", "bottom", "right", "insideH", "insideV")
    borders = OxmlElement("w:tcBorders")
    for side in known_sides:
        if side not in kwargs:
            continue
        edge = OxmlElement(f"w:{side}")
        for name, value in kwargs[side].items():
            edge.set(qn(f"w:{name}"), value)
        borders.append(edge)
    cell._tc.get_or_add_tcPr().append(borders)
def _apply_inline(run, text: str) -> None:
    """Set the run's text to ``text`` exactly as given.

    No Markdown markers are detected or stripped here; ``_parse_inline`` is
    the helper that splits a line into bold / inline-code / plain runs.
    """
    run.text = text
def _parse_inline(para, text: str) -> None:
    """
    Split ``text`` on inline Markdown markers and append one run per piece:
    **bold** → bold run
    `code` → monospace run
    plain → normal run
    Runs are appended to ``para`` in the order the pieces occur in ``text``.
    """
    # The capture group makes re.split keep the matched markers as pieces.
    for piece in re.split(r'(\*\*[^*]+\*\*|`[^`]+`)', text):
        if piece == "":
            continue
        is_bold = piece.startswith("**") and piece.endswith("**")
        is_code = piece.startswith("`") and piece.endswith("`")
        if is_bold:
            para.add_run(piece[2:-2]).bold = True
        elif is_code:
            code_run = para.add_run(piece[1:-1])
            code_font = code_run.font
            code_font.name = "Courier New"
            code_font.size = Pt(9)
            code_font.color.rgb = RGBColor(0xC0, 0x39, 0x2B)
        else:
            para.add_run(piece)
def _add_heading(doc: Document, text: str, level: int) -> None:
    """Append a heading of the given level with this module's colours.

    Leading ``#`` markers and a ``{#anchor}`` suffix are removed from ``text``
    before it is added.  Levels 1-3 each get their own colour and size; any
    other level is rendered dark grey, 11 pt and bold.
    """
    title = re.sub(r"^#+\s*", "", text).strip()
    # Remove anchor links like {#section-name}
    title = re.sub(r"\s*\{#[^}]+\}", "", title)
    heading = doc.add_heading(title, level=level)
    if heading.runs:
        run = heading.runs[0]
    else:
        run = heading.add_run(title)

    # level → (text colour, font size)
    per_level = {
        1: (DARK_BLUE, Pt(20)),
        2: (MID_BLUE, Pt(15)),
        3: (STEEL_BLUE, Pt(12)),
    }
    if level in per_level:
        colour, size = per_level[level]
        run.font.color.rgb = colour
        run.font.size = size
    else:
        run.font.color.rgb = DARK_GREY
        run.font.size = Pt(11)
        run.bold = True
def _add_code_block(doc: Document, lines: list[str]) -> None:
    """Add a shaded monospace code block.

    Args:
        doc: Document the paragraph is appended to.
        lines: The code lines (without the fence lines); they are joined with
            newline characters and added as a single run.

    The paragraph background comes from the module-level ``CODE_BG`` colour,
    so the palette has one place to change it.
    """
    para = doc.add_paragraph()
    fmt = para.paragraph_format
    fmt.left_indent = Inches(0.3)
    fmt.space_before = Pt(4)
    fmt.space_after = Pt(4)

    # Paragraph shading is set by adding a w:shd element to the paragraph
    # properties; the fill is written as hex exactly as _shade_cell does.
    pPr = para._p.get_or_add_pPr()
    shd = OxmlElement("w:shd")
    shd.set(qn("w:val"), "clear")
    shd.set(qn("w:color"), "auto")
    shd.set(qn("w:fill"), f"{CODE_BG[0]:02X}{CODE_BG[1]:02X}{CODE_BG[2]:02X}")
    pPr.append(shd)

    run = para.add_run("\n".join(lines))
    run.font.name = "Courier New"
    run.font.size = Pt(8.5)
    run.font.color.rgb = RGBColor(0x1A, 0x1A, 0x1A)
def _unwrap_code_cell(cell_text: str) -> str:
    """Trim a cell and drop backticks only when they wrap the whole cell."""
    text = cell_text.strip()
    wraps_whole_cell = (
        len(text) >= 2
        and text[0] == "`"
        and text[-1] == "`"
        and "`" not in text[1:-1]
    )
    return text[1:-1] if wraps_whole_cell else text


def _add_table(doc: Document, rows: list[list[str]]) -> None:
    """Add a formatted table. First row is treated as the header.

    Args:
        doc: Document the table is appended to.
        rows: Cell texts, one list per row.  Short rows are padded with empty
            cells so every row has as many columns as the longest one.

    Nothing is added when ``rows`` is empty or contains no columns at all.
    The header row is shaded with ``TABLE_HEAD`` and written bold white; body
    rows go through ``_parse_inline`` and even-indexed ones are shaded with
    ``TABLE_EVEN``.  All table text is 9 pt.  An empty paragraph follows the
    table as spacing.
    """
    if not rows:
        return
    col_count = max(len(r) for r in rows)
    if col_count == 0:
        # Only empty rows: there is nothing to lay out.
        return
    # Normalise row lengths
    rows = [r + [""] * (col_count - len(r)) for r in rows]
    table = doc.add_table(rows=len(rows), cols=col_count)
    table.style = "Table Grid"
    for row_idx, row_data in enumerate(rows):
        # Fetch the row's cells once instead of once per column.
        row_cells = table.rows[row_idx].cells
        for col_idx, cell_text in enumerate(row_data):
            cell = row_cells[col_idx]
            # A cell that is one whole code span is shown as plain text; cells
            # mixing text and code keep their backticks so that _parse_inline
            # sees balanced markers (stripping only the outer ones would turn
            # "`a` or `b`" into "a` or `b" and invert code and plain spans).
            clean = _unwrap_code_cell(cell_text)
            para = cell.paragraphs[0]
            para.paragraph_format.space_before = Pt(2)
            para.paragraph_format.space_after = Pt(2)
            if row_idx == 0:
                # Header row
                _shade_cell(cell, TABLE_HEAD)
                run = para.add_run(clean)
                run.bold = True
                run.font.color.rgb = RGBColor(0xFF, 0xFF, 0xFF)
                run.font.size = Pt(9)
            else:
                if row_idx % 2 == 0:
                    _shade_cell(cell, TABLE_EVEN)
                _parse_inline(para, clean)
                for run in para.runs:
                    run.font.size = Pt(9)
    doc.add_paragraph()  # spacing after table
def _add_list_item(doc: Document, text: str, level: int, ordered: bool,
                   counter: int) -> None:
    """Append one list entry, numbered when ``ordered`` is true, else bulleted.

    Nested entries (``level`` above zero) are indented a quarter inch per
    level plus one.  ``text`` goes through ``_parse_inline`` and every
    resulting run is set to 10 pt.  ``counter`` is accepted but not used.
    """
    item = doc.add_paragraph(style="List Number" if ordered else "List Bullet")
    if level > 0:
        item.paragraph_format.left_indent = Inches(0.25 * (level + 1))
    _parse_inline(item, text)
    for piece in item.runs:
        piece.font.size = Pt(10)
# A delimiter-row cell: one or more hyphens with optional alignment colons.
_SEPARATOR_CELL = re.compile(r":?-+:?")


def _parse_md_table(raw_rows: list[str]) -> list[list[str]]:
    """Convert raw Markdown table lines to a list of cell lists.

    Args:
        raw_rows: The table's lines as they appear in the Markdown source.

    Returns:
        One list of whitespace-trimmed cell strings per data row.  Delimiter
        rows (every cell made of hyphens with optional colons, e.g.
        ``|---|:--:|``) are left out.

    A row is classified by its cells rather than by matching the whole line,
    so a single-column delimiter (``|---|``) is recognised and a row whose
    cells are all empty is kept as data.
    """
    result: list[list[str]] = []
    for line in raw_rows:
        body = line.strip()
        # Remove at most one outer pipe per side so that a deliberately empty
        # edge cell ("|| a |") keeps its column position.
        if body.startswith("|"):
            body = body[1:]
        if body.endswith("|"):
            body = body[:-1]
        cells = [cell.strip() for cell in body.split("|")]
        if all(_SEPARATOR_CELL.fullmatch(cell) for cell in cells):
            continue
        result.append(cells)
    return result
# ── Main converter ────────────────────────────────────────────────────────────
def _add_horizontal_rule(doc: Document) -> None:
    """Add an empty paragraph whose bottom border serves as a horizontal rule."""
    para = doc.add_paragraph()
    pPr = para._p.get_or_add_pPr()
    pBdr = OxmlElement("w:pBdr")
    bottom = OxmlElement("w:bottom")
    bottom.set(qn("w:val"), "single")
    bottom.set(qn("w:sz"), "6")
    bottom.set(qn("w:space"), "1")
    bottom.set(qn("w:color"), "2E74B5")
    pBdr.append(bottom)
    pPr.append(pBdr)


def _flush_table(doc: Document, table_rows: list[str]) -> None:
    """Render the buffered Markdown table lines, if any, and empty the buffer."""
    if not table_rows:
        return
    parsed = _parse_md_table(table_rows)
    if parsed:
        _add_table(doc, parsed)
    table_rows.clear()


def convert(md_path: Path, docx_path: Path) -> None:
    """Convert one Markdown file to a styled Word document.

    Args:
        md_path: Markdown file to read (decoded as UTF-8).
        docx_path: Destination of the generated ``.docx`` file.

    The source is processed line by line.  Fenced code blocks and runs of
    table lines are buffered and emitted as a unit; every other line becomes
    a heading, horizontal rule, list item or plain paragraph.  Blank lines
    produce no output.  After saving, the output path and its size in KB are
    printed.
    """
    doc = Document()
    # Page margins
    for section in doc.sections:
        section.top_margin = Inches(1.0)
        section.bottom_margin = Inches(1.0)
        section.left_margin = Inches(1.2)
        section.right_margin = Inches(1.2)
    # Default body style
    style = doc.styles["Normal"]
    style.font.name = "Calibri"
    style.font.size = Pt(10.5)
    style.font.color.rgb = DARK_GREY

    lines = md_path.read_text(encoding="utf-8").splitlines()
    in_code_block = False
    code_lines: list[str] = []
    table_rows: list[str] = []

    for line in lines:
        stripped = line.strip()

        # ── Fenced code block ──────────────────────────────────────────────
        if stripped.startswith("```"):
            if in_code_block:
                in_code_block = False
                _add_code_block(doc, code_lines)
            else:
                # A table directly followed by a fence must be emitted first,
                # otherwise it would end up after the code block.
                _flush_table(doc, table_rows)
                in_code_block = True
                code_lines = []
            continue
        if in_code_block:
            code_lines.append(line)
            continue

        # ── Table detection ────────────────────────────────────────────────
        if stripped.startswith("|"):
            table_rows.append(line)
            continue
        _flush_table(doc, table_rows)

        # ── Headings ────────────────────────────────────────────────────────
        m = re.match(r"^(#{1,4})\s+(.+)$", line)
        if m:
            _add_heading(doc, m.group(2), len(m.group(1)))
            continue

        # ── Horizontal rule ─────────────────────────────────────────────────
        if re.match(r"^[-*_]{3,}\s*$", stripped):
            _add_horizontal_rule(doc)
            continue

        # ── Unordered list ──────────────────────────────────────────────────
        m = re.match(r"^(\s*)[-*]\s+(.+)$", line)
        if m:
            # Two characters of leading whitespace count as one nesting level.
            indent = len(m.group(1)) // 2
            _add_list_item(doc, m.group(2), indent, ordered=False, counter=0)
            continue

        # ── Ordered list ────────────────────────────────────────────────────
        m = re.match(r"^(\s*)\d+\.\s+(.+)$", line)
        if m:
            indent = len(m.group(1)) // 2
            _add_list_item(doc, m.group(2), indent, ordered=True, counter=0)
            continue

        # ── Blank line ──────────────────────────────────────────────────────
        if not stripped:
            continue

        # ── Plain paragraph ─────────────────────────────────────────────────
        para = doc.add_paragraph()
        para.paragraph_format.space_after = Pt(4)
        _parse_inline(para, line)
        for run in para.runs:
            run.font.size = Pt(10.5)

    # An unterminated fence would otherwise drop everything buffered after it.
    if in_code_block:
        _add_code_block(doc, code_lines)
    # Flush any remaining table
    _flush_table(doc, table_rows)

    doc.save(str(docx_path))
    print(f" Written: {docx_path} ({docx_path.stat().st_size // 1024} KB)")
# ── Entry point ───────────────────────────────────────────────────────────────
if __name__ == "__main__":
    # Convert the three documentation files that sit next to this script.
    docs_dir = Path(__file__).parent
    print("Converting Markdown → Word (.docx) ...")
    for md_name, docx_name in (
        ("HLD.md", "HLD.docx"),
        ("LLD.md", "LLD.docx"),
        ("MANUAL.md", "MANUAL.docx"),
    ):
        print(f" Processing {md_name} ...")
        convert(docs_dir / md_name, docs_dir / docx_name)
    print("Done.")