# sciagent/scripts/build-word-template.py

#!/usr/bin/env python3
"""
Build the Word template from the original form file by replacing dots-lines ("..........")
with {{xxx}} placeholders, chosen by the position of the labels in the document.

Usage:
    python scripts/build-word-template.py

Input:  src/Backend/DYD.Api/Templates/bao-cao-template-original.docx
Output: src/Backend/DYD.Api/Templates/bao-cao-template.docx
"""

import re
import shutil
import sys
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path

# WordprocessingML main namespace; every w:* element and attribute lives in it.
NS_URI = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
# Serialise that namespace with the conventional "w" prefix instead of a generated one.
ET.register_namespace('w', NS_URI)
# Clark-notation prefix ("{uri}") used to build qualified tag names, e.g. W + 'p'.
W = '{' + NS_URI + '}'

# Placeholder assignment is driven by label text, not by a fixed paragraph index.
# The label regexes below were determined by running the script once, printing all
# paragraphs, and mapping them manually.
# A placeholder {{name}} is inserted in one of two ways:
#   - it replaces the dots-line paragraph that follows a matching label, or
#   - it is appended to the label paragraph itself (cover page labels).

# Cover-page labels. These labels are not followed by a dots-line, so the placeholder
# is appended AT THE END of the label paragraph itself.
# Each entry is (compiled label regex, placeholder name).
# The regexes are anchored at both ends: the paragraph must consist of the label only
# (optionally followed by whitespace), so a label that already has a value typed after
# it does not match.
COVER_LABEL_APPEND = [
    # "Initiative name (Vietnamese):"
    (re.compile(r'^Tên sáng kiến \(Tiếng Việt\):\s*$'), 'coverInitiativeName'),
    # "Author / group of authors of the initiative:"
    (re.compile(r'^Tác giả/nhóm tác giả sáng kiến:\s*$'), 'coverAuthors'),
    # "Work unit:"
    (re.compile(r'^Đơn vị công tác:\s*$'), 'coverUnit'),
    # "Contact information (phone, email):"
    (re.compile(r'^Thông tin liên hệ \(Điện thoại, Email\):\s*$'), 'coverContact'),
    # "YEAR 20..." — the three dots are literal characters in the form.
    (re.compile(r'^NĂM 20\.\.\.$'), 'coverYear'),
]

# Map: label regex (matched against a label paragraph) → placeholder that replaces the
# next dots-line paragraph.
# Each entry is (compiled label regex, placeholder name).
# The list is scanned in order with re.match and the first hit wins, so the patterns are
# prefix matches anchored at the start of the paragraph text only; keep more specific
# patterns ahead of more general ones that share a prefix.
LABEL_TO_PLACEHOLDER = [
    # Form 01 ("Mẫu 01")
    (re.compile(r'^1\.\s*Mở đầu'), 'introduction'),
    (re.compile(r'^2\.\s*Tên sáng kiến\b'), 'initiativeName'),
    (re.compile(r'^3\.\s*Lĩnh vực áp dụng'), 'applicationField'),
    (re.compile(r'^4\.1\s*Tình trạng giải pháp'), 'currentStatus'),
    (re.compile(r'^-\s*Mục đích của sáng kiến'), 'purpose'),
    (re.compile(r'^\+\s*Các bước thực hiện'), 'implementationSteps'),
    (re.compile(r'^\+\s*Các điều kiện cần thiết'), 'conditions'),
    (re.compile(r'^-\s*Về tính mới'), 'novelty'),
    # Effectiveness: 10 items
    (re.compile(r'^\+\s*Tạo ra lợi ích kinh tế'), 'effEconomic'),
    (re.compile(r'^\+\s*Đem lại hiệu quả trong giảng dạy'), 'effTeaching'),
    (re.compile(r'^\+\s*Tăng năng suất lao động'), 'effProductivity'),
    (re.compile(r'^\+\s*Nâng cao hiệu quả công việc'), 'effSocial'),
    (re.compile(r'^\+\s*Nâng cao chất lượng công việc'), 'effQuality'),
    (re.compile(r'^\+\s*Giảm chi phí'), 'effCost'),
    (re.compile(r'^\+\s*Cải thiện môi trường'), 'effEnvironment'),
    (re.compile(r'^\+\s*Bảo vệ sức khỏe'), 'effHealth'),
    (re.compile(r'^\+\s*Đảm bảo an toàn lao động'), 'effLaborSafety'),
    (re.compile(r'^\+\s*Nâng cao khả năng, trình độ'), 'effAwareness'),
    (re.compile(r'^6\.\s*Những thông tin cần được bảo mật'), 'confidentialInfo'),
    # Form 02 ("Mẫu 02")
    (re.compile(r'^-\s*Chủ đầu tư tạo ra sáng kiến'), 'investorName'),
    (re.compile(r'^-\s*Lĩnh vực áp dụng sáng kiến'), 'applicationField02'),
    (re.compile(r'^-\s*Ngày sáng kiến được áp dụng'), 'firstApplyDate'),
    (re.compile(r'^-\s*Nội dung của sáng kiến'), 'contentSummary'),
    (re.compile(r'^Những thông tin cần được bảo mật'), 'confidentialInfo02'),
    (re.compile(r'^Các điều kiện cần thiết để áp dụng'), 'conditions02'),
    (re.compile(r'^Đánh giá lợi ích thu được hoặc dự kiến có thể thu được do áp dụng sáng kiến theo ý kiến của tác giả'), 'authorEvaluation'),
    (re.compile(r'^Đánh giá lợi ích thu được hoặc dự kiến có thể thu được do áp dụng sáng kiến theo ý kiến của tổ chức'), 'trialEvaluation'),
    # Form 03 ("Mẫu 03")
    (re.compile(r'^1\.\s*Tên sáng kiến'), 'initiativeName03'),
    (re.compile(r'^2\.\s*Tác giả chính'), 'mainAuthor03'),
    (re.compile(r'^Chức vụ, đơn vị công tác'), 'position03'),
    # Form 04 ("Mẫu 04")
    (re.compile(r'^Kết luận'), 'conclusion'),
]

# Dots-line pattern: the whole paragraph text consists only of dots and whitespace,
# at least 50 characters in total (whitespace between dots counts towards the 50).
# process() applies it to text already stripped by para_text(); inner whitespace is
# not collapsed.
DOTS_PATTERN = re.compile(r'^[\s\.]{50,}$')


def para_text(p):
    """Return the text of paragraph ``p``.

    The text of every ``w:t`` descendant is joined in document order (a text
    element without content counts as empty) and leading/trailing whitespace
    is stripped from the joined result.
    """
    pieces = []
    for node in p.iter(f'{W}t'):
        pieces.append(node.text if node.text else '')
    return ''.join(pieces).strip()


def set_para_text(p, new_text):
    """Make ``new_text`` the only run text of paragraph ``p``.

    Only runs that are direct children of the paragraph are considered. The
    first run is kept (so its ``w:rPr`` formatting and any non-text children
    such as breaks survive), its ``w:t`` children are dropped, and all other
    runs are removed. A paragraph without runs gets a fresh one. The new
    ``w:t`` element is appended to the kept run and marked
    ``xml:space="preserve"``.
    """
    run_tag = f'{W}r'
    text_tag = f'{W}t'

    direct_runs = p.findall(run_tag)
    if direct_runs:
        keeper, *surplus = direct_runs
        for extra in surplus:
            p.remove(extra)
        # Drop the old text but leave <w:rPr> and everything else in place.
        for stale in keeper.findall(text_tag):
            keeper.remove(stale)
    else:
        keeper = ET.SubElement(p, run_tag)

    node = ET.SubElement(keeper, text_tag)
    node.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
    node.text = new_text


def append_placeholder_to_para(p, placeholder):
    """Append a new run containing `` {{placeholder}}`` to paragraph ``p``.

    The run is added after all existing children of the paragraph and carries
    no run properties of its own. The text starts with a single space, so the
    ``w:t`` element is marked ``xml:space="preserve"``.
    """
    run = ET.Element(f'{W}r')
    text_node = ET.SubElement(run, f'{W}t')
    text_node.set('{http://www.w3.org/XML/1998/namespace}space', 'preserve')
    text_node.text = f' {{{{{placeholder}}}}}'
    p.append(run)


def find_placeholder_for_label(text):
    """Return the placeholder of the first LABEL_TO_PLACEHOLDER entry matching ``text``.

    Entries are tried in list order with ``re.match`` (anchored at the start
    of ``text``). Returns ``None`` when no entry matches.
    """
    hits = (name for pattern, name in LABEL_TO_PLACEHOLDER if pattern.match(text))
    return next(hits, None)


def find_cover_label(text):
    """Return the placeholder of the first COVER_LABEL_APPEND entry matching ``text``.

    Entries are tried in list order with ``re.match``. Returns ``None`` when
    ``text`` is not a cover-page label.
    """
    return next(
        (name for pattern, name in COVER_LABEL_APPEND if pattern.match(text)),
        None,
    )


def _declared_namespaces(xml_path):
    """Return ``{prefix: uri}`` for the named namespace prefixes declared in the file.

    Skipped: the default (empty) prefix, the built-in ``xml`` prefix, and
    ``ns<digits>`` prefixes (ElementTree reserves that pattern and refuses to
    register it). Only the first binding of each prefix and of each URI is
    kept, so the result is a one-to-one mapping.
    """
    declared = {}
    bound_uris = set()
    for _event, (prefix, uri) in ET.iterparse(xml_path, events=('start-ns',)):
        if not prefix or prefix == 'xml' or re.fullmatch(r'ns\d+', prefix):
            continue
        if prefix in declared or uri in bound_uris:
            continue
        declared[prefix] = uri
        bound_uris.add(uri)
    return declared


def _redeclare_dropped_namespaces(root, declared):
    """Re-add ``xmlns:prefix`` declarations that serialisation would otherwise drop.

    ElementTree only emits a declaration for a namespace that occurs in some
    tag or attribute name. A namespace from ``declared`` that occurs nowhere
    is therefore written as a literal ``xmlns:prefix`` attribute on ``root``.
    Namespaces that are in use are left alone, because ElementTree declares
    them itself and a second declaration would be a duplicate attribute.
    """
    in_use = set()
    for element in root.iter():
        for name in (element.tag, *element.attrib):
            if isinstance(name, str) and name.startswith('{'):
                in_use.add(name[1:].partition('}')[0])
    for prefix, uri in declared.items():
        if uri not in in_use:
            root.set(f'xmlns:{prefix}', uri)


def process(xml_path):
    """Insert ``{{placeholder}}`` tokens into the document XML at ``xml_path`` in place.

    Paragraphs are visited in document order; paragraphs without text are
    skipped. For each remaining paragraph the first applicable rule wins:

    1. Cover label: the placeholder is appended to the paragraph. Each cover
       placeholder is used once only; a repeated cover label falls through
       to the rules below.
    2. Dots-line: replaced by the placeholder of the most recent label, if
       there is one and it has not been used yet; otherwise replaced by
       ``{{extra_N}}`` where N is the running count of dots-lines.
    3. Label: its placeholder is remembered for the next dots-line.

    Any other paragraph leaves the remembered label untouched.

    The namespace prefixes declared in the input are preserved on output.
    Without this, ElementTree would rename every prefix except ``w`` to
    ``ns0``, ``ns1``, ... and drop declarations that no tag uses. OOXML
    attribute values such as ``mc:Ignorable`` refer to prefixes by name, so
    a renamed or missing prefix can make Word reject the document.

    Args:
        xml_path: Path of the ``document.xml`` file to rewrite.

    Returns:
        A tuple ``(used_placeholders, dots_counter)``: the set of label
        placeholders that replaced a dots-line, and the total number of
        dots-lines seen (including those tagged ``extra_N``).
    """
    # Register the document's own prefixes before parsing/writing so that
    # tree.write() reuses them instead of generating ns0, ns1, ...
    declared = _declared_namespaces(xml_path)
    for prefix, uri in declared.items():
        ET.register_namespace(prefix, uri)

    tree = ET.parse(xml_path)
    root = tree.getroot()

    prev_label_placeholder = None
    used_placeholders = set()
    cover_done = set()  # only replace cover labels once (file has 2 cover pages)
    dots_counter = 0

    # Snapshot the paragraph list: the loop body adds and removes runs.
    for p in list(root.iter(f'{W}p')):
        text = para_text(p)
        if not text:
            continue

        # 1. Cover label — append placeholder
        cover_pl = find_cover_label(text)
        if cover_pl and cover_pl not in cover_done:
            append_placeholder_to_para(p, cover_pl)
            cover_done.add(cover_pl)
            prev_label_placeholder = None
            continue

        # 2. Dots line → replace with placeholder from prev label
        if DOTS_PATTERN.match(text):
            dots_counter += 1
            if prev_label_placeholder and prev_label_placeholder not in used_placeholders:
                set_para_text(p, f'{{{{{prev_label_placeholder}}}}}')
                used_placeholders.add(prev_label_placeholder)
                prev_label_placeholder = None
            else:
                # extra dots line without matching label — tag with generic counter
                set_para_text(p, f'{{{{extra_{dots_counter}}}}}')
            continue

        # 3. Label paragraph → remember placeholder for NEXT dots line
        label_pl = find_placeholder_for_label(text)
        if label_pl:
            prev_label_placeholder = label_pl
            continue

        # 4. Other paragraph (description, italic note, ...): deliberately keep
        # the remembered label until a dots-line or a new label is seen.

    _redeclare_dropped_namespaces(root, declared)
    tree.write(xml_path, encoding='UTF-8', xml_declaration=True)
    return used_placeholders, dots_counter


def main():
    """Build ``bao-cao-template.docx`` from ``bao-cao-template-original.docx``.

    The source archive is unpacked into a temporary directory,
    ``word/document.xml`` is rewritten by ``process()``, and the directory is
    zipped again. The new archive is first written to a staging file next to
    the destination and only then moved over it, so an existing template is
    not lost if building the archive fails.

    Exits with status 1 (message on stderr) when the source template is
    missing or contains no ``word/document.xml``.
    """
    repo_root = Path(__file__).parent.parent
    src = repo_root / 'src/Backend/DYD.Api/Templates/bao-cao-template-original.docx'
    dst = repo_root / 'src/Backend/DYD.Api/Templates/bao-cao-template.docx'

    if not src.exists():
        print(f'ERROR: source template not found at {src}', file=sys.stderr)
        sys.exit(1)

    with tempfile.TemporaryDirectory() as tmpdir:
        tmp = Path(tmpdir)
        # Unzip
        with zipfile.ZipFile(src, 'r') as z:
            z.extractall(tmp)
        doc_xml = tmp / 'word' / 'document.xml'
        if not doc_xml.is_file():
            print(f'ERROR: {src} does not contain word/document.xml', file=sys.stderr)
            sys.exit(1)

        used, dots = process(doc_xml)
        print(f'Replaced {len(used)} placeholders from {dots} dots-lines.')
        print(f'Placeholders: {sorted(used)}')

        # Rezip into a staging file in the destination directory, then swap
        # it in. The old template stays intact until the new one is complete.
        staging = dst.with_name(dst.name + '.tmp')
        try:
            with zipfile.ZipFile(staging, 'w', zipfile.ZIP_DEFLATED) as zout:
                # Sorted so the entry order does not depend on the filesystem.
                for path in sorted(tmp.rglob('*')):
                    if path.is_file():
                        zout.write(path, path.relative_to(tmp))
            staging.replace(dst)
        finally:
            # Only present here if writing or replacing failed.
            if staging.exists():
                staging.unlink()

        print(f'Wrote {dst}')


# Script entry point: build the template only when this file is executed directly,
# not when it is imported.
if __name__ == '__main__':
    main()