# sciagent/scripts/split-word-template.py

#!/usr/bin/env python3
"""
Split the original file "bao-cao-template-original.docx" into 5 separate templates:
- mau-01-bao-cao-mo-ta.docx (includes 2 cover pages)
- mau-02-don-de-nghi.docx
- mau-03-xac-nhan-ty-le.docx
- mau-04-phieu-danh-gia.docx
- ban-cam-ket.docx

Strategy: unzip the original → parse document.xml → iterate over the body children
(p, tbl, ...) as a state machine, grouping them by the "Mẫu số 0X" markers → write
the 5 output docx files, each reusing the unpacked package with the body children
of document.xml replaced by those of that template.
"""

import shutil
import sys
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path

NS_URI = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
W = f'{{{NS_URI}}}'
ET.register_namespace('w', NS_URI)
# Also register common namespaces in docx
EXTRA_NS = {
    'wpc': 'http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas',
    'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
    'o': 'urn:schemas-microsoft-com:office:office',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    'v': 'urn:schemas-microsoft-com:vml',
    'wp14': 'http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing',
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
    'w10': 'urn:schemas-microsoft-com:office:word',
    'w14': 'http://schemas.microsoft.com/office/word/2010/wordml',
    'w15': 'http://schemas.microsoft.com/office/word/2012/wordml',
    'wpg': 'http://schemas.microsoft.com/office/word/2010/wordprocessingGroup',
    'wpi': 'http://schemas.microsoft.com/office/word/2010/wordprocessingInk',
    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
    'wps': 'http://schemas.microsoft.com/office/word/2010/wordprocessingShape',
}
for prefix, uri in EXTRA_NS.items():
    ET.register_namespace(prefix, uri)


def get_para_text(p):
    return ''.join((t.text or '') for t in p.iter(f'{W}t')).strip()


import re

HEADER_PREFIXES = (
    'BỘ Y TẾ',
    'ĐẠI HỌC Y DƯỢC',
    'THÀNH PHỐ HỒ CHÍ MINH',
    'ĐƠN VỊ:',
    'ĐƠN VỊ ',
    'CỘNG HÒA XÃ HỘI',
    'CÔNG HÒA XÃ HỘI',
    'CỘNG HOÀ XÃ HỘI',
    'CỘNG HOÀ XÃ HỘI CHỦ NGHĨA',
    'Độc lập',
    'TP. Hồ Chí Minh',
    'Tp. Hồ Chí Minh',
    'Thành phố Hồ Chí Minh',
)


def is_header_like(text: str) -> bool:
    if not text:
        return False
    for p in HEADER_PREFIXES:
        if text.startswith(p):
            return True
    return bool(re.match(r'^Mẫu\s*số\s*\d', text))


def find_section_markers(children):
    """
    Scan children, tìm marker paragraphs "Mẫu số 0X" / "BẢN CAM KẾT" + walk back
    để include header block (BỘ Y TẾ / ĐẠI HỌC Y DƯỢC / ...). Return dict
    section_name → start_index.
    """
    section_starts = {'mau01': 0}  # mau01 bao gồm cover + content

    for i, child in enumerate(children):
        tag = child.tag.replace(W, '')
        if tag != 'p':
            continue
        text = get_para_text(child)
        if not text:
            continue

        section_key = None
        if 'BẢN CAM KẾT' in text.upper():
            section_key = 'camket'
        else:
            m = re.match(r'^Mẫu\s*số\s*(0[234])', text)
            if m:
                section_key = f'mau{m.group(1)}'

        if section_key is None or section_key in section_starts:
            continue

        # Walk back: include paragraphs + tables chứa header text
        start = i
        for j in range(i - 1, -1, -1):
            prev = children[j]
            tag = prev.tag.replace(W, '')
            ptext = get_para_text(prev).strip()
            if tag == 'p':
                if ptext == '' or is_header_like(ptext):
                    start = j
                else:
                    break
            elif tag == 'tbl':
                # Header table bắt buộc có "BỘ Y TẾ" hoặc "ĐẠI HỌC Y DƯỢC" hoặc
                # "CỘNG HÒA/HOÀ XÃ HỘI" (strong markers). Tránh nhầm với signature
                # tbl của mẫu trước (chỉ có "Tp. Hồ Chí Minh, ngày...").
                strong_markers = (
                    'BỘ Y TẾ',
                    'ĐẠI HỌC Y DƯỢC',
                    'CỘNG HÒA XÃ HỘI',
                    'CỘNG HOÀ XÃ HỘI',
                    'CÔNG HÒA XÃ HỘI',
                )
                if any(mk in ptext for mk in strong_markers):
                    start = j
                else:
                    break
            else:
                break

        section_starts[section_key] = start

    return section_starts


def split_document(original_path: Path, output_dir: Path):
    """Split original docx into 5 templates."""
    with tempfile.TemporaryDirectory() as tmp_str:
        tmp = Path(tmp_str)
        # Extract original
        base_dir = tmp / 'base'
        base_dir.mkdir()
        with zipfile.ZipFile(original_path, 'r') as z:
            z.extractall(base_dir)

        # Parse document.xml
        doc_xml_path = base_dir / 'word' / 'document.xml'
        tree = ET.parse(doc_xml_path)
        root = tree.getroot()
        body = root.find(f'{W}body')
        assert body is not None, 'No body element'

        # Collect sectPr (preserve for each output)
        sect_pr = None
        children = list(body)
        content_children = []  # children excluding sectPr
        for child in children:
            if child.tag.replace(W, '') == 'sectPr':
                sect_pr = child
            else:
                content_children.append(child)

        # Find section start indices (với walk-back include header block)
        starts = find_section_markers(content_children)
        # Build ordered list: mau01 at 0, then other sections sorted by start index
        ordered = sorted(starts.items(), key=lambda kv: kv[1])

        # Build sections dict — assign children [start_i, start_{i+1}) to each section
        sections = {k: [] for k in ['mau01', 'mau02', 'mau03', 'mau04', 'camket']}
        for idx, (sec_name, start) in enumerate(ordered):
            end = ordered[idx + 1][1] if idx + 1 < len(ordered) else len(content_children)
            sections[sec_name] = content_children[start:end]

        # Write 5 output files
        output_dir.mkdir(parents=True, exist_ok=True)
        outputs = [
            ('mau-01-bao-cao-mo-ta.docx', 'mau01'),
            ('mau-02-don-de-nghi.docx', 'mau02'),
            ('mau-03-xac-nhan-ty-le.docx', 'mau03'),
            ('mau-04-phieu-danh-gia.docx', 'mau04'),
            ('ban-cam-ket.docx', 'camket'),
        ]

        for filename, section_key in outputs:
            out_path = output_dir / filename
            # Clone base directory
            clone_dir = tmp / f'clone-{section_key}'
            shutil.copytree(base_dir, clone_dir)

            # Modify document.xml
            clone_doc = clone_dir / 'word' / 'document.xml'
            ctree = ET.parse(clone_doc)
            croot = ctree.getroot()
            cbody = croot.find(f'{W}body')
            assert cbody is not None

            # Clear existing body children
            for c in list(cbody):
                cbody.remove(c)

            # Add section-specific children
            for elem in sections[section_key]:
                cbody.append(elem)
            # Add sectPr last
            if sect_pr is not None:
                cbody.append(sect_pr)

            ctree.write(clone_doc, encoding='UTF-8', xml_declaration=True)

            # Rezip
            if out_path.exists():
                out_path.unlink()
            with zipfile.ZipFile(out_path, 'w', zipfile.ZIP_DEFLATED) as zout:
                for file_path in clone_dir.rglob('*'):
                    if file_path.is_file():
                        zout.write(file_path, file_path.relative_to(clone_dir))

            count = len(sections[section_key])
            print(f'Wrote {out_path.name} ({count} body children)')


def main():
    repo_root = Path(__file__).parent.parent
    original = repo_root / 'src/Backend/DYD.Api/Templates/bao-cao-template-original.docx'
    output_dir = repo_root / 'src/Backend/DYD.Api/Templates'

    if not original.exists():
        print(f'ERROR: source template not found: {original}')
        sys.exit(1)

    split_document(original, output_dir)
    print('\nDone. 5 template files created.')


# Run the split only when this file is executed as a script, not when imported.
if __name__ == '__main__':
    main()