#!/usr/bin/env python3
"""Split the original "bao-cao-template-original.docx" into 5 separate templates.

Outputs:
  - mau-01-bao-cao-mo-ta.docx   (includes the 2 cover pages)
  - mau-02-don-de-nghi.docx
  - mau-03-xac-nhan-ty-le.docx
  - mau-04-phieu-danh-gia.docx
  - ban-cam-ket.docx

Strategy: unzip the original -> parse word/document.xml -> walk the body
children (p, tbl, ...) and group them by the "Mẫu số 0X" markers -> write the
5 output docx files by cloning the unpacked directory and replacing the body
children of document.xml for each template.
"""

import re
import shutil
import sys
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
from typing import Dict, List, Optional, Sequence

NS_URI = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
W = f'{{{NS_URI}}}'
ET.register_namespace('w', NS_URI)

# Also register the other namespaces commonly found in docx files so that
# ElementTree serialises them with their conventional prefixes.
EXTRA_NS = {
    'wpc': 'http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas',
    'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
    'o': 'urn:schemas-microsoft-com:office:office',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    'v': 'urn:schemas-microsoft-com:vml',
    'wp14': 'http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing',
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
    'w10': 'urn:schemas-microsoft-com:office:word',
    'w14': 'http://schemas.microsoft.com/office/word/2010/wordml',
    'w15': 'http://schemas.microsoft.com/office/word/2012/wordml',
    'wpg': 'http://schemas.microsoft.com/office/word/2010/wordprocessingGroup',
    'wpi': 'http://schemas.microsoft.com/office/word/2010/wordprocessingInk',
    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
    'wps': 'http://schemas.microsoft.com/office/word/2010/wordprocessingShape',
}
for prefix, uri in EXTRA_NS.items():
    ET.register_namespace(prefix, uri)

# Text prefixes that identify a paragraph as part of a form's header block.
# Several spelling variants of the same phrase are listed on purpose.
HEADER_PREFIXES = (
    'BỘ Y TẾ',
    'ĐẠI HỌC Y DƯỢC',
    'THÀNH PHỐ HỒ CHÍ MINH',
    'ĐƠN VỊ:',
    'ĐƠN VỊ ',
    'CỘNG HÒA XÃ HỘI',
    'CÔNG HÒA XÃ HỘI',
    'CỘNG HOÀ XÃ HỘI',
    'CỘNG HOÀ XÃ HỘI CHỦ NGHĨA',
    'Độc lập',
    'TP. Hồ Chí Minh',
    'Tp. Hồ Chí Minh',
    'Thành phố Hồ Chí Minh',
)

# Any "Mẫu số <digit>" line counts as header text during the walk-back.
_ANY_FORM_MARKER_RE = re.compile(r'^Mẫu\s*số\s*\d')
# Only forms 02-04 open a new section; form 01 always starts at index 0.
_SECTION_MARKER_RE = re.compile(r'^Mẫu\s*số\s*(0[234])')

# A table is treated as a header table only if it contains one of these.
# This avoids mistaking the previous form's signature table (which only has
# "Tp. Hồ Chí Minh, ngày...") for the header of the next form.
_STRONG_TABLE_MARKERS = (
    'BỘ Y TẾ',
    'ĐẠI HỌC Y DƯỢC',
    'CỘNG HÒA XÃ HỘI',
    'CỘNG HOÀ XÃ HỘI',
    'CÔNG HÒA XÃ HỘI',
)

# (output file name, section key) for each template, in writing order.
_OUTPUTS = (
    ('mau-01-bao-cao-mo-ta.docx', 'mau01'),
    ('mau-02-don-de-nghi.docx', 'mau02'),
    ('mau-03-xac-nhan-ty-le.docx', 'mau03'),
    ('mau-04-phieu-danh-gia.docx', 'mau04'),
    ('ban-cam-ket.docx', 'camket'),
)


def _local_tag(elem: ET.Element) -> str:
    """Return the element tag with the WordprocessingML namespace removed."""
    return elem.tag.replace(W, '')


def get_para_text(p: ET.Element) -> str:
    """Return the stripped concatenation of all ``w:t`` text found under *p*.

    Works for any element (paragraph, table, ...) because it searches all
    descendants, not only direct children.
    """
    return ''.join(t.text or '' for t in p.iter(f'{W}t')).strip()


def is_header_like(text: str) -> bool:
    """Tell whether *text* looks like a line of a form's header block.

    True when *text* starts with one of ``HEADER_PREFIXES`` or with a
    "Mẫu số <digit>" marker. Empty text is never header-like.
    """
    if not text:
        return False
    if text.startswith(HEADER_PREFIXES):
        return True
    return bool(_ANY_FORM_MARKER_RE.match(text))


def _section_key_for(text: str) -> Optional[str]:
    """Map a paragraph's text to the section it opens, or None if it opens none."""
    if 'BẢN CAM KẾT' in text.upper():
        return 'camket'
    match = _SECTION_MARKER_RE.match(text)
    return f'mau{match.group(1)}' if match else None


def _walk_back_to_header_start(children: Sequence[ET.Element], marker_index: int) -> int:
    """Return the index where the header block preceding a marker begins.

    Starting just before *marker_index*, keep moving backwards while the
    element is an empty paragraph, a header-like paragraph, or a table that
    contains a strong header marker. Stop at the first element that is none
    of those.
    """
    start = marker_index
    for j in range(marker_index - 1, -1, -1):
        prev = children[j]
        tag = _local_tag(prev)
        ptext = get_para_text(prev)
        if tag == 'p':
            if ptext and not is_header_like(ptext):
                break
        elif tag == 'tbl':
            if not any(mk in ptext for mk in _STRONG_TABLE_MARKERS):
                break
        else:
            break
        start = j
    return start


def find_section_markers(children: Sequence[ET.Element]) -> Dict[str, int]:
    """Locate the start index of every section among the body children.

    Scans *children* for marker paragraphs ("Mẫu số 02/03/04" at the start of
    the text, or "BẢN CAM KẾT" anywhere in the upper-cased text), then walks
    back from each marker to include its header block (BỘ Y TẾ / ĐẠI HỌC Y
    DƯỢC / ...). Only the first occurrence of each marker is used.

    Returns:
        A dict mapping section name -> start index. ``'mau01'`` is always
        present with index 0 because it covers the cover pages and the first
        form; other keys appear only when their marker was found.
    """
    section_starts = {'mau01': 0}
    for i, child in enumerate(children):
        if _local_tag(child) != 'p':
            continue
        text = get_para_text(child)
        if not text:
            continue
        section_key = _section_key_for(text)
        if section_key is None or section_key in section_starts:
            continue
        section_starts[section_key] = _walk_back_to_header_start(children, i)
    return section_starts


def _write_template(
    base_dir: Path,
    clone_dir: Path,
    out_path: Path,
    elements: Sequence[ET.Element],
    sect_pr: Optional[ET.Element],
) -> None:
    """Clone the unpacked docx, swap in *elements* as the body, and zip it.

    NOTE(review): ElementTree only declares namespaces that are actually used
    by tags/attributes in the written tree. If the original root references
    prefixes by value (e.g. in ``mc:Ignorable``), confirm the generated files
    still open in Word.
    """
    shutil.copytree(base_dir, clone_dir)

    clone_doc = clone_dir / 'word' / 'document.xml'
    ctree = ET.parse(clone_doc)
    cbody = ctree.getroot().find(f'{W}body')
    if cbody is None:
        raise ValueError('No body element')

    # Replace the whole body: section content first, sectPr last.
    for existing in list(cbody):
        cbody.remove(existing)
    cbody.extend(elements)
    if sect_pr is not None:
        cbody.append(sect_pr)
    ctree.write(clone_doc, encoding='UTF-8', xml_declaration=True)

    if out_path.exists():
        out_path.unlink()
    with zipfile.ZipFile(out_path, 'w', zipfile.ZIP_DEFLATED) as zout:
        # Sorted so the archive layout does not depend on directory order.
        for file_path in sorted(clone_dir.rglob('*')):
            if file_path.is_file():
                zout.write(file_path, file_path.relative_to(clone_dir).as_posix())


def split_document(original_path: Path, output_dir: Path):
    """Split the original docx into the 5 template files.

    Args:
        original_path: Path to the source .docx file.
        output_dir: Directory that receives the templates; created if needed.
            Existing files with the same names are overwritten.

    Raises:
        ValueError: If ``word/document.xml`` has no ``w:body`` element.

    A section whose marker is not found is still written, but with an empty
    body, and a warning is printed to stderr.
    """
    with tempfile.TemporaryDirectory() as tmp_str:
        tmp = Path(tmp_str)

        # Extract the original package.
        base_dir = tmp / 'base'
        base_dir.mkdir()
        with zipfile.ZipFile(original_path, 'r') as z:
            z.extractall(base_dir)

        # Parse document.xml.
        root = ET.parse(base_dir / 'word' / 'document.xml').getroot()
        body = root.find(f'{W}body')
        if body is None:
            raise ValueError('No body element')

        # Separate the body-level sectPr (re-attached to every output) from
        # the content children.
        sect_pr = None
        content_children: List[ET.Element] = []
        for child in body:
            if _local_tag(child) == 'sectPr':
                sect_pr = child
            else:
                content_children.append(child)

        # Section start indices, with the header walk-back already applied.
        starts = find_section_markers(content_children)
        ordered = sorted(starts.items(), key=lambda kv: kv[1])

        # Each section owns the children in [its start, next section's start).
        sections: Dict[str, List[ET.Element]] = {key: [] for _, key in _OUTPUTS}
        for idx, (sec_name, start) in enumerate(ordered):
            end = ordered[idx + 1][1] if idx + 1 < len(ordered) else len(content_children)
            sections[sec_name] = content_children[start:end]

        output_dir.mkdir(parents=True, exist_ok=True)
        for filename, section_key in _OUTPUTS:
            elements = sections[section_key]
            if not elements:
                print(
                    f'WARNING: no content found for section {section_key!r}; '
                    f'{filename} will have an empty body',
                    file=sys.stderr,
                )
            out_path = output_dir / filename
            _write_template(base_dir, tmp / f'clone-{section_key}', out_path, elements, sect_pr)
            print(f'Wrote {out_path.name} ({len(elements)} body children)')


def main():
    """Split the repository's original template into the Templates folder."""
    # resolve() first: a relative __file__ would otherwise collapse to '.'
    # when the script is started from its own directory.
    repo_root = Path(__file__).resolve().parent.parent
    original = repo_root / 'src/Backend/DYD.Api/Templates/bao-cao-template-original.docx'
    output_dir = repo_root / 'src/Backend/DYD.Api/Templates'

    if not original.exists():
        print(f'ERROR: source template not found: {original}', file=sys.stderr)
        sys.exit(1)

    split_document(original, output_dir)
    print('\nDone. 5 template files created.')


if __name__ == '__main__':
    main()