sciagent code + Gitea Actions CI/CD

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 09:38:30 +07:00
commit 688fac73e9
1167 changed files with 158244 additions and 0 deletions
@@ -0,0 +1,240 @@
+#!/usr/bin/env python3
+"""
+Split the original file "bao-cao-template-original.docx" into 5 separate templates:
+- mau-01-bao-cao-mo-ta.docx (includes 2 cover pages)
+- mau-02-don-de-nghi.docx
+- mau-03-xac-nhan-ty-le.docx
+- mau-04-phieu-danh-gia.docx
+- ban-cam-ket.docx
+
+Strategy: unzip original → parse document.xml → state-machine iterate body children
+(p, tbl, ...), group by the "Mẫu số 0X" / "BẢN CAM KẾT" markers → write the 5 output
+docx files by cloning the unpacked directory and replacing the body children of
+document.xml for each template.
+"""
+
+import shutil
+import sys
+import tempfile
+import xml.etree.ElementTree as ET
+import zipfile
+from pathlib import Path
+
+# WordprocessingML main namespace and the matching "{uri}" tag prefix used to
+# build qualified ElementTree tag names such as W + 'p'.
+NS_URI = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
+W = '{' + NS_URI + '}'
+
+# Other namespaces commonly declared in a docx document.xml, keyed by prefix.
+EXTRA_NS = dict(
+    wpc='http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas',
+    mc='http://schemas.openxmlformats.org/markup-compatibility/2006',
+    o='urn:schemas-microsoft-com:office:office',
+    r='http://schemas.openxmlformats.org/officeDocument/2006/relationships',
+    m='http://schemas.openxmlformats.org/officeDocument/2006/math',
+    v='urn:schemas-microsoft-com:vml',
+    wp14='http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing',
+    wp='http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
+    w10='urn:schemas-microsoft-com:office:word',
+    w14='http://schemas.microsoft.com/office/word/2010/wordml',
+    w15='http://schemas.microsoft.com/office/word/2012/wordml',
+    wpg='http://schemas.microsoft.com/office/word/2010/wordprocessingGroup',
+    wpi='http://schemas.microsoft.com/office/word/2010/wordprocessingInk',
+    wne='http://schemas.microsoft.com/office/word/2006/wordml',
+    wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape',
+)
+
+# Register "w" first, then the extras, so ElementTree serializes these
+# namespaces with their usual prefixes instead of generated ones.
+for prefix, uri in [('w', NS_URI), *EXTRA_NS.items()]:
+    ET.register_namespace(prefix, uri)
+
+
+def get_para_text(p):
+    """Return the stripped concatenation of all ``w:t`` text found under *p*.
+
+    *p* may be any element (the callers pass both paragraphs and tables).
+    ``w:t`` nodes without text contribute an empty string; no other node
+    type is read.
+    """
+    text_tag = W + 't'
+    fragments = [node.text or '' for node in p.iter(text_tag)]
+    return ''.join(fragments).strip()
+
+
+import re
+
+# Text prefixes that mark a paragraph as part of a form's letterhead block.
+# is_header_like() matches them case-sensitively with str.startswith().
+# NOTE(review): the "CỘNG HÒA" / "CỘNG HOÀ" / "CÔNG HÒA" entries look like
+# tone-mark-placement and typo variants kept on purpose — confirm against the
+# source document before removing any of them.
+HEADER_PREFIXES = (
+    'BỘ Y TẾ',
+    'ĐẠI HỌC Y DƯỢC',
+    'THÀNH PHỐ HỒ CHÍ MINH',
+    'ĐƠN VỊ:',
+    'ĐƠN VỊ ',
+    'CỘNG HÒA XÃ HỘI',
+    'CÔNG HÒA XÃ HỘI',
+    'CỘNG HOÀ XÃ HỘI',
+    'CỘNG HOÀ XÃ HỘI CHỦ NGHĨA',  # already covered by the shorter entry above
+    'Độc lập',
+    'TP. Hồ Chí Minh',
+    'Tp. Hồ Chí Minh',
+    'Thành phố Hồ Chí Minh',
+)
+
+
+def is_header_like(text: str) -> bool:
+    """Tell whether *text* looks like a letterhead line or a form-number line.
+
+    Returns True when *text* starts with one of ``HEADER_PREFIXES`` or with
+    "Mẫu số" followed by a digit. Empty text is never header-like.
+    """
+    if not text:
+        return False
+    # str.startswith accepts a tuple and tests every prefix in one call.
+    if text.startswith(HEADER_PREFIXES):
+        return True
+    return re.match(r'^Mẫu\s*số\s*\d', text) is not None
+
+
+def find_section_markers(children):
+    """
+    Locate where each template section starts among the body children.
+
+    A section is opened by the first paragraph whose text starts with
+    "Mẫu số 02/03/04" or contains "BẢN CAM KẾT" (compared upper-cased).
+    From that marker the start index is walked backwards over empty
+    paragraphs, header-like paragraphs and header tables, so the letterhead
+    block (BỘ Y TẾ / ĐẠI HỌC Y DƯỢC / ...) stays with its form.
+
+    Args:
+        children: body-level elements (w:p, w:tbl, ...) in document order.
+
+    Returns:
+        dict mapping a section key ('mau01', 'mau02', 'mau03', 'mau04',
+        'camket') to the index of its first child. 'mau01' is always
+        present with index 0; keys whose marker was not found are absent.
+    """
+    para_tag = f'{W}p'
+    table_tag = f'{W}tbl'
+    marker_re = re.compile(r'^Mẫu\s*số\s*(0[234])')
+    # A header table must contain one of these strong markers. This avoids
+    # mistaking the previous form's signature table (which only holds
+    # "Tp. Hồ Chí Minh, ngày...") for a header.
+    strong_markers = (
+        'BỘ Y TẾ',
+        'ĐẠI HỌC Y DƯỢC',
+        'CỘNG HÒA XÃ HỘI',
+        'CỘNG HOÀ XÃ HỘI',
+        'CÔNG HÒA XÃ HỘI',
+    )
+
+    section_starts = {'mau01': 0}  # mau01 covers the cover pages plus its content
+    # Lowest index a walk-back may claim. It is moved just past each accepted
+    # marker so a later section can never absorb an earlier section's marker
+    # paragraph (is_header_like() accepts "Mẫu số <digit>" lines).
+    floor = 0
+
+    for i, child in enumerate(children):
+        if child.tag != para_tag:
+            continue
+        text = get_para_text(child)
+        if not text:
+            continue
+
+        if 'BẢN CAM KẾT' in text.upper():
+            section_key = 'camket'
+        else:
+            m = marker_re.match(text)
+            section_key = f'mau{m.group(1)}' if m else None
+
+        # Only the first occurrence of each marker opens a section.
+        if section_key is None or section_key in section_starts:
+            continue
+
+        # Walk back: pull in the paragraphs and tables that form the header.
+        start = i
+        for j in range(i - 1, floor - 1, -1):
+            prev = children[j]
+            ptext = get_para_text(prev)  # already stripped
+            if prev.tag == para_tag:
+                if ptext and not is_header_like(ptext):
+                    break
+            elif prev.tag == table_tag:
+                if not any(mk in ptext for mk in strong_markers):
+                    break
+            else:
+                break
+            start = j
+
+        section_starts[section_key] = start
+        floor = i + 1
+
+    return section_starts
+
+
+def _read_declared_namespaces(xml_path):
+    """Return ``{prefix: uri}`` for the prefixed xmlns declarations in *xml_path*."""
+    declared = {}
+    for _event, (prefix, uri) in ET.iterparse(xml_path, events=('start-ns',)):
+        # Skip the default namespace and the reserved "xml" prefix; the first
+        # binding seen for a prefix wins.
+        if prefix and prefix != 'xml':
+            declared.setdefault(prefix, uri)
+    return declared
+
+
+def _redeclare_unused_namespaces(root, declared):
+    """Re-add xmlns declarations on *root* that ElementTree would drop.
+
+    ElementTree only writes declarations for namespaces used by a tag or
+    attribute name. Prefixes that appear solely inside attribute *values*
+    (such as the prefix list of ``mc:Ignorable``) would otherwise end up
+    undeclared in the output.
+    """
+    used = set()
+    for elem in root.iter():
+        for name in (elem.tag, *elem.attrib):
+            if isinstance(name, str) and name.startswith('{'):
+                used.add(name[1:].partition('}')[0])
+    for prefix, uri in declared.items():
+        # Namespaces that are in use get their declaration from the
+        # serializer; adding a second one would duplicate the attribute.
+        if uri not in used:
+            root.set(f'xmlns:{prefix}', uri)
+
+
+def split_document(original_path: Path, output_dir: Path):
+    """Split the original docx into 5 template files inside *output_dir*.
+
+    The package is unpacked into a temporary directory, the body children of
+    word/document.xml are grouped with find_section_markers(), and for every
+    template the unpacked tree is cloned, its body replaced by that section's
+    children (plus the body-level sectPr, if any) and zipped again.
+
+    Args:
+        original_path: path of the source .docx.
+        output_dir: directory receiving the 5 files; created when missing.
+            Existing files with the same names are replaced.
+
+    Raises:
+        ValueError: if word/document.xml has no w:body element.
+    """
+    with tempfile.TemporaryDirectory() as tmp_str:
+        tmp = Path(tmp_str)
+        # Extract original
+        base_dir = tmp / 'base'
+        base_dir.mkdir()
+        with zipfile.ZipFile(original_path, 'r') as z:
+            z.extractall(base_dir)
+
+        doc_xml_path = base_dir / 'word' / 'document.xml'
+
+        # Keep the prefixes the source document itself uses, so prefix lists
+        # stored as plain text (mc:Ignorable) still match the output.
+        declared_ns = _read_declared_namespaces(doc_xml_path)
+        for prefix, uri in declared_ns.items():
+            try:
+                ET.register_namespace(prefix, uri)
+            except ValueError:
+                # ElementTree reserves prefixes of the form "ns<digits>".
+                pass
+
+        # Parse document.xml
+        tree = ET.parse(doc_xml_path)
+        body = tree.getroot().find(f'{W}body')
+        if body is None:
+            # Explicit raise: an assert would be stripped under "python -O".
+            raise ValueError(f'No w:body element in {doc_xml_path}')
+
+        # Separate the body-level sectPr (re-appended to every output) from
+        # the content children.
+        sect_pr = None
+        content_children = []
+        for child in body:
+            if child.tag == f'{W}sectPr':
+                sect_pr = child
+            else:
+                content_children.append(child)
+
+        # Section start indices (the walk-back already includes header blocks),
+        # ordered by position: mau01 at 0, then the others.
+        starts = find_section_markers(content_children)
+        ordered = sorted(starts.items(), key=lambda kv: kv[1])
+
+        # Each section owns the children in [its start, next section's start).
+        sections = {k: [] for k in ['mau01', 'mau02', 'mau03', 'mau04', 'camket']}
+        for idx, (sec_name, start) in enumerate(ordered):
+            end = ordered[idx + 1][1] if idx + 1 < len(ordered) else len(content_children)
+            sections[sec_name] = content_children[start:end]
+
+        # Write 5 output files
+        output_dir.mkdir(parents=True, exist_ok=True)
+        outputs = [
+            ('mau-01-bao-cao-mo-ta.docx', 'mau01'),
+            ('mau-02-don-de-nghi.docx', 'mau02'),
+            ('mau-03-xac-nhan-ty-le.docx', 'mau03'),
+            ('mau-04-phieu-danh-gia.docx', 'mau04'),
+            ('ban-cam-ket.docx', 'camket'),
+        ]
+
+        for filename, section_key in outputs:
+            out_path = output_dir / filename
+            if section_key not in starts:
+                # Surface a missing marker instead of silently emitting an
+                # empty template.
+                print(
+                    f'WARNING: no marker found for {section_key}; '
+                    f'{filename} will have an empty body',
+                    file=sys.stderr,
+                )
+
+            # Clone base directory
+            clone_dir = tmp / f'clone-{section_key}'
+            shutil.copytree(base_dir, clone_dir)
+
+            # Modify document.xml
+            clone_doc = clone_dir / 'word' / 'document.xml'
+            ctree = ET.parse(clone_doc)
+            croot = ctree.getroot()
+            cbody = croot.find(f'{W}body')
+            if cbody is None:
+                raise ValueError(f'No w:body element in {clone_doc}')
+
+            # Replace the body children with this section's children,
+            # keeping sectPr last.
+            cbody[:] = sections[section_key]
+            if sect_pr is not None:
+                cbody.append(sect_pr)
+
+            _redeclare_unused_namespaces(croot, declared_ns)
+            ctree.write(clone_doc, encoding='UTF-8', xml_declaration=True)
+
+            # Rezip
+            if out_path.exists():
+                out_path.unlink()
+            with zipfile.ZipFile(out_path, 'w', zipfile.ZIP_DEFLATED) as zout:
+                for file_path in clone_dir.rglob('*'):
+                    if file_path.is_file():
+                        zout.write(file_path, file_path.relative_to(clone_dir))
+
+            count = len(sections[section_key])
+            print(f'Wrote {out_path.name} ({count} body children)')
+
+
+def main():
+    """Split the repository's original report template into the 5 templates.
+
+    Paths are resolved relative to the directory two levels above this file.
+    Prints an error and exits with status 1 when the source template is missing.
+    """
+    repo_root = Path(__file__).parent.parent
+    output_dir = repo_root / 'src/Backend/DYD.Api/Templates'
+    original = repo_root / 'src/Backend/DYD.Api/Templates/bao-cao-template-original.docx'
+
+    if original.exists():
+        split_document(original, output_dir)
+        print('\nDone. 5 template files created.')
+        return
+
+    print(f'ERROR: source template not found: {original}')
+    sys.exit(1)
+
+
+if __name__ == '__main__':
+    main()