Files
sciagent/scripts/split-word-template.py
T
Thinh Lam 688fac73e9
CI/CD / backend (push) Failing after 2m8s
CI/CD / frontend (push) Failing after 1m40s
CI/CD / deploy (push) Has been skipped
sciagent code + Gitea Actions CI/CD
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 09:38:30 +07:00

241 lines
8.4 KiB
Python

#!/usr/bin/env python3
"""
Split the original file "bao-cao-template-original.docx" into 5 separate templates:
- mau-01-bao-cao-mo-ta.docx (includes 2 cover pages)
- mau-02-don-de-nghi.docx
- mau-03-xac-nhan-ty-le.docx
- mau-04-phieu-danh-gia.docx
- ban-cam-ket.docx
Strategy: unzip the original → parse document.xml → iterate over the body children
(p, tbl, ...) state-machine style, grouping them by their "Mẫu số 0X" markers → write
the 5 output docx files, each one a copy of the unpacked package in which the body
children of document.xml are replaced by those of that template.
"""
import shutil
import sys
import tempfile
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
# WordprocessingML main namespace. W is that namespace in Clark notation
# ("{uri}"), ready to be prepended to a local tag name such as 'p' or 'tbl'.
NS_URI = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
W = '{' + NS_URI + '}'
# Other namespaces commonly declared in a docx, keyed by the prefix to register.
EXTRA_NS = {
    'wpc': 'http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas',
    'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
    'o': 'urn:schemas-microsoft-com:office:office',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    'v': 'urn:schemas-microsoft-com:vml',
    'wp14': 'http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing',
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
    'w10': 'urn:schemas-microsoft-com:office:word',
    'w14': 'http://schemas.microsoft.com/office/word/2010/wordml',
    'w15': 'http://schemas.microsoft.com/office/word/2012/wordml',
    'wpg': 'http://schemas.microsoft.com/office/word/2010/wordprocessingGroup',
    'wpi': 'http://schemas.microsoft.com/office/word/2010/wordprocessingInk',
    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
    'wps': 'http://schemas.microsoft.com/office/word/2010/wordprocessingShape',
}
# Register 'w' first and then every extra prefix, so ElementTree serialises
# these namespaces under the listed prefixes.
for prefix, uri in [('w', NS_URI), *EXTRA_NS.items()]:
    ET.register_namespace(prefix, uri)
def get_para_text(p):
    """Return the text of all ``w:t`` descendants of *p*, joined and stripped.

    A ``w:t`` element without text contributes an empty string. *p* may be any
    element (a paragraph, a table, ...): every ``w:t`` below it is collected in
    document order.
    """
    pieces = []
    for text_node in p.iter(W + 't'):
        pieces.append(text_node.text or '')
    return ''.join(pieces).strip()
import re
# Paragraph prefixes treated as part of a form's header block. Several
# spelling variants of the same line are listed.
HEADER_PREFIXES = (
    'BỘ Y TẾ',
    'ĐẠI HỌC Y DƯỢC',
    'THÀNH PHỐ HỒ CHÍ MINH',
    'ĐƠN VỊ:',
    'ĐƠN VỊ ',
    'CỘNG HÒA XÃ HỘI',
    'CÔNG HÒA XÃ HỘI',
    'CỘNG HOÀ XÃ HỘI',
    'CỘNG HOÀ XÃ HỘI CHỦ NGHĨA',
    'Độc lập',
    'TP. Hồ Chí Minh',
    'Tp. Hồ Chí Minh',
    'Thành phố Hồ Chí Minh',
)


def is_header_like(text: str) -> bool:
    """Tell whether *text* looks like a line of a form's header block.

    True when *text* starts with one of HEADER_PREFIXES or with a
    "Mẫu số <digit>" label; False otherwise, including for empty text.
    """
    if not text:
        return False
    # str.startswith accepts a tuple and checks every candidate prefix.
    if text.startswith(HEADER_PREFIXES):
        return True
    return re.match(r'^Mẫu\s*số\s*\d', text) is not None
def find_section_markers(children):
    """
    Locate the index at which each template section starts.

    Scans *children* for marker paragraphs: text starting with "Mẫu số 02/03/04",
    or text containing "BẢN CAM KẾT" in any letter case. Only the first marker
    of each section counts. From that marker the scan walks backwards so that
    the header block just above it (blank paragraphs, header-like paragraphs
    such as BỘ Y TẾ / ĐẠI HỌC Y DƯỢC / ..., and header tables) belongs to the
    section as well.

    The walk-back never goes past the previous accepted marker, so a section
    cannot take over the marker, and therefore the start index, of the section
    before it.

    Args:
        children: body-level elements (w:p, w:tbl, ...) in document order.

    Returns:
        dict mapping section key to start index in *children*. 'mau01' is
        always present with index 0 (it covers the cover pages and content);
        'mau02', 'mau03', 'mau04' and 'camket' appear only when their marker
        was found.
    """
    # A table is accepted as a header table only if it contains one of these
    # strong markers. This avoids mistaking the signature table of the previous
    # form (which only has "Tp. Hồ Chí Minh, ngày...") for a header.
    strong_markers = (
        'BỘ Y TẾ',
        'ĐẠI HỌC Y DƯỢC',
        'CỘNG HÒA XÃ HỘI',
        'CỘNG HOÀ XÃ HỘI',
        'CÔNG HÒA XÃ HỘI',
    )
    section_starts = {'mau01': 0}  # mau01 covers the cover pages + content
    last_marker = -1  # index of the most recent marker that opened a section
    for i, child in enumerate(children):
        if child.tag.replace(W, '') != 'p':
            continue
        text = get_para_text(child)
        if not text:
            continue
        section_key = None
        # NOTE(review): this is a substring match, so any earlier paragraph that
        # merely mentions "bản cam kết" would open the section; confirm against
        # the real template.
        if 'BẢN CAM KẾT' in text.upper():
            section_key = 'camket'
        else:
            m = re.match(r'^Mẫu\s*số\s*(0[234])', text)
            if m:
                section_key = f'mau{m.group(1)}'
        if section_key is None or section_key in section_starts:
            continue
        # Walk back over the header block, stopping just after the previous
        # marker so two sections can never end up sharing a start index.
        start = i
        for j in range(i - 1, last_marker, -1):
            prev = children[j]
            kind = prev.tag.replace(W, '')
            ptext = get_para_text(prev)
            if kind == 'p':
                if ptext and not is_header_like(ptext):
                    break
            elif kind == 'tbl':
                if not any(mk in ptext for mk in strong_markers):
                    break
            else:
                break
            start = j
        section_starts[section_key] = start
        last_marker = i
    return section_starts
def _zip_directory(src_dir: Path, out_path: Path) -> None:
    """Pack every file under *src_dir* into a new zip archive at *out_path*."""
    if out_path.exists():
        out_path.unlink()
    with zipfile.ZipFile(out_path, 'w', zipfile.ZIP_DEFLATED) as zout:
        # Sorted so the member order does not depend on directory listing order.
        for file_path in sorted(src_dir.rglob('*')):
            if file_path.is_file():
                zout.write(file_path, file_path.relative_to(src_dir))


def split_document(original_path: Path, output_dir: Path):
    """Split the original docx into 5 template files written to *output_dir*.

    The original package is unpacked once into a temporary directory. For each
    template the body of word/document.xml is replaced by that template's body
    children (followed by the body-level sectPr, if any) and the directory is
    zipped again under the template's file name. Existing output files are
    overwritten.

    Args:
        original_path: path of the source .docx.
        output_dir: directory receiving the 5 files; created if missing.

    Raises:
        ValueError: if word/document.xml has no w:body element.
    """
    outputs = [
        ('mau-01-bao-cao-mo-ta.docx', 'mau01'),
        ('mau-02-don-de-nghi.docx', 'mau02'),
        ('mau-03-xac-nhan-ty-le.docx', 'mau03'),
        ('mau-04-phieu-danh-gia.docx', 'mau04'),
        ('ban-cam-ket.docx', 'camket'),
    ]
    with tempfile.TemporaryDirectory() as tmp_str:
        # Extract original
        base_dir = Path(tmp_str) / 'base'
        base_dir.mkdir()
        with zipfile.ZipFile(original_path, 'r') as z:
            z.extractall(base_dir)

        # Parse document.xml
        doc_xml_path = base_dir / 'word' / 'document.xml'
        tree = ET.parse(doc_xml_path)
        body = tree.getroot().find(f'{W}body')
        # Explicit raise rather than assert: asserts are stripped under -O.
        if body is None:
            raise ValueError('No body element')

        # Separate the body-level sectPr (re-appended to every output) from
        # the content children.
        sect_pr = None
        content_children = []
        for child in body:
            if child.tag.replace(W, '') == 'sectPr':
                sect_pr = child
            else:
                content_children.append(child)

        # Section start indices (the walk-back already includes header blocks),
        # ordered by position so each section runs up to the next one's start.
        starts = find_section_markers(content_children)
        ordered = sorted(starts.items(), key=lambda kv: kv[1])
        sections = {key: [] for _, key in outputs}
        for idx, (sec_name, start) in enumerate(ordered):
            end = ordered[idx + 1][1] if idx + 1 < len(ordered) else len(content_children)
            sections[sec_name] = content_children[start:end]

        # Write 5 output files
        output_dir.mkdir(parents=True, exist_ok=True)
        for filename, section_key in outputs:
            if section_key not in starts:
                # The file is still written, but flag that it will be empty.
                print(
                    f'WARNING: no marker found for section {section_key!r}; '
                    f'{filename} will have an empty body',
                    file=sys.stderr,
                )
            out_path = output_dir / filename
            new_children = list(sections[section_key])
            if sect_pr is not None:
                new_children.append(sect_pr)  # sectPr goes last in the body
            # Reuse the single parsed tree: swap the body children in place and
            # overwrite document.xml in the unpacked package before zipping it.
            body[:] = new_children
            # NOTE(review): ElementTree re-generates the namespace declarations
            # on write; if the root carries an mc:Ignorable attribute naming
            # prefixes a section does not use, confirm the output opens in Word.
            tree.write(doc_xml_path, encoding='UTF-8', xml_declaration=True)
            _zip_directory(base_dir, out_path)
            count = len(sections[section_key])
            print(f'Wrote {out_path.name} ({count} body children)')
def main():
    """Split the template that lives in the repository's Templates directory.

    The repository root is taken as the parent of the directory containing this
    script. Exits with status 1 and an error message on stderr when the source
    template does not exist.
    """
    # resolve() makes this work even when __file__ is a relative path, where
    # .parent.parent would otherwise collapse to the current directory.
    repo_root = Path(__file__).resolve().parent.parent
    original = repo_root / 'src/Backend/DYD.Api/Templates/bao-cao-template-original.docx'
    output_dir = repo_root / 'src/Backend/DYD.Api/Templates'
    if not original.exists():
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit(f'ERROR: source template not found: {original}')
    split_document(original, output_dir)
    print('\nDone. 5 template files created.')


if __name__ == '__main__':
    main()