#!/usr/bin/env python3
|
|
"""
|
|
Split file gốc "bao-cao-template-original.docx" thành 5 template riêng biệt:
|
|
- mau-01-bao-cao-mo-ta.docx (includes 2 cover pages)
|
|
- mau-02-don-de-nghi.docx
|
|
- mau-03-xac-nhan-ty-le.docx
|
|
- mau-04-phieu-danh-gia.docx
|
|
- ban-cam-ket.docx
|
|
|
|
Strategy: unzip original → parse document.xml → state-machine iterate body children
|
|
(p, tbl, ...), group by "Mẫu số 0X" markers → write 5 output docx bằng cách clone
|
|
unpacked directory và thay document.xml body children cho mỗi template.
|
|
"""
|
|
|
|
import re
import shutil
import sys
import tempfile
import unicodedata
import xml.etree.ElementTree as ET
import zipfile
from pathlib import Path
|
|
|
|
# WordprocessingML main namespace; W is the "{uri}" prefix ElementTree uses
# when it spells out fully-qualified tag names.
NS_URI = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
W = '{' + NS_URI + '}'

# Other namespaces commonly declared in a docx document.xml, keyed by the
# prefix they should keep when the tree is written back out.
EXTRA_NS = dict(
    wpc='http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas',
    mc='http://schemas.openxmlformats.org/markup-compatibility/2006',
    o='urn:schemas-microsoft-com:office:office',
    r='http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    m='http://schemas.openxmlformats.org/officeDocument/2006/math',
    v='urn:schemas-microsoft-com:vml',
    wp14='http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing',
    wp='http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
    w10='urn:schemas-microsoft-com:office:word',
    w14='http://schemas.microsoft.com/office/word/2010/wordml',
    w15='http://schemas.microsoft.com/office/word/2012/wordml',
    wpg='http://schemas.microsoft.com/office/word/2010/wordprocessingGroup',
    wpi='http://schemas.microsoft.com/office/word/2010/wordprocessingInk',
    wne='http://schemas.microsoft.com/office/word/2006/wordml',
    wps='http://schemas.microsoft.com/office/word/2010/wordprocessingShape',
)

# Register every prefix globally so ElementTree serializes "w:p", "r:id", ...
# instead of auto-generated "ns0:", "ns1:" prefixes.
ET.register_namespace('w', NS_URI)
for prefix, uri in EXTRA_NS.items():
    ET.register_namespace(prefix, uri)
|
|
|
|
|
|
def get_para_text(p):
    """Return the text of element *p* as one stripped, NFC-normalized string.

    The text of every descendant ``w:t`` node is concatenated in document
    order with no separator between runs; a ``w:t`` without text counts as
    empty. Any element with ``w:t`` descendants works (a paragraph, a table).

    The result is normalized to Unicode NFC so that decomposed Vietnamese
    text (base letter + combining marks) compares equal to the precomposed
    string literals the marker checks in this file are written with.
    """
    raw = ''.join(t.text or '' for t in p.iter(f'{W}t'))
    return unicodedata.normalize('NFC', raw).strip()
|
|
|
|
|
|
import re
|
|
|
|
# Prefixes of the boilerplate lines that make up a form's header block
# (ministry / university / national motto / place-and-date lines). Several
# spelling variants of the same line are listed on purpose.
HEADER_PREFIXES = (
    'BỘ Y TẾ',
    'ĐẠI HỌC Y DƯỢC',
    'THÀNH PHỐ HỒ CHÍ MINH',
    'ĐƠN VỊ:',
    'ĐƠN VỊ ',
    'CỘNG HÒA XÃ HỘI',
    'CÔNG HÒA XÃ HỘI',
    'CỘNG HOÀ XÃ HỘI',
    'CỘNG HOÀ XÃ HỘI CHỦ NGHĨA',
    'Độc lập',
    'TP. Hồ Chí Minh',
    'Tp. Hồ Chí Minh',
    'Thành phố Hồ Chí Minh',
)

# NFC copies of the prefixes, so matching does not depend on whether this
# source file or the document stores accents precomposed or decomposed.
_HEADER_PREFIXES_NFC = tuple(
    unicodedata.normalize('NFC', prefix) for prefix in HEADER_PREFIXES
)

# A form-number line such as "Mẫu số 02" (any digit after "Mẫu số").
_FORM_NUMBER_RE = re.compile(unicodedata.normalize('NFC', r'^Mẫu\s*số\s*\d'))


def is_header_like(text: str) -> bool:
    """Return True if *text* looks like a line of a form's header block.

    A line is header-like when it starts with one of ``HEADER_PREFIXES`` or
    with a form number ("Mẫu số" followed by a digit). Empty text is never
    header-like. Matching is case-sensitive.

    *text* is normalized to NFC before comparison so decomposed Vietnamese
    input (base letter + combining marks) matches the same way as
    precomposed input.
    """
    if not text:
        return False
    text = unicodedata.normalize('NFC', text)
    # str.startswith accepts a tuple: one call instead of a Python-level loop.
    if text.startswith(_HEADER_PREFIXES_NFC):
        return True
    return _FORM_NUMBER_RE.match(text) is not None
|
|
|
|
|
|
def find_section_markers(children):
    """
    Locate the first body child of every template section.

    Scans *children* (the body's child elements, in document order) for the
    marker paragraphs "Mẫu số 02/03/04" and "BẢN CAM KẾT". From each marker it
    walks backwards to also claim the header block that precedes it (BỘ Y TẾ /
    ĐẠI HỌC Y DƯỢC / ...). Only the first marker found for a section counts.

    Returns a dict mapping section name ('mau01', 'mau02', 'mau03', 'mau04',
    'camket') to its start index in *children*. 'mau01' always starts at 0
    because it covers the cover pages plus the first form; a section whose
    marker is never found is absent from the dict.
    """
    # A table is only treated as a header when it carries one of these strong
    # markers. This avoids mistaking the previous form's signature table
    # (which only has "Tp. Hồ Chí Minh, ngày...") for a header.
    strong_markers = (
        'BỘ Y TẾ',
        'ĐẠI HỌC Y DƯỢC',
        'CỘNG HÒA XÃ HỘI',
        'CỘNG HOÀ XÃ HỘI',
        'CÔNG HÒA XÃ HỘI',
    )

    def local_name(node):
        return node.tag.replace(W, '')

    def belongs_to_header(node):
        """True if *node* may be absorbed into the header block being walked."""
        kind = local_name(node)
        content = get_para_text(node).strip()
        if kind == 'p':
            # Blank paragraphs and header-looking lines both extend the block.
            return content == '' or is_header_like(content)
        if kind == 'tbl':
            return any(marker in content for marker in strong_markers)
        return False

    section_starts = {'mau01': 0}  # mau01 includes the cover pages + content

    for index, node in enumerate(children):
        if local_name(node) != 'p':
            continue
        text = get_para_text(node)
        if not text:
            continue

        if 'BẢN CAM KẾT' in text.upper():
            key = 'camket'
        else:
            found = re.match(r'^Mẫu\s*số\s*(0[234])', text)
            key = f'mau{found.group(1)}' if found else None

        if key is None or key in section_starts:
            continue

        # Walk back over the contiguous run of header paragraphs/tables that
        # sits directly above the marker; stop at the first non-header child.
        first = index
        while first > 0 and belongs_to_header(children[first - 1]):
            first -= 1

        section_starts[key] = first

    return section_starts
|
|
|
|
|
|
def split_document(original_path: Path, output_dir: Path):
    """Split the original docx into 5 template files written to *output_dir*.

    The package is unpacked once into a temporary directory. For every
    template, the body of ``word/document.xml`` is replaced with that
    template's children (followed by the body-level ``w:sectPr``, if any), the
    part is rewritten in place and the directory is zipped to the output file.
    Existing output files are overwritten.

    Args:
        original_path: Path of the source ``.docx`` file.
        output_dir: Directory receiving the 5 output files; created if missing.

    Raises:
        ValueError: If ``word/document.xml`` has no ``w:body`` element.
    """
    outputs = [
        ('mau-01-bao-cao-mo-ta.docx', 'mau01'),
        ('mau-02-don-de-nghi.docx', 'mau02'),
        ('mau-03-xac-nhan-ty-le.docx', 'mau03'),
        ('mau-04-phieu-danh-gia.docx', 'mau04'),
        ('ban-cam-ket.docx', 'camket'),
    ]

    with tempfile.TemporaryDirectory() as tmp_str:
        # Extract original
        package_dir = Path(tmp_str) / 'base'
        package_dir.mkdir()
        with zipfile.ZipFile(original_path, 'r') as z:
            z.extractall(package_dir)

        # Parse document.xml
        doc_xml_path = package_dir / 'word' / 'document.xml'
        prefix_map = _register_document_namespaces(doc_xml_path)
        tree = ET.parse(doc_xml_path)
        root = tree.getroot()
        body = root.find(f'{W}body')
        # Explicit raise instead of assert: asserts are stripped under -O.
        if body is None:
            raise ValueError(f'No w:body element in {original_path}')

        # Separate the body-level sectPr (re-appended to every output) from
        # the content children that get distributed over the sections.
        sect_pr = None
        content_children = []
        for child in body:
            if child.tag == f'{W}sectPr':
                sect_pr = child
            else:
                content_children.append(child)

        # Section start indices (the walk-back already includes header blocks),
        # ordered by position; each section owns [start, next section's start).
        starts = find_section_markers(content_children)
        ordered = sorted(starts.items(), key=lambda kv: kv[1])
        ends = [start for _, start in ordered[1:]] + [len(content_children)]
        sections = {key: [] for _, key in outputs}
        for (sec_name, start), end in zip(ordered, ends):
            sections[sec_name] = content_children[start:end]

        output_dir.mkdir(parents=True, exist_ok=True)
        original_ignorable = root.get(_MC_IGNORABLE)

        for filename, section_key in outputs:
            section_children = sections[section_key]
            if not section_children:
                # Marker not found (or zero-length section): still write the
                # file, but do not let it pass unnoticed.
                print(
                    f"WARNING: no content found for section '{section_key}'; "
                    f'{filename} will have an empty body',
                    file=sys.stderr,
                )

            # Swap the body content in place. The same parsed tree is reused
            # for every output, so no per-output copy or re-parse is needed.
            for old in list(body):
                body.remove(old)
            body.extend(section_children)
            if sect_pr is not None:
                body.append(sect_pr)

            _prune_ignorable(root, original_ignorable, prefix_map)
            tree.write(doc_xml_path, encoding='UTF-8', xml_declaration=True)

            # Rezip (mode 'w' truncates an existing file)
            out_path = output_dir / filename
            _zip_directory(package_dir, out_path)

            print(f'Wrote {out_path.name} ({len(section_children)} body children)')


# Clark-notation name of the markup-compatibility "Ignorable" attribute.
_MC_IGNORABLE = '{http://schemas.openxmlformats.org/markup-compatibility/2006}Ignorable'


def _register_document_namespaces(xml_path):
    """Register every prefix declared in *xml_path*; return {prefix: uri}.

    Ensures the rewritten part keeps the document's own prefixes even for
    namespaces that are not in the module's static registration list.
    """
    prefix_map = {}
    for _event, (prefix, uri) in ET.iterparse(xml_path, events=('start-ns',)):
        prefix_map.setdefault(prefix, uri)
        if not prefix:
            continue  # default namespace: nothing to register
        try:
            ET.register_namespace(prefix, uri)
        except ValueError:
            # Reserved/invalid prefix (e.g. "ns0"); ElementTree will pick one.
            pass
    return prefix_map


def _prune_ignorable(root, ignorable, prefix_map):
    """Restrict mc:Ignorable on *root* to prefixes that will be declared.

    ElementTree only writes xmlns declarations for namespaces that occur in a
    tag or attribute name. A prefix listed in mc:Ignorable whose namespace is
    otherwise unused would therefore be left undeclared in the output, which
    consumers that process markup compatibility may reject. Dropping such a
    prefix is harmless: nothing in the part uses its namespace.

    *ignorable* is the attribute value from the source document (None if it
    had none); *prefix_map* maps the source document's prefixes to URIs.
    """
    if ignorable is None:
        return
    used = set()
    for elem in root.iter():
        for qname in (elem.tag, *elem.attrib):
            if isinstance(qname, str) and qname.startswith('{'):
                used.add(qname[1:].partition('}')[0])
    kept = [p for p in ignorable.split() if prefix_map.get(p) in used]
    if kept:
        root.set(_MC_IGNORABLE, ' '.join(kept))
    else:
        root.attrib.pop(_MC_IGNORABLE, None)


def _zip_directory(src_dir, out_path):
    """Zip all files under *src_dir* into *out_path*, in sorted path order.

    Sorting makes the entry order independent of filesystem enumeration
    order, so repeated runs produce archives with the same layout.
    """
    members = sorted(
        (path.relative_to(src_dir).as_posix(), path)
        for path in src_dir.rglob('*')
        if path.is_file()
    )
    with zipfile.ZipFile(out_path, 'w', zipfile.ZIP_DEFLATED) as zout:
        for arcname, path in members:
            zout.write(path, arcname)
|
|
|
|
|
|
def main():
    """Split the repository's original report template into the 5 templates.

    The repository root is taken to be two directory levels above this
    script. Both the source template and the generated files live in
    ``src/Backend/DYD.Api/Templates`` under that root.

    Exits with status 1 (message on stderr) if the source template is missing.
    """
    # resolve() first: if __file__ is a bare relative name such as
    # "script.py", .parent.parent would collapse to "." instead of climbing.
    repo_root = Path(__file__).resolve().parent.parent
    templates_dir = repo_root / 'src/Backend/DYD.Api/Templates'
    original = templates_dir / 'bao-cao-template-original.docx'

    if not original.exists():
        # sys.exit with a string prints it to stderr and exits with status 1.
        sys.exit(f'ERROR: source template not found: {original}')

    split_document(original, templates_dir)
    print('\nDone. 5 template files created.')


if __name__ == '__main__':
    main()
|