sciagent code + Gitea Actions CI/CD
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,240 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Split the original file "bao-cao-template-original.docx" into 5 separate templates:
- mau-01-bao-cao-mo-ta.docx (includes 2 cover pages)
- mau-02-don-de-nghi.docx
- mau-03-xac-nhan-ty-le.docx
- mau-04-phieu-danh-gia.docx
- ban-cam-ket.docx

Strategy: unzip the original → parse document.xml → iterate over the body children
(p, tbl, ...) state-machine style, grouping them by the "Mẫu số 0X" markers → write
the 5 output docx files by cloning the unpacked directory and replacing the body
children of document.xml for each template.
|
||||
"""
|
||||
|
||||
import shutil
|
||||
import sys
|
||||
import tempfile
|
||||
import xml.etree.ElementTree as ET
|
||||
import zipfile
|
||||
from pathlib import Path
|
||||
|
||||
# WordprocessingML main namespace and its Clark-notation tag prefix ("{uri}").
NS_URI = 'http://schemas.openxmlformats.org/wordprocessingml/2006/main'
W = '{' + NS_URI + '}'

# Other namespaces commonly found in docx parts. Registering them makes
# ElementTree serialize with these prefixes instead of generated ns0/ns1 names.
EXTRA_NS = {
    'wpc': 'http://schemas.microsoft.com/office/word/2010/wordprocessingCanvas',
    'mc': 'http://schemas.openxmlformats.org/markup-compatibility/2006',
    'o': 'urn:schemas-microsoft-com:office:office',
    'r': 'http://schemas.openxmlformats.org/officeDocument/2006/relationships',
    'm': 'http://schemas.openxmlformats.org/officeDocument/2006/math',
    'v': 'urn:schemas-microsoft-com:vml',
    'wp14': 'http://schemas.microsoft.com/office/word/2010/wordprocessingDrawing',
    'wp': 'http://schemas.openxmlformats.org/drawingml/2006/wordprocessingDrawing',
    'w10': 'urn:schemas-microsoft-com:office:word',
    'w14': 'http://schemas.microsoft.com/office/word/2010/wordml',
    'w15': 'http://schemas.microsoft.com/office/word/2012/wordml',
    'wpg': 'http://schemas.microsoft.com/office/word/2010/wordprocessingGroup',
    'wpi': 'http://schemas.microsoft.com/office/word/2010/wordprocessingInk',
    'wne': 'http://schemas.microsoft.com/office/word/2006/wordml',
    'wps': 'http://schemas.microsoft.com/office/word/2010/wordprocessingShape',
}

# Register 'w' first, then every extra prefix, in declaration order.
for prefix, uri in {'w': NS_URI, **EXTRA_NS}.items():
    ET.register_namespace(prefix, uri)
|
||||
|
||||
|
||||
def get_para_text(p):
    """Return the text of all ``w:t`` nodes under *p*, joined and stripped.

    ``w:t`` nodes whose ``text`` is ``None`` contribute an empty string.
    Leading and trailing whitespace of the joined result is removed.
    """
    fragments = [node.text or '' for node in p.iter(W + 't')]
    return ''.join(fragments).strip()
|
||||
|
||||
|
||||
import re
|
||||
|
||||
# Paragraph-text prefixes treated as part of a form's letterhead block.
# is_header_like() checks them with str.startswith, so every entry is matched
# case-sensitively against the beginning of the paragraph text.
HEADER_PREFIXES = (
    # Issuing organisation lines (Ministry of Health / University of Medicine
    # and Pharmacy / Ho Chi Minh City / Unit).
    'BỘ Y TẾ',
    'ĐẠI HỌC Y DƯỢC',
    'THÀNH PHỐ HỒ CHÍ MINH',
    'ĐƠN VỊ:',
    'ĐƠN VỊ ',
    # National title lines ("Socialist Republic ..."), in several spellings.
    'CỘNG HÒA XÃ HỘI',
    # NOTE(review): 'CÔNG' (instead of 'CỘNG') presumably covers a misspelling
    # present in the source document — verify.
    'CÔNG HÒA XÃ HỘI',
    'CỘNG HOÀ XÃ HỘI',
    # NOTE(review): with prefix matching this longer entry looks already
    # covered by the shorter 'CỘNG HOÀ XÃ HỘI' entry above.
    'CỘNG HOÀ XÃ HỘI CHỦ NGHĨA',
    # Motto line ("Independence ...").
    'Độc lập',
    # Place/date lines in several spellings of the city name.
    'TP. Hồ Chí Minh',
    'Tp. Hồ Chí Minh',
    'Thành phố Hồ Chí Minh',
)
|
||||
|
||||
|
||||
def is_header_like(text: str) -> bool:
    """Tell whether *text* looks like a line of a form's header block.

    Returns ``False`` for empty text. Otherwise returns ``True`` when the text
    starts with one of ``HEADER_PREFIXES`` or begins with a "Mẫu số <digit>"
    form-number marker (optional whitespace between the words).
    """
    if not text:
        return False
    # str.startswith accepts a tuple and returns True if any prefix matches.
    if text.startswith(HEADER_PREFIXES):
        return True
    return re.match(r'^Mẫu\s*số\s*\d', text) is not None
|
||||
|
||||
|
||||
def find_section_markers(children):
    """Locate the index at which each template section starts.

    Scans *children* for marker paragraphs ("Mẫu số 02/03/04" at the start of
    the paragraph text, or "BẢN CAM KẾT" anywhere in the upper-cased text),
    then walks backwards from each marker so that the header block in front
    of it (BỘ Y TẾ / ĐẠI HỌC Y DƯỢC / ...) is included in the section.

    Only the first marker found for each section is used. ``'mau01'`` always
    starts at index 0 because it covers the cover pages plus its content.

    Returns:
        dict mapping section name (``'mau01'``, ``'mau02'``, ``'mau03'``,
        ``'mau04'``, ``'camket'``) to its start index in *children*; sections
        whose marker was not found are absent.
    """
    # A table only counts as a header table if it contains one of these
    # strong markers. This avoids mistaking the signature table of the
    # previous form (which only has "Tp. Hồ Chí Minh, ngày...") for a header.
    table_markers = (
        'BỘ Y TẾ',
        'ĐẠI HỌC Y DƯỢC',
        'CỘNG HÒA XÃ HỘI',
        'CỘNG HOÀ XÃ HỘI',
        'CÔNG HÒA XÃ HỘI',
    )
    form_marker = re.compile(r'^Mẫu\s*số\s*(0[234])')

    found = {'mau01': 0}  # mau01 includes the cover pages + content

    for pos, node in enumerate(children):
        # Markers are always non-empty paragraphs.
        if node.tag.replace(W, '') != 'p':
            continue
        text = get_para_text(node)
        if not text:
            continue

        if 'BẢN CAM KẾT' in text.upper():
            key = 'camket'
        else:
            hit = form_marker.match(text)
            key = f'mau{hit.group(1)}' if hit else None

        # Not a marker, or this section was already located earlier.
        if key is None or key in found:
            continue

        # Walk back: absorb preceding paragraphs/tables that hold header text.
        first = pos
        cursor = pos - 1
        while cursor >= 0:
            earlier = children[cursor]
            kind = earlier.tag.replace(W, '')
            earlier_text = get_para_text(earlier).strip()

            if kind == 'p':
                # Blank paragraphs and header-like lines belong to the header.
                belongs = earlier_text == '' or is_header_like(earlier_text)
            elif kind == 'tbl':
                belongs = any(mk in earlier_text for mk in table_markers)
            else:
                belongs = False

            if not belongs:
                break
            first = cursor
            cursor -= 1

        found[key] = first

    return found
|
||||
|
||||
|
||||
def split_document(original_path: Path, output_dir: Path):
    """Split the original docx into the 5 template files.

    The archive is unpacked into a temporary directory, ``word/document.xml``
    is parsed, and the body children (excluding the body-level ``sectPr``) are
    partitioned using the start indices from ``find_section_markers``: each
    section owns the children from its start up to the next section's start.
    For every template the unpacked directory is copied, the body of its
    ``document.xml`` is replaced by that section's children followed by the
    ``sectPr`` (when one exists), and the copy is zipped into *output_dir*.

    Args:
        original_path: Path of the source .docx file.
        output_dir: Directory that receives the outputs; created if missing.
            An existing file with an output's name is deleted first.

    A section whose marker was not found is written with no body children.

    NOTE(review): ElementTree only writes namespace declarations that are
    actually used by the serialized tags/attributes. If the source root
    carries an ``mc:Ignorable`` attribute naming prefixes that are otherwise
    unused, those prefixes would be undeclared in the outputs — confirm the
    generated files open correctly in Word.
    """
    # (output file name, section key) for each template to write.
    targets = [
        ('mau-01-bao-cao-mo-ta.docx', 'mau01'),
        ('mau-02-don-de-nghi.docx', 'mau02'),
        ('mau-03-xac-nhan-ty-le.docx', 'mau03'),
        ('mau-04-phieu-danh-gia.docx', 'mau04'),
        ('ban-cam-ket.docx', 'camket'),
    ]

    with tempfile.TemporaryDirectory() as scratch_name:
        scratch = Path(scratch_name)

        # Unpack the original archive.
        unpacked = scratch / 'base'
        unpacked.mkdir()
        with zipfile.ZipFile(original_path, 'r') as archive:
            archive.extractall(unpacked)

        # Parse document.xml and locate the body.
        source_root = ET.parse(unpacked / 'word' / 'document.xml').getroot()
        source_body = source_root.find(f'{W}body')
        assert source_body is not None, 'No body element'

        # Separate the body-level sectPr (re-attached to every output) from
        # the content children.
        sect_pr = None
        content = []
        for node in source_body:
            if node.tag.replace(W, '') == 'sectPr':
                sect_pr = node
                continue
            content.append(node)

        # Section start indices (with walk-back to include header blocks),
        # ordered by position: mau01 at 0, then the others.
        starts = find_section_markers(content)
        ordered = sorted(starts.items(), key=lambda item: item[1])

        # Each section ends where the next one starts; the last one runs to
        # the end of the content.
        ends = [begin for _, begin in ordered[1:]]
        ends.append(len(content))

        sections = {key: [] for _, key in targets}
        for (name, begin), end in zip(ordered, ends):
            sections[name] = content[begin:end]

        output_dir.mkdir(parents=True, exist_ok=True)

        for filename, key in targets:
            destination = output_dir / filename

            # Work on a private copy of the unpacked package.
            workdir = scratch / f'clone-{key}'
            shutil.copytree(unpacked, workdir)

            doc_file = workdir / 'word' / 'document.xml'
            doc_tree = ET.parse(doc_file)
            doc_body = doc_tree.getroot().find(f'{W}body')
            assert doc_body is not None

            # Swap the body children for this section's, sectPr last.
            del doc_body[:]
            doc_body.extend(sections[key])
            if sect_pr is not None:
                doc_body.append(sect_pr)

            doc_tree.write(doc_file, encoding='UTF-8', xml_declaration=True)

            # Re-zip the working copy into the output file.
            if destination.exists():
                destination.unlink()
            with zipfile.ZipFile(destination, 'w', zipfile.ZIP_DEFLATED) as bundle:
                for member in workdir.rglob('*'):
                    if not member.is_file():
                        continue
                    bundle.write(member, member.relative_to(workdir))

            print(f'Wrote {destination.name} ({len(sections[key])} body children)')
|
||||
|
||||
|
||||
def main():
    """Split the repository's original report template into its 5 parts.

    Paths are resolved relative to the directory two levels above this file.
    Prints an error and exits with status 1 when the source template is
    missing; otherwise writes the outputs next to the source template.
    """
    project_root = Path(__file__).parent.parent
    source_docx = project_root / 'src/Backend/DYD.Api/Templates/bao-cao-template-original.docx'
    templates_dir = project_root / 'src/Backend/DYD.Api/Templates'

    # Nothing to split without the source template.
    if not source_docx.exists():
        print(f'ERROR: source template not found: {source_docx}')
        sys.exit(1)

    split_document(source_docx, templates_dir)
    print('\nDone. 5 template files created.')


if __name__ == '__main__':
    main()
|
||||
Reference in New Issue
Block a user