Files
sciagent/be0/tests/test_docx_normalize.py
Thinh Lam 688fac73e9
CI/CD / backend (push) Failing after 2m8s
CI/CD / frontend (push) Failing after 1m40s
CI/CD / deploy (push) Has been skipped
sciagent code + Gitea Actions CI/CD
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 09:38:30 +07:00

679 lines
33 KiB
Python

"""Tests for OOXML normalization used after docxtpl render."""
from __future__ import annotations
import io
import re
import unittest
import zipfile
from src.be01.docx_normalize import (
collapse_empty_page_break_paragraphs_in_docx,
force_times_new_roman_in_styles_docx,
move_signature_date_to_top_row,
normalize_bo_y_te_header_lines,
relax_justified_softbreak_paragraphs_in_docx,
shift_selected_header_lines_left,
strip_mau_04_evaluation_section_in_docx,
strip_table_row_height_rules_from_docx,
)
def _wrap_doc_in_zip(doc_xml: bytes) -> bytes:
    """Return a minimal in-memory ``.docx`` package holding *doc_xml*.

    The archive is deflate-compressed and contains exactly two members, in
    this order: ``word/document.xml`` (the given bytes, unmodified) and a
    stub ``[Content_Types].xml``.
    """
    content_types = b'<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"/>'
    sink = io.BytesIO()
    archive = zipfile.ZipFile(sink, mode="w", compression=zipfile.ZIP_DEFLATED)
    with archive:
        for part_name, payload in (
            ("word/document.xml", doc_xml),
            ("[Content_Types].xml", content_types),
        ):
            archive.writestr(part_name, payload)
    # Read the buffer only after the archive is closed: the ZIP central
    # directory is written on close.
    return sink.getvalue()
def _read_document_xml(docx_bytes: bytes) -> str:
    """Return the ``word/document.xml`` member of *docx_bytes* as UTF-8 text.

    *docx_bytes* is the raw content of a ZIP archive; a ``KeyError`` from
    ``zipfile`` propagates when the member is absent.
    """
    with zipfile.ZipFile(io.BytesIO(docx_bytes)) as archive:
        raw_xml = archive.read("word/document.xml")
    return raw_xml.decode("utf-8")
class DocxNormalizeTests(unittest.TestCase):
    """Regression tests for the OOXML normalization passes in ``docx_normalize``.

    Every test builds a minimal in-memory ``.docx`` package, runs one or more
    passes over it and asserts on the serialized XML of the result.  Packages
    that only need ``word/document.xml`` are built with the module-level
    ``_wrap_doc_in_zip`` / ``_read_document_xml`` helpers; packages that carry
    other parts (``styles.xml``, ``settings.xml``) use ``_build_docx`` and
    ``_read_part`` below.

    Several assertions look for an ``ns0:`` prefix: the passes under test are
    expected to re-serialize the ``w:`` namespace under that prefix
    (presumably an ElementTree round-trip -- verify against the module).
    """

    # Stub content-types part written into every package built by _build_docx.
    _CONTENT_TYPES_XML = (
        b'<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"/>'
    )
    # Prefix-agnostic patterns shared by many tests.
    _PARAGRAPH_RE = r"<[^>]*:p\b[^>]*>.*?</[^>]*:p>"
    _ROW_RE = r"<[^>]*:tr\b[^>]*>.*?</[^>]*:tr>"
    _PAGE_BREAK_RE = r'<[^>]*:br\s+[^>]*:type="page"'
    _JC_BOTH_RE = r'<[^>]*:jc\s+[^>]*:val="both"'

    @classmethod
    def _build_docx(cls, parts: dict[str, bytes]) -> bytes:
        """Zip *parts* (member name -> bytes, in order) plus the content-types stub."""
        buf = io.BytesIO()
        with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z:
            for member, payload in parts.items():
                z.writestr(member, payload)
            z.writestr("[Content_Types].xml", cls._CONTENT_TYPES_XML)
        return buf.getvalue()

    @staticmethod
    def _read_part(docx_bytes: bytes, member: str) -> str:
        """Return package member *member* of *docx_bytes* decoded as UTF-8."""
        with zipfile.ZipFile(io.BytesIO(docx_bytes)) as z:
            return z.read(member).decode("utf-8")

    @classmethod
    def _paragraphs(cls, doc: str, flags: int = re.DOTALL) -> list[str]:
        """Return every ``<*:p>...</*:p>`` block found in *doc*."""
        return re.findall(cls._PARAGRAPH_RE, doc, flags)

    @classmethod
    def _rows(cls, doc: str) -> list[str]:
        """Return every ``<*:tr>...</*:tr>`` block found in *doc*."""
        return re.findall(cls._ROW_RE, doc, re.DOTALL)

    @staticmethod
    def _paragraph_with(doc: str, needle: str) -> re.Match[str] | None:
        """Return the first paragraph match whose content includes *needle*."""
        return re.search(
            rf"<[^>]*:p\b[^>]*>.*?{re.escape(needle)}.*?</[^>]*:p>",
            doc,
            re.DOTALL | re.IGNORECASE,
        )

    def test_strip_tr_height_removes_self_closing(self) -> None:
        """A self-closing ``<w:trHeight .../>`` is removed together with its value."""
        xml = (
            b'<?xml version="1.0" encoding="UTF-8"?><w:document '
            b'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
            b"<w:tbl><w:tr><w:trPr>"
            b'<w:trHeight w:val="720" w:hRule="atLeast"/>'
            b"</w:trPr><w:tc><w:p><w:r><w:t>a</w:t></w:r></w:p></w:tc></w:tr></w:tbl>"
            b"</w:document>"
        )
        out = strip_table_row_height_rules_from_docx(_wrap_doc_in_zip(xml))
        doc = _read_document_xml(out)
        self.assertNotIn("trHeight", doc)
        self.assertNotIn("720", doc)

    def test_normalize_bo_y_te_strips_ministry_bold_centers(self) -> None:
        """The ministry line loses bold and is centered; the university line is
        bold and set in Times New Roman."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/><w:ind w:left="3600"/></w:pPr>
<w:r><w:rPr><w:b w:val="1"/><w:rFonts w:ascii="Arial"/></w:rPr><w:t>BỘ Y TẾ</w:t></w:r>
</w:p>
<w:p><w:pPr><w:jc w:val="center"/><w:ind w:left="-150"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>ĐẠI HỘC Y DƯỢC</w:t><w:br/><w:t>THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        phase1 = shift_selected_header_lines_left(_wrap_doc_in_zip(doc_xml))
        out = normalize_bo_y_te_header_lines(phase1)
        doc = _read_document_xml(out)
        ministry = self._paragraph_with(doc, "BỘ Y TẾ")
        self.assertIsNotNone(ministry)
        assert ministry is not None  # narrows the Optional for type checkers
        block = ministry.group(0)
        # Only the markup *before* the text can carry the run's bold flag.
        self.assertNotIn("ns0:b", block.split("BỘ Y TẾ")[0])
        self.assertIn('val="center"', block)
        uni = self._paragraph_with(doc, "ĐẠI HỘC Y DƯỢC")
        self.assertIsNotNone(uni)
        assert uni is not None
        self.assertIn("ns0:b", uni.group(0))
        self.assertIn("Times New Roman", uni.group(0))

    def test_university_letterhead_two_paragraphs_bold_centered(self) -> None:
        """Cover may use two paragraphs instead of one line break."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>ĐẠI HỘC Y DƯỢC</w:t></w:r>
</w:p>
<w:p><w:pPr><w:jc w:val="right"/><w:ind w:left="-150"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        out = normalize_bo_y_te_header_lines(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        for label, needle in (
            ("dhyd", "ĐẠI HỘC Y DƯỢC"),
            ("tphcm", "THÀNH PHỐ HỒ CHÍ MINH"),
        ):
            # subTest keeps checking the second phrase even if the first fails.
            with self.subTest(label=label):
                blk = self._paragraph_with(doc, needle)
                self.assertIsNotNone(blk, msg=label)
                assert blk is not None
                b = blk.group(0)
                self.assertIn("ns0:b", b, msg=label)
                self.assertIn('val="center"', b, msg=label)

    def test_university_letterhead_one_paragraph_gets_soft_break_inserted(self) -> None:
        """When both letterhead phrases share one paragraph on a single visual line, a
        soft <w:br/> is inserted before the city line so the cover renders on two lines.
        Also asserts the runs end up bold + upright (no italic) + Times New Roman."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/></w:pPr>
<w:r><w:rPr><w:i w:val="1"/><w:rFonts w:ascii="Arial"/></w:rPr><w:t>ĐẠI HỘC Y DƯỢC THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        out = normalize_bo_y_te_header_lines(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        # A soft break should now sit between the two phrases.
        self.assertRegex(
            doc,
            r"ĐẠI HỘC Y DƯỢC.*?<[^>]*:br[^>]*/?>.*?THÀNH PHỐ HỒ CHÍ MINH",
        )
        # Paragraph is centered, runs are bold + not italic + Times New Roman.
        self.assertIn('val="center"', doc)
        self.assertIn("ns0:b", doc)
        self.assertIn('ns0:i ns0:val="0"', doc)
        self.assertIn("Times New Roman", doc)

    def test_university_letterhead_soft_break_idempotent(self) -> None:
        """Running normalize twice should not stack additional <w:br/> elements between
        the letterhead phrases."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>ĐẠI HỘC Y DƯỢC THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        once = normalize_bo_y_te_header_lines(_wrap_doc_in_zip(doc_xml))
        twice = normalize_bo_y_te_header_lines(once)
        doc = _read_document_xml(twice)
        br_count = len(re.findall(r"<[^>]*:br\b[^>]*/?>", doc))
        self.assertEqual(br_count, 1, msg=f"expected exactly one <w:br/>, got {br_count}: {doc!r}")

    def test_first_page_scope_second_bo_te_unchanged(self) -> None:
        """Only the cover « BỘ Y TẾ » is stripped of bold; a later duplicate keeps bold."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:rPr><w:b w:val="1"/></w:rPr><w:t>BỘ Y TẾ</w:t></w:r></w:p>
<w:p><w:r><w:t>ĐẠI HỘC Y DƯỢC</w:t><w:br/><w:t>THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:rPr><w:b w:val="1"/></w:rPr><w:t>BỘ Y TẾ</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = normalize_bo_y_te_header_lines(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        paras = self._paragraphs(doc, re.DOTALL | re.IGNORECASE)
        self.assertEqual(len(paras), 4, msg="expected 4 paragraphs")
        first_bo = paras[0]
        late_bo = paras[3]
        self.assertNotIn("ns0:b", first_bo.split("BỘ Y TẾ")[0])
        self.assertIn("ns0:b", late_bo)

    def test_move_signature_date_creates_full_width_top_row(self) -> None:
        """The date paragraph is lifted into a single-cell top row spanning every column."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:tbl>
<w:tblGrid><w:gridCol w:w="4702"/><w:gridCol w:w="4702"/></w:tblGrid>
<w:tr>
<w:tc><w:p><w:r><w:t>LÃNH ĐẠO ĐƠN VỊ</w:t></w:r></w:p><w:p><w:r><w:t>(Ký, ghi rõ họ tên)</w:t></w:r></w:p></w:tc>
<w:tc><w:p><w:r><w:t>Tp. Hồ Chí Minh, ngày 11 tháng 5 năm 2026</w:t></w:r></w:p><w:p><w:r><w:t>Tác giả sáng kiến</w:t></w:r></w:p></w:tc>
</w:tr>
</w:tbl>
</w:body></w:document>""".encode("utf-8")
        out = move_signature_date_to_top_row(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        rows = self._rows(doc)
        self.assertEqual(len(rows), 2, msg=f"expected 2 rows after lift, got: {doc!r}")
        first_row, second_row = rows
        # Top row: single cell, gridSpan=2, contains the date, right-aligned.
        self.assertEqual(first_row.count("<ns0:tc>") + first_row.count("<ns0:tc "), 1)
        self.assertRegex(first_row, r'<[^>]*:gridSpan\s+[^>]*:val="2"')
        self.assertIn("Tp. Hồ Chí Minh, ngày 11 tháng 5 năm 2026", first_row)
        self.assertRegex(first_row, r'<[^>]*:jc\s+[^>]*:val="right"')
        # Second row: original 2 cells. Right cell starts with "Tác giả sáng kiến"
        # (no date paragraph anymore), so it aligns with "LÃNH ĐẠO ĐƠN VỊ".
        self.assertNotIn("Tp. Hồ Chí Minh, ngày", second_row)
        self.assertIn("LÃNH ĐẠO ĐƠN VỊ", second_row)
        self.assertIn("Tác giả sáng kiến", second_row)

    def test_move_signature_date_is_idempotent(self) -> None:
        """A second pass over an already-lifted table is a no-op (still exactly 2 rows)."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:tbl>
<w:tblGrid><w:gridCol w:w="4702"/><w:gridCol w:w="4702"/></w:tblGrid>
<w:tr>
<w:tc><w:p><w:r><w:t>LÃNH ĐẠO ĐƠN VỊ</w:t></w:r></w:p></w:tc>
<w:tc><w:p><w:r><w:t>Tp. Hồ Chí Minh, ngày 11 tháng 5 năm 2026</w:t></w:r></w:p><w:p><w:r><w:t>Tác giả sáng kiến</w:t></w:r></w:p></w:tc>
</w:tr>
</w:tbl>
</w:body></w:document>""".encode("utf-8")
        once = move_signature_date_to_top_row(_wrap_doc_in_zip(doc_xml))
        twice = move_signature_date_to_top_row(once)
        doc = _read_document_xml(twice)
        rows = self._rows(doc)
        self.assertEqual(len(rows), 2, msg=f"expected 2 rows after second pass, got: {doc!r}")
        date_hits = doc.count("Tp. Hồ Chí Minh, ngày")
        self.assertEqual(date_hits, 1, msg=f"date should appear exactly once, got {date_hits}")

    def test_move_signature_date_skips_table_without_date(self) -> None:
        """Tables that do not contain the date prefix are left untouched."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:tbl>
<w:tblGrid><w:gridCol w:w="4702"/><w:gridCol w:w="4702"/></w:tblGrid>
<w:tr><w:tc><w:p><w:r><w:t>cell A</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>cell B</w:t></w:r></w:p></w:tc></w:tr>
</w:tbl>
</w:body></w:document>""".encode("utf-8")
        out = move_signature_date_to_top_row(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        rows = self._rows(doc)
        self.assertEqual(len(rows), 1, msg="non-signature tables must be left untouched")

    def test_relax_justified_splits_paragraph_at_soft_break_in_run(self) -> None:
        """Justified paragraph with a soft <w:br/> mid-run is split into two paragraphs.
        Both fragments keep <w:jc w:val="both"/> so the layout stays justified, and the
        line that used to be followed by the soft break (« first chunk ») becomes the
        last line of its own paragraph -> stops being stretched."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="both"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>first chunk</w:t><w:br/><w:t>second chunk</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        out = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertNotRegex(
            doc,
            r"<[^>]*:br\b(?![^>]*:type=\"page\")[^>]*/?>",
            msg="soft <w:br/> should be consumed by the split",
        )
        paras = self._paragraphs(doc)
        self.assertEqual(len(paras), 2, msg=f"expected 2 paragraphs after split: {doc!r}")
        for p in paras:
            self.assertRegex(p, self._JC_BOTH_RE)
            self.assertIn("Arial", p)  # run properties preserved on both fragments
        self.assertIn("first chunk", paras[0])
        self.assertIn("second chunk", paras[1])
        self.assertNotIn("second chunk", paras[0])

    def test_relax_justified_distribute_becomes_both(self) -> None:
        """`distribute` stretches every line including the last; rewrite it to `both`."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="distribute"/></w:pPr><w:r><w:t>solo line</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertNotIn('val="distribute"', doc)
        self.assertRegex(doc, self._JC_BOTH_RE)

    def test_relax_justified_rewrites_distribute_in_styles_xml(self) -> None:
        """Paragraph styles may use ``distribute``; rewrite so body text is justified like Word ``both``."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>x</w:t></w:r></w:p></w:body></w:document>""".encode("utf-8")
        styles_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:style w:type="paragraph" w:styleId="BodyText">
<w:pPr><w:jc w:val="distribute"/></w:pPr>
</w:style>
</w:styles>""".encode("utf-8")
        package = self._build_docx(
            {"word/document.xml": doc_xml, "word/styles.xml": styles_xml}
        )
        out = relax_justified_softbreak_paragraphs_in_docx(package)
        styles = self._read_part(out, "word/styles.xml")
        self.assertNotIn('val="distribute"', styles)
        self.assertIn('val="both"', styles)

    def test_relax_justified_merges_do_not_expand_shift_return_in_settings(self) -> None:
        """Compatibility flag so lines ending in soft breaks are not fully stretched when justified."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>x</w:t></w:r></w:p></w:body></w:document>""".encode("utf-8")
        settings_xml = b"""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:settings xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:zoom w:percent="100"/>
</w:settings>"""
        package = self._build_docx(
            {"word/document.xml": doc_xml, "word/settings.xml": settings_xml}
        )
        out = relax_justified_softbreak_paragraphs_in_docx(package)
        settings = self._read_part(out, "word/settings.xml")
        self.assertIn("doNotExpandShiftReturn", settings)
        self.assertRegex(settings, r'doNotExpandShiftReturn[^>]*val="1"')

    def test_relax_justified_preserves_non_justified_paragraphs(self) -> None:
        """Soft breaks in non-justified paragraphs are left alone (no surprise splits)."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/></w:pPr>
<w:r><w:t>line1</w:t><w:br/><w:t>line2</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        out = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        paras = self._paragraphs(doc)
        self.assertEqual(len(paras), 1, msg="left-aligned paragraphs must not be split")
        self.assertRegex(doc, r"<[^>]*:br\b[^>]*/?>", msg="soft break should survive")

    def test_relax_justified_preserves_page_break(self) -> None:
        """Page breaks (`<w:br w:type="page"/>`) are NOT treated as soft breaks."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="both"/></w:pPr>
<w:r><w:t>before</w:t><w:br w:type="page"/><w:t>after</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        out = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        paras = self._paragraphs(doc)
        self.assertEqual(len(paras), 1, msg="page breaks must not trigger paragraph split")
        self.assertRegex(doc, self._PAGE_BREAK_RE)

    def test_relax_justified_idempotent(self) -> None:
        """Running twice produces the same output as running once."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="both"/></w:pPr>
<w:r><w:t>aaa</w:t><w:br/><w:t>bbb</w:t><w:br/><w:t>ccc</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        once = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        twice = relax_justified_softbreak_paragraphs_in_docx(once)
        first_pass_xml = _read_document_xml(once)
        self.assertEqual(
            first_pass_xml,
            _read_document_xml(twice),
            msg="second pass should be a no-op",
        )
        paras = self._paragraphs(first_pass_xml)
        self.assertEqual(len(paras), 3, msg="two soft breaks should yield three paragraphs")

    def test_strip_mau_04_removes_section_between_page_breaks(self) -> None:
        """Body order: mau_03 sig, page-break, letterhead table, « Mẫu số 04 », content,
        page-break, « Bản cam kết ». Strip should drop everything from the leading
        page-break paragraph through the last Mẫu số 04 content paragraph (inclusive),
        keeping the trailing page-break paragraph that opens « Bản cam kết »."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>{{ mau_03.tac_gia_chinh_ky }}</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:tbl><w:tr><w:tc><w:p><w:r><w:t>BỘ Y TẾ</w:t></w:r></w:p></w:tc></w:tr></w:tbl>
<w:p><w:r><w:t>Mẫu số 04</w:t></w:r></w:p>
<w:p><w:r><w:t>PHIẾU ĐÁNH GIÁ SÁNG KIẾN</w:t></w:r></w:p>
<w:p><w:r><w:t>1. Tên sáng kiến: {{ mau_04.ten_sang_kien }}</w:t></w:r></w:p>
<w:p><w:r><w:t>Kết luận: {{ mau_04.ket_luan }}</w:t></w:r></w:p>
<w:p><w:r><w:t>{{ mau_04.thanh_vien_hoi_dong }}</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM</w:t></w:r></w:p>
<w:p><w:r><w:t>BẢN CAM KẾT</w:t></w:r></w:p>
<w:sectPr/>
</w:body></w:document>""".encode("utf-8")
        out = strip_mau_04_evaluation_section_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertNotIn("Mẫu số 04", doc)
        self.assertNotIn("PHIẾU ĐÁNH GIÁ", doc)
        self.assertNotIn("mau_04", doc)
        # The leading page break + letterhead + content are gone, but the trailing
        # page-break paragraph (now the only page break) must survive so Bản cam kết
        # still starts on its own page.
        page_breaks = re.findall(self._PAGE_BREAK_RE, doc)
        self.assertEqual(len(page_breaks), 1, msg=f"expected 1 page break, got {len(page_breaks)}: {doc!r}")
        self.assertIn("mau_03.tac_gia_chinh_ky", doc)
        self.assertIn("BẢN CAM KẾT", doc)
        self.assertIn("CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM", doc)
        # sectPr must survive the trim.
        self.assertRegex(doc, r"<[^>]*:sectPr")

    def test_strip_mau_04_is_idempotent(self) -> None:
        """Second pass over an already-stripped document is a no-op."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>mau_03 end</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>Mẫu số 04</w:t></w:r></w:p>
<w:p><w:r><w:t>content</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>BẢN CAM KẾT</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        once = strip_mau_04_evaluation_section_in_docx(_wrap_doc_in_zip(doc_xml))
        twice = strip_mau_04_evaluation_section_in_docx(once)
        first_pass_xml = _read_document_xml(once)
        self.assertEqual(first_pass_xml, _read_document_xml(twice))
        self.assertNotIn("Mẫu số 04", first_pass_xml)

    def test_strip_mau_04_noop_when_marker_missing(self) -> None:
        """Documents that don't carry the « Mẫu số 04 » header are left untouched."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>only mau_03</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>BẢN CAM KẾT</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = strip_mau_04_evaluation_section_in_docx(_wrap_doc_in_zip(doc_xml))
        after = _read_document_xml(out)
        # Allow whitespace / declaration differences from ElementTree round-trip; the
        # human-readable text content must be unchanged.
        for needle in ("only mau_03", "BẢN CAM KẾT"):
            self.assertIn(needle, after)
        self.assertNotIn("Mẫu số 04", after)

    def test_strip_mau_04_bails_out_without_leading_page_break(self) -> None:
        """If there's no page break before the Mẫu số 04 header (malformed template),
        leave the document alone instead of removing the previous section by mistake."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>previous section content</w:t></w:r></w:p>
<w:p><w:r><w:t>Mẫu số 04</w:t></w:r></w:p>
<w:p><w:r><w:t>{{ mau_04.ten_sang_kien }}</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = strip_mau_04_evaluation_section_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertIn("previous section content", doc)
        self.assertIn("Mẫu số 04", doc, msg="strip must not run when leading page break is missing")

    def test_collapse_empty_pagebreak_before_table_uses_pagebreakbefore(self) -> None:
        """An empty paragraph that hosts only ``<w:br w:type="page"/>`` followed by a
        table is removed; the first paragraph in the first cell of the table gets
        ``<w:pageBreakBefore/>`` so the table anchors to a new page without an
        intervening empty body paragraph."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>previous content</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:tbl>
<w:tr>
<w:tc><w:p><w:r><w:t>letterhead cell</w:t></w:r></w:p></w:tc>
</w:tr>
</w:tbl>
<w:p><w:r><w:t>next section paragraph</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = collapse_empty_page_break_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertNotRegex(
            doc,
            self._PAGE_BREAK_RE,
            msg="inline <w:br w:type=\"page\"/> should be replaced",
        )
        self.assertRegex(doc, r"<[^>]*:pageBreakBefore")
        # The empty page-break paragraph is gone but original content survives.
        self.assertIn("previous content", doc)
        self.assertIn("letterhead cell", doc)
        self.assertIn("next section paragraph", doc)

    def test_collapse_empty_pagebreak_before_paragraph(self) -> None:
        """Empty page-break paragraph followed by a non-empty paragraph: the empty
        paragraph is removed and ``<w:pageBreakBefore/>`` is added to the next paragraph
        so it starts on a new page."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>A</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:pPr><w:jc w:val="center"/></w:pPr><w:r><w:t>B</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = collapse_empty_page_break_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        # Exactly two body paragraphs left (empty break paragraph collapsed).
        paras = self._paragraphs(doc)
        self.assertEqual(len(paras), 2, msg=f"expected 2 paragraphs, got {len(paras)}: {doc!r}")
        # The B paragraph keeps its center alignment AND gains pageBreakBefore.
        # A defaulted next() turns "not found" into a test failure, not an error.
        b_para = next((p for p in paras if "B<" in p), None)
        self.assertIsNotNone(b_para, msg=f"paragraph B not found: {doc!r}")
        assert b_para is not None
        self.assertIn("pageBreakBefore", b_para)
        self.assertRegex(b_para, r'<[^>]*:jc\s+[^>]*:val="center"')

    def test_collapse_empty_pagebreak_idempotent(self) -> None:
        """Second pass produces the same output as first pass."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>A</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>B</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        once = collapse_empty_page_break_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        twice = collapse_empty_page_break_paragraphs_in_docx(once)
        first_pass_xml = _read_document_xml(once)
        self.assertEqual(first_pass_xml, _read_document_xml(twice))
        # And exactly one pageBreakBefore in the result (not double-registered).
        pbb_count = len(re.findall(r"<[^>]*:pageBreakBefore", first_pass_xml))
        self.assertEqual(pbb_count, 1)

    def test_collapse_empty_pagebreak_preserves_text_carrying_breaks(self) -> None:
        """A paragraph that carries real text *and* an inline page break (rare; usually
        Word-edited) must not be collapsed: dropping the text would lose content."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>visible text</w:t><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>after</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = collapse_empty_page_break_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertRegex(doc, self._PAGE_BREAK_RE, msg="break must survive")
        self.assertIn("visible text", doc)
        self.assertIn("after", doc)
        self.assertNotIn("pageBreakBefore", doc)

    def test_force_times_new_roman_styles(self) -> None:
        """Style-level ``rFonts`` no longer mention Calibri; Times New Roman appears."""
        styles_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:docDefaults><w:rPrDefault><w:rPr><w:sz w:val="26"/></w:rPr></w:rPrDefault></w:docDefaults>
<w:style w:styleId="Heading1"><w:rPr><w:rFonts w:ascii="Calibri" w:hAnsi="Calibri" w:cs="Calibri" w:eastAsia="Calibri"/></w:rPr></w:style>
</w:styles>"""
        # This package deliberately has no word/document.xml: only styles matter.
        package = self._build_docx({"word/styles.xml": styles_xml})
        out = force_times_new_roman_in_styles_docx(package)
        st = self._read_part(out, "word/styles.xml")
        self.assertNotIn("Calibri", st)
        self.assertIn("Times New Roman", st)
if __name__ == "__main__":
    # Let the module be executed directly (``python test_docx_normalize.py``)
    # in addition to being collected by a test runner.
    unittest.main()