# sciagent/be0/tests/test_docx_normalize.py

"""Tests for OOXML normalization used after docxtpl render."""

from __future__ import annotations

import io
import re
import unittest
import zipfile

from src.be01.docx_normalize import (
    collapse_empty_page_break_paragraphs_in_docx,
    force_times_new_roman_in_styles_docx,
    move_signature_date_to_top_row,
    normalize_bo_y_te_header_lines,
    relax_justified_softbreak_paragraphs_in_docx,
    shift_selected_header_lines_left,
    strip_mau_04_evaluation_section_in_docx,
    strip_table_row_height_rules_from_docx,
)


def _wrap_doc_in_zip(doc_xml: bytes) -> bytes:
    """Pack ``doc_xml`` into a minimal in-memory zip shaped like a DOCX package.

    Two deflated members are written, in this order: ``word/document.xml``
    (the bytes given) and a stub ``[Content_Types].xml``. The raw archive
    bytes are returned.
    """
    members = (
        ("word/document.xml", doc_xml),
        (
            "[Content_Types].xml",
            b'<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"/>',
        ),
    )
    stream = io.BytesIO()
    with zipfile.ZipFile(stream, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        for member_name, payload in members:
            archive.writestr(member_name, payload)
    return stream.getvalue()


def _read_document_xml(docx_bytes: bytes) -> str:
    """Return the ``word/document.xml`` member of the zip in ``docx_bytes`` as UTF-8 text."""
    with zipfile.ZipFile(io.BytesIO(docx_bytes)) as archive:
        raw = archive.read("word/document.xml")
    return str(raw, "utf-8")


class DocxNormalizeTests(unittest.TestCase):
    """Exercise the DOCX normalization passes on minimal in-memory packages.

    Each test builds a tiny zip holding only the parts the pass under test
    touches, runs the pass, and inspects the rewritten XML as text. Several
    assertions look for the ``ns0:`` prefix, i.e. they expect the passes to
    re-serialize the WordprocessingML namespace under that prefix.
    """

    # Stub content-types part written into every hand-built package.
    _CONTENT_TYPES_XML = (
        b'<?xml version="1.0"?>'
        b'<Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"/>'
    )

    # Matches a bold element (``<ns0:b/>``, ``<ns0:b />``, ``<ns0:b ns0:val=...>``)
    # but not ``<ns0:body>``, ``<ns0:br/>`` or ``<ns0:bCs/>``, which the plain
    # substring "ns0:b" would also hit and make a positive check vacuous.
    _BOLD_TAG_RE = r"<ns0:b[\s/>]"

    @classmethod
    def _zip_parts(cls, parts: dict[str, bytes]) -> bytes:
        """Zip ``parts`` (member name -> payload, in dict order) plus the stub
        ``[Content_Types].xml`` and return the archive bytes.

        Used by tests that need parts other than ``word/document.xml``.
        """
        buf = io.BytesIO()
        with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z:
            for name, payload in parts.items():
                z.writestr(name, payload)
            z.writestr("[Content_Types].xml", cls._CONTENT_TYPES_XML)
        return buf.getvalue()

    @staticmethod
    def _read_part(docx_bytes: bytes, name: str) -> str:
        """Return member ``name`` of the zip in ``docx_bytes`` decoded as UTF-8."""
        with zipfile.ZipFile(io.BytesIO(docx_bytes)) as z:
            return z.read(name).decode("utf-8")

    def test_strip_tr_height_removes_self_closing(self) -> None:
        """A self-closing ``<w:trHeight .../>`` and its value disappear from the output."""
        xml = (
            b'<?xml version="1.0" encoding="UTF-8"?><w:document '
            b'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
            b"<w:tbl><w:tr><w:trPr>"
            b'<w:trHeight w:val="720" w:hRule="atLeast"/>'
            b"</w:trPr><w:tc><w:p><w:r><w:t>a</w:t></w:r></w:p></w:tc></w:tr></w:tbl>"
            b"</w:document>"
        )
        out = strip_table_row_height_rules_from_docx(_wrap_doc_in_zip(xml))
        doc = _read_document_xml(out)
        self.assertNotIn("trHeight", doc)
        self.assertNotIn("720", doc)

    def test_normalize_bo_y_te_strips_ministry_bold_centers(self) -> None:
        """After the shift + normalize passes the « BỘ Y TẾ » line is centered with
        no bold before its text, and the university line is bold Times New Roman."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/><w:ind w:left="3600"/></w:pPr>
<w:r><w:rPr><w:b w:val="1"/><w:rFonts w:ascii="Arial"/></w:rPr><w:t>BỘ Y TẾ</w:t></w:r>
</w:p>
<w:p><w:pPr><w:jc w:val="center"/><w:ind w:left="-150"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>ĐẠI HỘC Y DƯỢC</w:t><w:br/><w:t>THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r>
</w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        phase1 = shift_selected_header_lines_left(_wrap_doc_in_zip(doc_xml))
        out = normalize_bo_y_te_header_lines(phase1)
        doc = _read_document_xml(out)
        ministry = re.search(r"<[^>]*:p\b[^>]*>.*?BỘ Y TẾ.*?</[^>]*:p>", doc, re.DOTALL | re.IGNORECASE)
        self.assertIsNotNone(ministry)
        assert ministry is not None  # narrows the Optional for type checkers
        block = ministry.group(0)
        self.assertNotIn("ns0:b", block.split("BỘ Y TẾ")[0])
        self.assertIn('val="center"', block)
        uni = re.search(r"<[^>]*:p\b[^>]*>.*?ĐẠI HỘC Y DƯỢC.*?</[^>]*:p>", doc, re.DOTALL | re.IGNORECASE)
        self.assertIsNotNone(uni)
        assert uni is not None  # narrows the Optional for type checkers
        # The paragraph contains a soft <w:br/>, so a plain "ns0:b" substring
        # check would pass even without any bold element.
        self.assertRegex(uni.group(0), self._BOLD_TAG_RE)
        self.assertIn("Times New Roman", uni.group(0))

    def test_university_letterhead_two_paragraphs_bold_centered(self) -> None:
        """Cover may use two paragraphs instead of one line break."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>ĐẠI HỘC Y DƯỢC</w:t></w:r>
</w:p>
<w:p><w:pPr><w:jc w:val="right"/><w:ind w:left="-150"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r>
</w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = normalize_bo_y_te_header_lines(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        for label, needle in (
            ("dhyd", "ĐẠI HỘC Y DƯỢC"),
            ("tphcm", "THÀNH PHỐ HỒ CHÍ MINH"),
        ):
            blk = re.search(
                rf"<[^>]*:p\b[^>]*>.*?{re.escape(needle)}.*?</[^>]*:p>",
                doc,
                re.DOTALL | re.IGNORECASE,
            )
            self.assertIsNotNone(blk, msg=label)
            assert blk is not None  # narrows the Optional for type checkers
            b = blk.group(0)
            self.assertRegex(b, self._BOLD_TAG_RE, msg=label)
            self.assertIn('val="center"', b, msg=label)

    def test_university_letterhead_one_paragraph_gets_soft_break_inserted(self) -> None:
        """When both letterhead phrases share one paragraph on a single visual line, a
        soft <w:br/> is inserted before the city line so the cover renders on two lines.

        Also asserts the runs end up bold + upright (no italic) + Times New Roman."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/></w:pPr>
<w:r><w:rPr><w:i w:val="1"/><w:rFonts w:ascii="Arial"/></w:rPr><w:t>ĐẠI HỘC Y DƯỢC THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r>
</w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = normalize_bo_y_te_header_lines(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        # A soft break should now sit between the two phrases.
        self.assertRegex(
            doc,
            r"ĐẠI HỘC Y DƯỢC.*?<[^>]*:br[^>]*/?>.*?THÀNH PHỐ HỒ CHÍ MINH",
        )
        # Paragraph is centered, runs are bold + not italic + Times New Roman.
        self.assertIn('val="center"', doc)
        # Whole-document check: "ns0:b" alone would always match "ns0:body".
        self.assertRegex(doc, self._BOLD_TAG_RE)
        self.assertIn('ns0:i ns0:val="0"', doc)
        self.assertIn("Times New Roman", doc)

    def test_university_letterhead_soft_break_idempotent(self) -> None:
        """Running normalize twice should not stack additional <w:br/> elements between
        the letterhead phrases."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>ĐẠI HỘC Y DƯỢC THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r>
</w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        once = normalize_bo_y_te_header_lines(_wrap_doc_in_zip(doc_xml))
        twice = normalize_bo_y_te_header_lines(once)
        doc = _read_document_xml(twice)
        br_count = len(re.findall(r"<[^>]*:br\b[^>]*/?>", doc))
        self.assertEqual(br_count, 1, msg=f"expected exactly one <w:br/>, got {br_count}: {doc!r}")

    def test_first_page_scope_second_bo_te_unchanged(self) -> None:
        """Only the cover « BỘ Y TẾ » is stripped of bold; a later duplicate keeps bold."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:rPr><w:b w:val="1"/></w:rPr><w:t>BỘ Y TẾ</w:t></w:r></w:p>
<w:p><w:r><w:t>ĐẠI HỘC Y DƯỢC</w:t><w:br/><w:t>THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:rPr><w:b w:val="1"/></w:rPr><w:t>BỘ Y TẾ</w:t></w:r></w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = normalize_bo_y_te_header_lines(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        paras = re.findall(r"<[^>]*:p\b[^>]*>.*?</[^>]*:p>", doc, re.DOTALL | re.IGNORECASE)
        self.assertEqual(len(paras), 4, msg="expected 4 paragraphs")
        first_bo = paras[0]
        late_bo = paras[3]
        self.assertNotIn("ns0:b", first_bo.split("BỘ Y TẾ")[0])
        self.assertRegex(late_bo, self._BOLD_TAG_RE)

    def test_move_signature_date_creates_full_width_top_row(self) -> None:
        """The date paragraph is lifted into a single-cell top row spanning every column."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:tbl>
<w:tblGrid><w:gridCol w:w="4702"/><w:gridCol w:w="4702"/></w:tblGrid>
<w:tr>
<w:tc><w:p><w:r><w:t>LÃNH ĐẠO ĐƠN VỊ</w:t></w:r></w:p><w:p><w:r><w:t>(Ký, ghi rõ họ tên)</w:t></w:r></w:p></w:tc>
<w:tc><w:p><w:r><w:t>Tp. Hồ Chí Minh, ngày 11 tháng 5 năm 2026</w:t></w:r></w:p><w:p><w:r><w:t>Tác giả sáng kiến</w:t></w:r></w:p></w:tc>
</w:tr>
</w:tbl>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = move_signature_date_to_top_row(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)

        rows = re.findall(r"<[^>]*:tr\b[^>]*>.*?</[^>]*:tr>", doc, re.DOTALL)
        self.assertEqual(len(rows), 2, msg=f"expected 2 rows after lift, got: {doc!r}")

        first_row, second_row = rows
        # Top row: single cell, gridSpan=2, contains the date, right-aligned.
        self.assertEqual(first_row.count("<ns0:tc>") + first_row.count("<ns0:tc "), 1)
        self.assertRegex(first_row, r'<[^>]*:gridSpan\s+[^>]*:val="2"')
        self.assertIn("Tp. Hồ Chí Minh, ngày 11 tháng 5 năm 2026", first_row)
        self.assertRegex(first_row, r'<[^>]*:jc\s+[^>]*:val="right"')

        # Second row: original 2 cells. Right cell starts with "Tác giả sáng kiến"
        # (no date paragraph anymore), so it aligns with "LÃNH ĐẠO ĐƠN VỊ".
        self.assertNotIn("Tp. Hồ Chí Minh, ngày", second_row)
        self.assertIn("LÃNH ĐẠO ĐƠN VỊ", second_row)
        self.assertIn("Tác giả sáng kiến", second_row)

    def test_move_signature_date_is_idempotent(self) -> None:
        """A second pass over an already-lifted table is a no-op (still exactly 2 rows)."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:tbl>
<w:tblGrid><w:gridCol w:w="4702"/><w:gridCol w:w="4702"/></w:tblGrid>
<w:tr>
<w:tc><w:p><w:r><w:t>LÃNH ĐẠO ĐƠN VỊ</w:t></w:r></w:p></w:tc>
<w:tc><w:p><w:r><w:t>Tp. Hồ Chí Minh, ngày 11 tháng 5 năm 2026</w:t></w:r></w:p><w:p><w:r><w:t>Tác giả sáng kiến</w:t></w:r></w:p></w:tc>
</w:tr>
</w:tbl>
</w:body></w:document>""".encode(
            "utf-8"
        )
        once = move_signature_date_to_top_row(_wrap_doc_in_zip(doc_xml))
        twice = move_signature_date_to_top_row(once)
        doc = _read_document_xml(twice)
        rows = re.findall(r"<[^>]*:tr\b[^>]*>.*?</[^>]*:tr>", doc, re.DOTALL)
        self.assertEqual(len(rows), 2, msg=f"expected 2 rows after second pass, got: {doc!r}")
        date_hits = doc.count("Tp. Hồ Chí Minh, ngày")
        self.assertEqual(date_hits, 1, msg=f"date should appear exactly once, got {date_hits}")

    def test_move_signature_date_skips_table_without_date(self) -> None:
        """Tables that do not contain the date prefix are left untouched."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:tbl>
<w:tblGrid><w:gridCol w:w="4702"/><w:gridCol w:w="4702"/></w:tblGrid>
<w:tr><w:tc><w:p><w:r><w:t>cell A</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>cell B</w:t></w:r></w:p></w:tc></w:tr>
</w:tbl>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = move_signature_date_to_top_row(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        rows = re.findall(r"<[^>]*:tr\b[^>]*>.*?</[^>]*:tr>", doc, re.DOTALL)
        self.assertEqual(len(rows), 1, msg="non-signature tables must be left untouched")

    def test_relax_justified_splits_paragraph_at_soft_break_in_run(self) -> None:
        """Justified paragraph with a soft <w:br/> mid-run is split into two paragraphs.
        Both fragments keep <w:jc w:val="both"/> so the layout stays justified, and the
        line that used to be followed by the soft break (« first chunk ») becomes the
        last line of its own paragraph -> stops being stretched."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="both"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>first chunk</w:t><w:br/><w:t>second chunk</w:t></w:r>
</w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertNotRegex(
            doc, r"<[^>]*:br\b(?![^>]*:type=\"page\")[^>]*/?>",
            msg="soft <w:br/> should be consumed by the split",
        )
        paras = re.findall(r"<[^>]*:p\b[^>]*>.*?</[^>]*:p>", doc, re.DOTALL)
        self.assertEqual(len(paras), 2, msg=f"expected 2 paragraphs after split: {doc!r}")
        for p in paras:
            self.assertRegex(p, r'<[^>]*:jc\s+[^>]*:val="both"')
            self.assertIn("Arial", p)  # run properties preserved on both fragments
        self.assertIn("first chunk", paras[0])
        self.assertIn("second chunk", paras[1])
        self.assertNotIn("second chunk", paras[0])

    def test_relax_justified_distribute_becomes_both(self) -> None:
        """`distribute` stretches every line including the last; rewrite it to `both`."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="distribute"/></w:pPr><w:r><w:t>solo line</w:t></w:r></w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertNotIn('val="distribute"', doc)
        self.assertRegex(doc, r'<[^>]*:jc\s+[^>]*:val="both"')

    def test_relax_justified_rewrites_distribute_in_styles_xml(self) -> None:
        """Paragraph styles may use ``distribute``; rewrite so body text is justified like Word ``both``."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>x</w:t></w:r></w:p></w:body></w:document>""".encode("utf-8")
        styles_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:style w:type="paragraph" w:styleId="BodyText">
<w:pPr><w:jc w:val="distribute"/></w:pPr>
</w:style>
</w:styles>""".encode("utf-8")
        package = self._zip_parts(
            {"word/document.xml": doc_xml, "word/styles.xml": styles_xml}
        )
        out = relax_justified_softbreak_paragraphs_in_docx(package)
        styles = self._read_part(out, "word/styles.xml")
        self.assertNotIn('val="distribute"', styles)
        self.assertIn('val="both"', styles)

    def test_relax_justified_merges_do_not_expand_shift_return_in_settings(self) -> None:
        """Compatibility flag so lines ending in soft breaks are not fully stretched when justified."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>x</w:t></w:r></w:p></w:body></w:document>""".encode("utf-8")
        settings_xml = b"""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:settings xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:zoom w:percent="100"/>
</w:settings>"""
        package = self._zip_parts(
            {"word/document.xml": doc_xml, "word/settings.xml": settings_xml}
        )
        out = relax_justified_softbreak_paragraphs_in_docx(package)
        settings = self._read_part(out, "word/settings.xml")
        self.assertIn("doNotExpandShiftReturn", settings)
        self.assertRegex(settings, r'doNotExpandShiftReturn[^>]*val="1"')

    def test_relax_justified_preserves_non_justified_paragraphs(self) -> None:
        """Soft breaks in non-justified paragraphs are left alone (no surprise splits)."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/></w:pPr>
<w:r><w:t>line1</w:t><w:br/><w:t>line2</w:t></w:r>
</w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        paras = re.findall(r"<[^>]*:p\b[^>]*>.*?</[^>]*:p>", doc, re.DOTALL)
        self.assertEqual(len(paras), 1, msg="left-aligned paragraphs must not be split")
        self.assertRegex(doc, r"<[^>]*:br\b[^>]*/?>", msg="soft break should survive")

    def test_relax_justified_preserves_page_break(self) -> None:
        """Page breaks (`<w:br w:type="page"/>`) are NOT treated as soft breaks."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="both"/></w:pPr>
<w:r><w:t>before</w:t><w:br w:type="page"/><w:t>after</w:t></w:r>
</w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        paras = re.findall(r"<[^>]*:p\b[^>]*>.*?</[^>]*:p>", doc, re.DOTALL)
        self.assertEqual(len(paras), 1, msg="page breaks must not trigger paragraph split")
        self.assertRegex(doc, r'<[^>]*:br\s+[^>]*:type="page"')

    def test_relax_justified_idempotent(self) -> None:
        """Running twice produces the same output as running once."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="both"/></w:pPr>
<w:r><w:t>aaa</w:t><w:br/><w:t>bbb</w:t><w:br/><w:t>ccc</w:t></w:r>
</w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        once = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        twice = relax_justified_softbreak_paragraphs_in_docx(once)
        self.assertEqual(
            _read_document_xml(once),
            _read_document_xml(twice),
            msg="second pass should be a no-op",
        )
        paras = re.findall(
            r"<[^>]*:p\b[^>]*>.*?</[^>]*:p>", _read_document_xml(once), re.DOTALL
        )
        self.assertEqual(len(paras), 3, msg="two soft breaks should yield three paragraphs")

    def test_strip_mau_04_removes_section_between_page_breaks(self) -> None:
        """Body order: mau_03 sig, page-break, letterhead table, « Mẫu số 04 », content,
        page-break, « Bản cam kết ». Strip should drop everything from the leading
        page-break paragraph through the last Mẫu số 04 content paragraph (inclusive),
        keeping the trailing page-break paragraph that opens « Bản cam kết »."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>{{ mau_03.tac_gia_chinh_ky }}</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:tbl><w:tr><w:tc><w:p><w:r><w:t>BỘ Y TẾ</w:t></w:r></w:p></w:tc></w:tr></w:tbl>
<w:p><w:r><w:t>Mẫu số 04</w:t></w:r></w:p>
<w:p><w:r><w:t>PHIẾU ĐÁNH GIÁ SÁNG KIẾN</w:t></w:r></w:p>
<w:p><w:r><w:t>1. Tên sáng kiến: {{ mau_04.ten_sang_kien }}</w:t></w:r></w:p>
<w:p><w:r><w:t>Kết luận: {{ mau_04.ket_luan }}</w:t></w:r></w:p>
<w:p><w:r><w:t>{{ mau_04.thanh_vien_hoi_dong }}</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM</w:t></w:r></w:p>
<w:p><w:r><w:t>BẢN CAM KẾT</w:t></w:r></w:p>
<w:sectPr/>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = strip_mau_04_evaluation_section_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertNotIn("Mẫu số 04", doc)
        self.assertNotIn("PHIẾU ĐÁNH GIÁ", doc)
        self.assertNotIn("mau_04", doc)
        # The leading page break + letterhead + content are gone, but the trailing
        # page-break paragraph (now the only page break) must survive so Bản cam kết
        # still starts on its own page.
        page_breaks = re.findall(r'<[^>]*:br\s+[^>]*:type="page"', doc)
        self.assertEqual(len(page_breaks), 1, msg=f"expected 1 page break, got {len(page_breaks)}: {doc!r}")
        self.assertIn("mau_03.tac_gia_chinh_ky", doc)
        self.assertIn("BẢN CAM KẾT", doc)
        self.assertIn("CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM", doc)
        # sectPr must survive the trim.
        self.assertRegex(doc, r"<[^>]*:sectPr")

    def test_strip_mau_04_is_idempotent(self) -> None:
        """Second pass over an already-stripped document is a no-op."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>mau_03 end</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>Mẫu số 04</w:t></w:r></w:p>
<w:p><w:r><w:t>content</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>BẢN CAM KẾT</w:t></w:r></w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        once = strip_mau_04_evaluation_section_in_docx(_wrap_doc_in_zip(doc_xml))
        twice = strip_mau_04_evaluation_section_in_docx(once)
        self.assertEqual(_read_document_xml(once), _read_document_xml(twice))
        self.assertNotIn("Mẫu số 04", _read_document_xml(once))

    def test_strip_mau_04_noop_when_marker_missing(self) -> None:
        """Documents that don't carry the « Mẫu số 04 » header are left untouched."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>only mau_03</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>BẢN CAM KẾT</w:t></w:r></w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = strip_mau_04_evaluation_section_in_docx(_wrap_doc_in_zip(doc_xml))
        after = _read_document_xml(out)
        # Allow whitespace / declaration differences from ElementTree round-trip; the
        # human-readable text content must be unchanged, so compare by content
        # rather than against the input XML byte-for-byte.
        for needle in ("only mau_03", "BẢN CAM KẾT"):
            self.assertIn(needle, after)
        self.assertNotIn("Mẫu số 04", after)

    def test_strip_mau_04_bails_out_without_leading_page_break(self) -> None:
        """If there's no page break before the Mẫu số 04 header (malformed template),
        leave the document alone instead of removing the previous section by mistake."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>previous section content</w:t></w:r></w:p>
<w:p><w:r><w:t>Mẫu số 04</w:t></w:r></w:p>
<w:p><w:r><w:t>{{ mau_04.ten_sang_kien }}</w:t></w:r></w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = strip_mau_04_evaluation_section_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertIn("previous section content", doc)
        self.assertIn("Mẫu số 04", doc, msg="strip must not run when leading page break is missing")

    def test_collapse_empty_pagebreak_before_table_uses_pagebreakbefore(self) -> None:
        """An empty paragraph that hosts only ``<w:br w:type="page"/>`` followed by a
        table is removed; the first paragraph in the first cell of the table gets
        ``<w:pageBreakBefore/>`` so the table anchors to a new page without an
        intervening empty body paragraph."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>previous content</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:tbl>
<w:tr>
<w:tc><w:p><w:r><w:t>letterhead cell</w:t></w:r></w:p></w:tc>
</w:tr>
</w:tbl>
<w:p><w:r><w:t>next section paragraph</w:t></w:r></w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = collapse_empty_page_break_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertNotRegex(
            doc, r'<[^>]*:br\s+[^>]*:type="page"',
            msg="inline <w:br w:type=\"page\"/> should be replaced",
        )
        self.assertRegex(doc, r"<[^>]*:pageBreakBefore")
        # The empty page-break paragraph is gone but original content survives.
        self.assertIn("previous content", doc)
        self.assertIn("letterhead cell", doc)
        self.assertIn("next section paragraph", doc)

    def test_collapse_empty_pagebreak_before_paragraph(self) -> None:
        """Empty page-break paragraph followed by a non-empty paragraph: the empty
        paragraph is removed and ``<w:pageBreakBefore/>`` is added to the next paragraph
        so it starts on a new page."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>A</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:pPr><w:jc w:val="center"/></w:pPr><w:r><w:t>B</w:t></w:r></w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = collapse_empty_page_break_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        # Exactly two body paragraphs left (empty break paragraph collapsed).
        paras = re.findall(r"<[^>]*:p\b[^>]*>.*?</[^>]*:p>", doc, re.DOTALL)
        self.assertEqual(len(paras), 2, msg=f"expected 2 paragraphs, got {len(paras)}: {doc!r}")
        # The B paragraph keeps its center alignment AND gains pageBreakBefore.
        # "B<" locates the text node "B" directly followed by its closing tag.
        b_para = next(p for p in paras if "B<" in p)
        self.assertIn("pageBreakBefore", b_para)
        self.assertRegex(b_para, r'<[^>]*:jc\s+[^>]*:val="center"')

    def test_collapse_empty_pagebreak_idempotent(self) -> None:
        """Second pass produces the same output as first pass."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>A</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>B</w:t></w:r></w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        once = collapse_empty_page_break_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        twice = collapse_empty_page_break_paragraphs_in_docx(once)
        self.assertEqual(_read_document_xml(once), _read_document_xml(twice))
        # And exactly one pageBreakBefore in the result (not double-registered).
        pbb_count = len(re.findall(r"<[^>]*:pageBreakBefore", _read_document_xml(once)))
        self.assertEqual(pbb_count, 1)

    def test_collapse_empty_pagebreak_preserves_text_carrying_breaks(self) -> None:
        """A paragraph that carries real text *and* an inline page break (rare; usually
        Word-edited) must not be collapsed: dropping the text would lose content."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>visible text</w:t><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>after</w:t></w:r></w:p>
</w:body></w:document>""".encode(
            "utf-8"
        )
        out = collapse_empty_page_break_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertRegex(doc, r'<[^>]*:br\s+[^>]*:type="page"', msg="break must survive")
        self.assertIn("visible text", doc)
        self.assertIn("after", doc)
        self.assertNotIn("pageBreakBefore", doc)

    def test_force_times_new_roman_styles(self) -> None:
        """Every Calibri reference in ``word/styles.xml`` is gone and Times New Roman
        appears in the rewritten part (package has no ``word/document.xml``)."""
        styles_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:docDefaults><w:rPrDefault><w:rPr><w:sz w:val="26"/></w:rPr></w:rPrDefault></w:docDefaults>
<w:style w:styleId="Heading1"><w:rPr><w:rFonts w:ascii="Calibri" w:hAnsi="Calibri" w:cs="Calibri" w:eastAsia="Calibri"/></w:rPr></w:style>
</w:styles>"""
        out = force_times_new_roman_in_styles_docx(
            self._zip_parts({"word/styles.xml": styles_xml})
        )
        st = self._read_part(out, "word/styles.xml")
        self.assertNotIn("Calibri", st)
        self.assertIn("Times New Roman", st)


# Allow running this file directly as a script: unittest.main() loads and runs
# the TestCase classes defined in this module.
if __name__ == "__main__":
    unittest.main()