679 lines
33 KiB
Python
679 lines
33 KiB
Python
"""Tests for OOXML normalization used after docxtpl render."""
|
|
|
|
from __future__ import annotations
|
|
|
|
import io
|
|
import re
|
|
import unittest
|
|
import zipfile
|
|
|
|
from src.be01.docx_normalize import (
|
|
collapse_empty_page_break_paragraphs_in_docx,
|
|
force_times_new_roman_in_styles_docx,
|
|
move_signature_date_to_top_row,
|
|
normalize_bo_y_te_header_lines,
|
|
relax_justified_softbreak_paragraphs_in_docx,
|
|
shift_selected_header_lines_left,
|
|
strip_mau_04_evaluation_section_in_docx,
|
|
strip_table_row_height_rules_from_docx,
|
|
)
|
|
|
|
|
|
def _wrap_doc_in_zip(doc_xml: bytes) -> bytes:
    """Package *doc_xml* as a minimal in-memory DOCX-style zip archive.

    The archive holds two deflate-compressed entries, written in this order:
    ``word/document.xml`` (the given bytes) and a stub ``[Content_Types].xml``.

    Args:
        doc_xml: Raw bytes stored as ``word/document.xml``.

    Returns:
        The bytes of the resulting zip archive.
    """
    content_types = b'<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"/>'
    entries = (
        ("word/document.xml", doc_xml),
        ("[Content_Types].xml", content_types),
    )
    package = io.BytesIO()
    with zipfile.ZipFile(package, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
        for part_name, payload in entries:
            archive.writestr(part_name, payload)
    return package.getvalue()
|
|
|
|
|
|
def _read_document_xml(docx_bytes: bytes) -> str:
    """Return the ``word/document.xml`` entry of a zipped DOCX as UTF-8 text.

    Args:
        docx_bytes: Bytes of a zip archive containing ``word/document.xml``.

    Returns:
        The entry's contents decoded as UTF-8.
    """
    with zipfile.ZipFile(io.BytesIO(docx_bytes)) as archive:
        raw_part = archive.read("word/document.xml")
    return raw_part.decode("utf-8")
|
|
|
|
|
|
class DocxNormalizeTests(unittest.TestCase):
    """Tests for the DOCX normalization helpers from ``src.be01.docx_normalize``.

    Every test builds a tiny in-memory package, runs one normalization helper
    over it and inspects the serialized XML with substring / regex checks.
    Several assertions look for an ``ns0:`` prefix in the output.
    NOTE(review): that prefix presumably comes from an ElementTree round-trip
    inside the helpers under test -- confirm against ``docx_normalize``.
    """

    @staticmethod
    def _zip_parts(parts: dict[str, bytes]) -> bytes:
        """Zip *parts* (name -> payload, in order) plus a stub ``[Content_Types].xml``.

        Used by tests that need parts other than ``word/document.xml`` alone
        (for example ``word/styles.xml`` or ``word/settings.xml``).
        """
        buf = io.BytesIO()
        with zipfile.ZipFile(buf, "w", zipfile.ZIP_DEFLATED) as z:
            for part_name, payload in parts.items():
                z.writestr(part_name, payload)
            z.writestr(
                "[Content_Types].xml",
                b'<?xml version="1.0"?><Types xmlns="http://schemas.openxmlformats.org/package/2006/content-types"/>',
            )
        return buf.getvalue()

    @staticmethod
    def _read_part(docx_bytes: bytes, part_name: str) -> str:
        """Return the entry *part_name* of the zipped package decoded as UTF-8."""
        with zipfile.ZipFile(io.BytesIO(docx_bytes)) as z:
            return z.read(part_name).decode("utf-8")

    def test_strip_tr_height_removes_self_closing(self) -> None:
        """A self-closing ``<w:trHeight .../>`` is removed together with its value."""
        xml = (
            b'<?xml version="1.0" encoding="UTF-8"?><w:document '
            b'xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">'
            b"<w:tbl><w:tr><w:trPr>"
            b'<w:trHeight w:val="720" w:hRule="atLeast"/>'
            b"</w:trPr><w:tc><w:p><w:r><w:t>a</w:t></w:r></w:p></w:tc></w:tr></w:tbl>"
            b"</w:document>"
        )
        out = strip_table_row_height_rules_from_docx(_wrap_doc_in_zip(xml))
        doc = _read_document_xml(out)
        self.assertNotIn("trHeight", doc)
        self.assertNotIn("720", doc)

    def test_normalize_bo_y_te_strips_ministry_bold_centers(self) -> None:
        """Ministry line loses bold and is centered; university line is bold Times New Roman."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/><w:ind w:left="3600"/></w:pPr>
<w:r><w:rPr><w:b w:val="1"/><w:rFonts w:ascii="Arial"/></w:rPr><w:t>BỘ Y TẾ</w:t></w:r>
</w:p>
<w:p><w:pPr><w:jc w:val="center"/><w:ind w:left="-150"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>ĐẠI HỘC Y DƯỢC</w:t><w:br/><w:t>THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        phase1 = shift_selected_header_lines_left(_wrap_doc_in_zip(doc_xml))
        out = normalize_bo_y_te_header_lines(phase1)
        doc = _read_document_xml(out)
        ministry = re.search(r"<[^>]*:p\b[^>]*>.*?BỘ Y TẾ.*?</[^>]*:p>", doc, re.DOTALL | re.IGNORECASE)
        self.assertIsNotNone(ministry)
        assert ministry is not None  # narrows the Optional for type checkers
        block = ministry.group(0)
        # Only the markup *before* the ministry text is checked for a bold element.
        self.assertNotIn("ns0:b", block.split("BỘ Y TẾ")[0])
        self.assertIn('val="center"', block)
        uni = re.search(r"<[^>]*:p\b[^>]*>.*?ĐẠI HỘC Y DƯỢC.*?</[^>]*:p>", doc, re.DOTALL | re.IGNORECASE)
        self.assertIsNotNone(uni)
        assert uni is not None  # narrows the Optional for type checkers
        self.assertIn("ns0:b", uni.group(0))
        self.assertIn("Times New Roman", uni.group(0))

    def test_university_letterhead_two_paragraphs_bold_centered(self) -> None:
        """Cover may use two paragraphs instead of one line break."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>ĐẠI HỘC Y DƯỢC</w:t></w:r>
</w:p>
<w:p><w:pPr><w:jc w:val="right"/><w:ind w:left="-150"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        out = normalize_bo_y_te_header_lines(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        for label, needle in (
            ("dhyd", "ĐẠI HỘC Y DƯỢC"),
            ("tphcm", "THÀNH PHỐ HỒ CHÍ MINH"),
        ):
            blk = re.search(
                rf"<[^>]*:p\b[^>]*>.*?{re.escape(needle)}.*?</[^>]*:p>",
                doc,
                re.DOTALL | re.IGNORECASE,
            )
            self.assertIsNotNone(blk, msg=label)
            assert blk is not None  # narrows the Optional for type checkers
            b = blk.group(0)
            self.assertIn("ns0:b", b, msg=label)
            self.assertIn('val="center"', b, msg=label)

    def test_university_letterhead_one_paragraph_gets_soft_break_inserted(self) -> None:
        """When both letterhead phrases share one paragraph on a single visual line, a
        soft <w:br/> is inserted before the city line so the cover renders on two lines.

        Also asserts the runs end up bold + upright (no italic) + Times New Roman."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/></w:pPr>
<w:r><w:rPr><w:i w:val="1"/><w:rFonts w:ascii="Arial"/></w:rPr><w:t>ĐẠI HỘC Y DƯỢC THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        out = normalize_bo_y_te_header_lines(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        # A soft break should now sit between the two phrases.
        self.assertRegex(
            doc,
            r"ĐẠI HỘC Y DƯỢC.*?<[^>]*:br[^>]*/?>.*?THÀNH PHỐ HỒ CHÍ MINH",
        )
        # Paragraph is centered, runs are bold + not italic + Times New Roman.
        self.assertIn('val="center"', doc)
        self.assertIn("ns0:b", doc)
        self.assertIn('ns0:i ns0:val="0"', doc)
        self.assertIn("Times New Roman", doc)

    def test_university_letterhead_soft_break_idempotent(self) -> None:
        """Running normalize twice should not stack additional <w:br/> elements between
        the letterhead phrases."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>ĐẠI HỘC Y DƯỢC THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        once = normalize_bo_y_te_header_lines(_wrap_doc_in_zip(doc_xml))
        twice = normalize_bo_y_te_header_lines(once)
        doc = _read_document_xml(twice)
        br_count = len(re.findall(r"<[^>]*:br\b[^>]*/?>", doc))
        self.assertEqual(br_count, 1, msg=f"expected exactly one <w:br/>, got {br_count}: {doc!r}")

    def test_first_page_scope_second_bo_te_unchanged(self) -> None:
        """Only the cover « BỘ Y TẾ » is stripped of bold; a later duplicate keeps bold."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:rPr><w:b w:val="1"/></w:rPr><w:t>BỘ Y TẾ</w:t></w:r></w:p>
<w:p><w:r><w:t>ĐẠI HỘC Y DƯỢC</w:t><w:br/><w:t>THÀNH PHỐ HỒ CHÍ MINH</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:rPr><w:b w:val="1"/></w:rPr><w:t>BỘ Y TẾ</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = normalize_bo_y_te_header_lines(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        paras = re.findall(r"<[^>]*:p\b[^>]*>.*?</[^>]*:p>", doc, re.DOTALL | re.IGNORECASE)
        self.assertEqual(len(paras), 4, msg="expected 4 paragraphs")
        first_bo = paras[0]
        late_bo = paras[3]
        self.assertNotIn("ns0:b", first_bo.split("BỘ Y TẾ")[0])
        self.assertIn("ns0:b", late_bo)

    def test_move_signature_date_creates_full_width_top_row(self) -> None:
        """The date paragraph is lifted into a single-cell top row spanning every column."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:tbl>
<w:tblGrid><w:gridCol w:w="4702"/><w:gridCol w:w="4702"/></w:tblGrid>
<w:tr>
<w:tc><w:p><w:r><w:t>LÃNH ĐẠO ĐƠN VỊ</w:t></w:r></w:p><w:p><w:r><w:t>(Ký, ghi rõ họ tên)</w:t></w:r></w:p></w:tc>
<w:tc><w:p><w:r><w:t>Tp. Hồ Chí Minh, ngày 11 tháng 5 năm 2026</w:t></w:r></w:p><w:p><w:r><w:t>Tác giả sáng kiến</w:t></w:r></w:p></w:tc>
</w:tr>
</w:tbl>
</w:body></w:document>""".encode("utf-8")
        out = move_signature_date_to_top_row(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)

        rows = re.findall(r"<[^>]*:tr\b[^>]*>.*?</[^>]*:tr>", doc, re.DOTALL)
        self.assertEqual(len(rows), 2, msg=f"expected 2 rows after lift, got: {doc!r}")

        first_row, second_row = rows
        # Top row: single cell, gridSpan=2, contains the date, right-aligned.
        self.assertEqual(first_row.count("<ns0:tc>") + first_row.count("<ns0:tc "), 1)
        self.assertRegex(first_row, r'<[^>]*:gridSpan\s+[^>]*:val="2"')
        self.assertIn("Tp. Hồ Chí Minh, ngày 11 tháng 5 năm 2026", first_row)
        self.assertRegex(first_row, r'<[^>]*:jc\s+[^>]*:val="right"')

        # Second row: original 2 cells. Right cell starts with "Tác giả sáng kiến"
        # (no date paragraph anymore), so it aligns with "LÃNH ĐẠO ĐƠN VỊ".
        self.assertNotIn("Tp. Hồ Chí Minh, ngày", second_row)
        self.assertIn("LÃNH ĐẠO ĐƠN VỊ", second_row)
        self.assertIn("Tác giả sáng kiến", second_row)

    def test_move_signature_date_is_idempotent(self) -> None:
        """A second pass over an already-lifted table is a no-op (still exactly 2 rows)."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:tbl>
<w:tblGrid><w:gridCol w:w="4702"/><w:gridCol w:w="4702"/></w:tblGrid>
<w:tr>
<w:tc><w:p><w:r><w:t>LÃNH ĐẠO ĐƠN VỊ</w:t></w:r></w:p></w:tc>
<w:tc><w:p><w:r><w:t>Tp. Hồ Chí Minh, ngày 11 tháng 5 năm 2026</w:t></w:r></w:p><w:p><w:r><w:t>Tác giả sáng kiến</w:t></w:r></w:p></w:tc>
</w:tr>
</w:tbl>
</w:body></w:document>""".encode("utf-8")
        once = move_signature_date_to_top_row(_wrap_doc_in_zip(doc_xml))
        twice = move_signature_date_to_top_row(once)
        doc = _read_document_xml(twice)
        rows = re.findall(r"<[^>]*:tr\b[^>]*>.*?</[^>]*:tr>", doc, re.DOTALL)
        self.assertEqual(len(rows), 2, msg=f"expected 2 rows after second pass, got: {doc!r}")
        date_hits = doc.count("Tp. Hồ Chí Minh, ngày")
        self.assertEqual(date_hits, 1, msg=f"date should appear exactly once, got {date_hits}")

    def test_move_signature_date_skips_table_without_date(self) -> None:
        """Tables that do not contain the date prefix are left untouched."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:tbl>
<w:tblGrid><w:gridCol w:w="4702"/><w:gridCol w:w="4702"/></w:tblGrid>
<w:tr><w:tc><w:p><w:r><w:t>cell A</w:t></w:r></w:p></w:tc><w:tc><w:p><w:r><w:t>cell B</w:t></w:r></w:p></w:tc></w:tr>
</w:tbl>
</w:body></w:document>""".encode("utf-8")
        out = move_signature_date_to_top_row(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        rows = re.findall(r"<[^>]*:tr\b[^>]*>.*?</[^>]*:tr>", doc, re.DOTALL)
        self.assertEqual(len(rows), 1, msg="non-signature tables must be left untouched")

    def test_relax_justified_splits_paragraph_at_soft_break_in_run(self) -> None:
        """Justified paragraph with a soft <w:br/> mid-run is split into two paragraphs.
        Both fragments keep <w:jc w:val="both"/> so the layout stays justified, and the
        line that used to be followed by the soft break (« first chunk ») becomes the
        last line of its own paragraph -> stops being stretched."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="both"/></w:pPr>
<w:r><w:rPr><w:rFonts w:ascii="Arial"/></w:rPr><w:t>first chunk</w:t><w:br/><w:t>second chunk</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        out = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertNotRegex(
            doc, r"<[^>]*:br\b(?![^>]*:type=\"page\")[^>]*/?>",
            msg="soft <w:br/> should be consumed by the split",
        )
        paras = re.findall(r"<[^>]*:p\b[^>]*>.*?</[^>]*:p>", doc, re.DOTALL)
        self.assertEqual(len(paras), 2, msg=f"expected 2 paragraphs after split: {doc!r}")
        for p in paras:
            self.assertRegex(p, r'<[^>]*:jc\s+[^>]*:val="both"')
            self.assertIn("Arial", p)  # run properties preserved on both fragments
        self.assertIn("first chunk", paras[0])
        self.assertIn("second chunk", paras[1])
        self.assertNotIn("second chunk", paras[0])

    def test_relax_justified_distribute_becomes_both(self) -> None:
        """`distribute` stretches every line including the last; rewrite it to `both`."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="distribute"/></w:pPr><w:r><w:t>solo line</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertNotIn('val="distribute"', doc)
        self.assertRegex(doc, r'<[^>]*:jc\s+[^>]*:val="both"')

    def test_relax_justified_rewrites_distribute_in_styles_xml(self) -> None:
        """Paragraph styles may use ``distribute``; rewrite so body text is justified like Word ``both``."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>x</w:t></w:r></w:p></w:body></w:document>""".encode("utf-8")
        styles_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:style w:type="paragraph" w:styleId="BodyText">
<w:pPr><w:jc w:val="distribute"/></w:pPr>
</w:style>
</w:styles>""".encode("utf-8")
        package = self._zip_parts(
            {"word/document.xml": doc_xml, "word/styles.xml": styles_xml}
        )
        out = relax_justified_softbreak_paragraphs_in_docx(package)
        styles = self._read_part(out, "word/styles.xml")
        self.assertNotIn('val="distribute"', styles)
        self.assertIn('val="both"', styles)

    def test_relax_justified_merges_do_not_expand_shift_return_in_settings(self) -> None:
        """Compatibility flag so lines ending in soft breaks are not fully stretched when justified."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body><w:p><w:r><w:t>x</w:t></w:r></w:p></w:body></w:document>""".encode("utf-8")
        settings_xml = b"""<?xml version="1.0" encoding="UTF-8" standalone="yes"?>
<w:settings xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:zoom w:percent="100"/>
</w:settings>"""
        package = self._zip_parts(
            {"word/document.xml": doc_xml, "word/settings.xml": settings_xml}
        )
        out = relax_justified_softbreak_paragraphs_in_docx(package)
        settings = self._read_part(out, "word/settings.xml")
        self.assertIn("doNotExpandShiftReturn", settings)
        self.assertRegex(settings, r'doNotExpandShiftReturn[^>]*val="1"')

    def test_relax_justified_preserves_non_justified_paragraphs(self) -> None:
        """Soft breaks in non-justified paragraphs are left alone (no surprise splits)."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="left"/></w:pPr>
<w:r><w:t>line1</w:t><w:br/><w:t>line2</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        out = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        paras = re.findall(r"<[^>]*:p\b[^>]*>.*?</[^>]*:p>", doc, re.DOTALL)
        self.assertEqual(len(paras), 1, msg="left-aligned paragraphs must not be split")
        self.assertRegex(doc, r"<[^>]*:br\b[^>]*/?>", msg="soft break should survive")

    def test_relax_justified_preserves_page_break(self) -> None:
        """Page breaks (`<w:br w:type="page"/>`) are NOT treated as soft breaks."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="both"/></w:pPr>
<w:r><w:t>before</w:t><w:br w:type="page"/><w:t>after</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        out = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        paras = re.findall(r"<[^>]*:p\b[^>]*>.*?</[^>]*:p>", doc, re.DOTALL)
        self.assertEqual(len(paras), 1, msg="page breaks must not trigger paragraph split")
        self.assertRegex(doc, r'<[^>]*:br\s+[^>]*:type="page"')

    def test_relax_justified_idempotent(self) -> None:
        """Running twice produces the same output as running once."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:pPr><w:jc w:val="both"/></w:pPr>
<w:r><w:t>aaa</w:t><w:br/><w:t>bbb</w:t><w:br/><w:t>ccc</w:t></w:r>
</w:p>
</w:body></w:document>""".encode("utf-8")
        once = relax_justified_softbreak_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        twice = relax_justified_softbreak_paragraphs_in_docx(once)
        self.assertEqual(
            _read_document_xml(once),
            _read_document_xml(twice),
            msg="second pass should be a no-op",
        )
        paras = re.findall(
            r"<[^>]*:p\b[^>]*>.*?</[^>]*:p>", _read_document_xml(once), re.DOTALL
        )
        self.assertEqual(len(paras), 3, msg="two soft breaks should yield three paragraphs")

    def test_strip_mau_04_removes_section_between_page_breaks(self) -> None:
        """Body order: mau_03 sig, page-break, letterhead table, « Mẫu số 04 », content,
        page-break, « Bản cam kết ». Strip should drop everything from the leading
        page-break paragraph through the last Mẫu số 04 content paragraph (inclusive),
        keeping the trailing page-break paragraph that opens « Bản cam kết »."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>{{ mau_03.tac_gia_chinh_ky }}</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:tbl><w:tr><w:tc><w:p><w:r><w:t>BỘ Y TẾ</w:t></w:r></w:p></w:tc></w:tr></w:tbl>
<w:p><w:r><w:t>Mẫu số 04</w:t></w:r></w:p>
<w:p><w:r><w:t>PHIẾU ĐÁNH GIÁ SÁNG KIẾN</w:t></w:r></w:p>
<w:p><w:r><w:t>1. Tên sáng kiến: {{ mau_04.ten_sang_kien }}</w:t></w:r></w:p>
<w:p><w:r><w:t>Kết luận: {{ mau_04.ket_luan }}</w:t></w:r></w:p>
<w:p><w:r><w:t>{{ mau_04.thanh_vien_hoi_dong }}</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM</w:t></w:r></w:p>
<w:p><w:r><w:t>BẢN CAM KẾT</w:t></w:r></w:p>
<w:sectPr/>
</w:body></w:document>""".encode("utf-8")
        out = strip_mau_04_evaluation_section_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertNotIn("Mẫu số 04", doc)
        self.assertNotIn("PHIẾU ĐÁNH GIÁ", doc)
        self.assertNotIn("mau_04", doc)
        # The leading page break + letterhead + content are gone, but the trailing
        # page-break paragraph (now the only page break) must survive so Bản cam kết
        # still starts on its own page.
        page_breaks = re.findall(r'<[^>]*:br\s+[^>]*:type="page"', doc)
        self.assertEqual(len(page_breaks), 1, msg=f"expected 1 page break, got {len(page_breaks)}: {doc!r}")
        self.assertIn("mau_03.tac_gia_chinh_ky", doc)
        self.assertIn("BẢN CAM KẾT", doc)
        self.assertIn("CỘNG HOÀ XÃ HỘI CHỦ NGHĨA VIỆT NAM", doc)
        # sectPr must survive the trim.
        self.assertRegex(doc, r"<[^>]*:sectPr")

    def test_strip_mau_04_is_idempotent(self) -> None:
        """Second pass over an already-stripped document is a no-op."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>mau_03 end</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>Mẫu số 04</w:t></w:r></w:p>
<w:p><w:r><w:t>content</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>BẢN CAM KẾT</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        once = strip_mau_04_evaluation_section_in_docx(_wrap_doc_in_zip(doc_xml))
        twice = strip_mau_04_evaluation_section_in_docx(once)
        self.assertEqual(_read_document_xml(once), _read_document_xml(twice))
        self.assertNotIn("Mẫu số 04", _read_document_xml(once))

    def test_strip_mau_04_noop_when_marker_missing(self) -> None:
        """Documents that don't carry the « Mẫu số 04 » header are left untouched."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>only mau_03</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>BẢN CAM KẾT</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = strip_mau_04_evaluation_section_in_docx(_wrap_doc_in_zip(doc_xml))
        after = _read_document_xml(out)
        # Allow whitespace / declaration differences from ElementTree round-trip; the
        # human-readable text content must be unchanged.
        for needle in ("only mau_03", "BẢN CAM KẾT"):
            self.assertIn(needle, after)
        self.assertNotIn("Mẫu số 04", after)

    def test_strip_mau_04_bails_out_without_leading_page_break(self) -> None:
        """If there's no page break before the Mẫu số 04 header (malformed template),
        leave the document alone instead of removing the previous section by mistake."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>previous section content</w:t></w:r></w:p>
<w:p><w:r><w:t>Mẫu số 04</w:t></w:r></w:p>
<w:p><w:r><w:t>{{ mau_04.ten_sang_kien }}</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = strip_mau_04_evaluation_section_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertIn("previous section content", doc)
        self.assertIn("Mẫu số 04", doc, msg="strip must not run when leading page break is missing")

    def test_collapse_empty_pagebreak_before_table_uses_pagebreakbefore(self) -> None:
        """An empty paragraph that hosts only ``<w:br w:type="page"/>`` followed by a
        table is removed; the first paragraph in the first cell of the table gets
        ``<w:pageBreakBefore/>`` so the table anchors to a new page without an
        intervening empty body paragraph."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>previous content</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:tbl>
<w:tr>
<w:tc><w:p><w:r><w:t>letterhead cell</w:t></w:r></w:p></w:tc>
</w:tr>
</w:tbl>
<w:p><w:r><w:t>next section paragraph</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = collapse_empty_page_break_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertNotRegex(
            doc, r'<[^>]*:br\s+[^>]*:type="page"',
            msg="inline <w:br w:type=\"page\"/> should be replaced",
        )
        self.assertRegex(doc, r"<[^>]*:pageBreakBefore")
        # The empty page-break paragraph is gone but original content survives.
        self.assertIn("previous content", doc)
        self.assertIn("letterhead cell", doc)
        self.assertIn("next section paragraph", doc)

    def test_collapse_empty_pagebreak_before_paragraph(self) -> None:
        """Empty page-break paragraph followed by a non-empty paragraph: the empty
        paragraph is removed and ``<w:pageBreakBefore/>`` is added to the next paragraph
        so it starts on a new page."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>A</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:pPr><w:jc w:val="center"/></w:pPr><w:r><w:t>B</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = collapse_empty_page_break_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        # Exactly two body paragraphs left (empty break paragraph collapsed).
        paras = re.findall(r"<[^>]*:p\b[^>]*>.*?</[^>]*:p>", doc, re.DOTALL)
        self.assertEqual(len(paras), 2, msg=f"expected 2 paragraphs, got {len(paras)}: {doc!r}")
        # The B paragraph keeps its center alignment AND gains pageBreakBefore.
        # "B<" also covers the "B</" case, so a single substring check is enough.
        b_para = next(p for p in paras if "B<" in p)
        self.assertIn("pageBreakBefore", b_para)
        self.assertRegex(b_para, r'<[^>]*:jc\s+[^>]*:val="center"')

    def test_collapse_empty_pagebreak_idempotent(self) -> None:
        """Second pass produces the same output as first pass."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>A</w:t></w:r></w:p>
<w:p><w:r><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>B</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        once = collapse_empty_page_break_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        twice = collapse_empty_page_break_paragraphs_in_docx(once)
        self.assertEqual(_read_document_xml(once), _read_document_xml(twice))
        # And exactly one pageBreakBefore in the result (not double-registered).
        pbb_count = len(re.findall(r"<[^>]*:pageBreakBefore", _read_document_xml(once)))
        self.assertEqual(pbb_count, 1)

    def test_collapse_empty_pagebreak_preserves_text_carrying_breaks(self) -> None:
        """A paragraph that carries real text *and* an inline page break (rare; usually
        Word-edited) must not be collapsed: dropping the text would lose content."""
        doc_xml = """<?xml version="1.0" encoding="UTF-8"?>
<w:document xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:body>
<w:p><w:r><w:t>visible text</w:t><w:br w:type="page"/></w:r></w:p>
<w:p><w:r><w:t>after</w:t></w:r></w:p>
</w:body></w:document>""".encode("utf-8")
        out = collapse_empty_page_break_paragraphs_in_docx(_wrap_doc_in_zip(doc_xml))
        doc = _read_document_xml(out)
        self.assertRegex(doc, r'<[^>]*:br\s+[^>]*:type="page"', msg="break must survive")
        self.assertIn("visible text", doc)
        self.assertIn("after", doc)
        self.assertNotIn("pageBreakBefore", doc)

    def test_force_times_new_roman_styles(self) -> None:
        """``Calibri`` disappears from ``word/styles.xml`` and ``Times New Roman`` appears."""
        styles_xml = b"""<?xml version="1.0" encoding="UTF-8"?>
<w:styles xmlns:w="http://schemas.openxmlformats.org/wordprocessingml/2006/main">
<w:docDefaults><w:rPrDefault><w:rPr><w:sz w:val="26"/></w:rPr></w:rPrDefault></w:docDefaults>
<w:style w:styleId="Heading1"><w:rPr><w:rFonts w:ascii="Calibri" w:hAnsi="Calibri" w:cs="Calibri" w:eastAsia="Calibri"/></w:rPr></w:style>
</w:styles>"""
        # This package deliberately has no word/document.xml part.
        package = self._zip_parts({"word/styles.xml": styles_xml})
        out = force_times_new_roman_in_styles_docx(package)
        st = self._read_part(out, "word/styles.xml")
        self.assertNotIn("Calibri", st)
        self.assertIn("Times New Roman", st)
|
|
|
|
|
|
# Run the test suite when this module is executed directly as a script.
if __name__ == "__main__":
    unittest.main()
|