Files
sciagent/shared/src/initiative/convertDocxToPdfBlob.ts
T
Thinh Lam 688fac73e9
CI/CD / backend (push) Failing after 2m8s
CI/CD / frontend (push) Failing after 1m40s
CI/CD / deploy (push) Has been skipped
sciagent code + Gitea Actions CI/CD
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
2026-06-30 09:38:30 +07:00

289 lines
10 KiB
TypeScript
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
import { renderAsync } from 'docx-preview';
import html2canvas from 'html2canvas';
import jsPDF from 'jspdf';
import { injectDocxJustifyMitigationStyles } from '../lib/docxJustifyMitigationCss';
import { injectDocxTableReflowStyles } from '../lib/docxTableReflow';
import { SHARED_DOCX_OFFICIAL_FORM_RENDER_OPTIONS } from '../lib/sharedDocxOfficialFormRenderOptions';
/** Millimetres per CSS pixel (25.4 mm per inch over 96 CSS px per inch); used to size PDF pages from captured canvases. */
const PX_TO_MM = 25.4 / 96;
/**
 * Waits until every `<img>` inside `container` has either finished loading or failed.
 *
 * Images that are already `complete` are skipped. For the rest, `img.decode()` is preferred
 * when available; if it rejects, the image is re-checked before falling back to `load` /
 * `error` listeners. Failures are treated as "done" so one broken image cannot block capture.
 *
 * @param container Element whose descendant images should be awaited.
 * @returns A promise that resolves once all images have settled (it never rejects).
 */
async function waitForRenderableAssets(container: HTMLElement): Promise<void> {
  const images = Array.from(container.querySelectorAll<HTMLImageElement>('img'));
  if (images.length === 0) return;
  await Promise.all(
    images.map(async (img) => {
      // Read through a closure so the second check below is a fresh read of the live property.
      const hasSettled = (): boolean => img.complete;
      if (hasSettled()) return;
      if (typeof img.decode === 'function') {
        try {
          await img.decode();
          return;
        } catch {
          // decode() rejected; decide below whether waiting for events still makes sense.
        }
        // A broken image has already fired its `error` event by the time decode() rejects,
        // so listeners attached now would never run. `complete` is true in that state.
        if (hasSettled()) return;
      }
      await new Promise<void>((resolve) => {
        const done = () => resolve();
        img.addEventListener('load', done, { once: true });
        img.addEventListener('error', done, { once: true });
      });
    }),
  );
}
/**
 * Shell width in CSS px shared by the visible preview and the off-screen capture host.
 * Matches `docx-to-pdf-demo.html`: `.shell { max-width }` + `.preview { padding }`.
 * Visible preview and off-screen capture use the same numbers so line breaks and tables match.
 */
export const DOCX_PDF_PREVIEW_SHELL_MAX_WIDTH_PX = 980;
/** Inner padding in CSS px applied to the capture host (the `.preview { padding }` counterpart). */
export const DOCX_PDF_PREVIEW_INNER_PADDING_PX = 28;
/** Optional tuning values and progress hooks accepted by `convertDocxToPdfBlob`. */
export type ConvertDocxToPdfOptions = {
/** html2canvas scale. Default 2. */
renderScale?: number;
/** JPEG quality from 0 to 1, used only when `losslessImages` is not set. Default 0.95. */
imageQuality?: number;
/** Embed PNG page images in the PDF instead of JPEG. Default false. */
losslessImages?: boolean;
/** Fired before `renderAsync` (docx-preview). */
onPhaseRendering?: () => void;
/** Fired after layout settle, before per-page capture; receives the number of page sections found. */
onPhaseCapturing?: (pageCount: number) => void;
/** After each page is rasterised (`current` is 1-based). */
onCaptureProgress?: (current: number, total: number) => void;
/**
 * Extra ms to wait after layout settles (tables/fonts). Default 120, mirroring docx-to-pdf-demo.html.
 */
layoutSettleExtraMs?: number;
/** Optional callback with render-layout signals for QA/advisory UI; invoked once, before capture starts. */
onLayoutAnalysed?: (insights: ConvertDocxToPdfLayoutInsights) => void;
};
/** Advisory signals about the rendered layout, reported through `onLayoutAnalysed`. */
export type ConvertDocxToPdfLayoutInsights = {
/** True when the rendered page appears to include absolute-positioned drawing/shape elements. */
hasFloatingShapeCandidates: boolean;
/** True when Times New Roman is unavailable; capture CSS still uses a serif stack. */
appliedTimesFallbackOverride: boolean;
};
/**
 * Reports whether the browser says it can render 16px "Times New Roman".
 *
 * Answers `true` whenever the question cannot be asked (no `document.fonts`, no usable
 * `check` method, or `check` throws), so an unsupported API is never reported as a missing font.
 */
function isTimesNewRomanAvailable(): boolean {
  const fontFaceSet = document.fonts;
  const canQuery = Boolean(fontFaceSet) && typeof fontFaceSet.check === 'function';
  if (!canQuery) {
    return true;
  }

  let available = true;
  try {
    available = fontFaceSet.check('16px "Times New Roman"');
  } catch {
    available = true;
  }
  return available;
}
/**
 * Adds a one-off `<style>` element to `scope` that forces a Times-like serif font stack on
 * everything under `.docx-wrapper`, so the rasterised PDF keeps Word-style typography
 * (html2canvas often used a system sans otherwise).
 *
 * Does nothing when the marker style is already present. Tags `scope` with
 * `data-docx-capture-root` if it has no value yet, because the CSS selectors key off it.
 */
function injectCaptureTypographyStyles(scope: HTMLElement): void {
  const existingStyle = scope.querySelector('style[data-docx-capture-typography="1"]');
  if (existingStyle) {
    return;
  }

  const isTaggedAsRoot = Boolean(scope.dataset.docxCaptureRoot);
  if (!isTaggedAsRoot) {
    scope.dataset.docxCaptureRoot = '1';
  }

  // Leading and trailing empty entries reproduce the newline at each end of the stylesheet text.
  const cssRules = [
    '',
    '[data-docx-capture-root] .docx-wrapper,',
    '[data-docx-capture-root] .docx-wrapper * {',
    'font-family: "Times New Roman", Times, "Liberation Serif", "Noto Serif", serif !important;',
    '}',
    '',
  ].join('\n');

  const typographyStyle = document.createElement('style');
  typographyStyle.setAttribute('data-docx-capture-typography', '1');
  typographyStyle.textContent = cssRules;
  scope.appendChild(typographyStyle);
}
/**
 * docx-preview can keep « BỘ Y TẾ » bold (style / strong) and may leak italic from inherited
 * styles even when the OOXML run is regular. This normalises the letterhead paragraphs of the
 * first rendered `section.docx` so the rasterised PDF matches the official template:
 *
 * - « BỘ Y TẾ » → regular (400), upright, centred
 * - « ĐẠI HỌC Y DƯỢC » / « THÀNH PHỐ HỒ CHÍ MINH » → bold (700), upright, centred
 *
 * The source DOCX contains the typo « ĐẠI HỘC »; both spellings are matched so this keeps
 * working if the template is ever corrected.
 *
 * Paragraph text and the reference strings are compared in Unicode NFC form, so a document
 * that stores the same letters as base characters plus combining marks still matches.
 *
 * @param root Element that contains the docx-preview output; nothing happens when it has no
 *   `section.docx` descendant.
 */
function normalizeOfficialFormCoverForPdfCapture(root: HTMLElement): void {
  const section = root.querySelector<HTMLElement>('section.docx');
  if (!section) return;

  // Canonical composition makes precomposed and decomposed Vietnamese text compare equal.
  const toNfc = (text: string): string => text.normalize('NFC');
  const ministryLine = toNfc('BỘ Y TẾ');
  const universityLine = toNfc('ĐẠI HỌC Y DƯỢC');
  const universityLineTypo = toNfc('ĐẠI HỘC Y DƯỢC');
  const cityLine = toNfc('THÀNH PHỐ HỒ CHÍ MINH');

  // Applies weight and upright style to the paragraph and every descendant, using `!important`
  // so inline run styles emitted by docx-preview cannot override it.
  const setLetterheadTypography = (el: HTMLElement, weight: '400' | '700'): void => {
    el.style.setProperty('font-weight', weight, 'important');
    el.style.setProperty('font-style', 'normal', 'important');
    el.querySelectorAll<HTMLElement>('*').forEach((child) => {
      child.style.setProperty('font-weight', weight, 'important');
      child.style.setProperty('font-style', 'normal', 'important');
    });
  };
  const hasUniversity = (line: string): boolean =>
    line.includes(universityLine) || line.includes(universityLineTypo);
  const isUniversityOnly = (line: string): boolean =>
    line === universityLine || line === universityLineTypo;

  for (const el of Array.from(section.querySelectorAll<HTMLElement>('p'))) {
    // Collapse whitespace runs so line breaks / double spaces in the run text do not matter.
    const line = toNfc(el.textContent ?? '').replace(/\s+/g, ' ').trim();
    if (line === ministryLine) {
      el.style.setProperty('text-align', 'center', 'important');
      setLetterheadTypography(el, '400');
      continue;
    }
    // The university name may share a paragraph with the city or sit on its own line.
    const isUniversity =
      (hasUniversity(line) && line.includes(cityLine)) ||
      isUniversityOnly(line) ||
      line === cityLine;
    if (isUniversity) {
      el.style.setProperty('text-align', 'center', 'important');
      setLetterheadTypography(el, '700');
    }
  }
}
/**
 * Heuristic check for floating drawings/shapes in the rendered document.
 *
 * Returns `true` as soon as an explicit marker is found (`.docx-drawing`, `[data-anchor]`,
 * `[data-wrap]`, or an inline `position:absolute` style). Otherwise it scans every descendant
 * of `.docx` and reports `true` for the first element whose computed position is `absolute`
 * and that either contains an `svg` / `canvas` / `img` or is itself an `svg`.
 */
function hasFloatingShapeCandidates(container: HTMLElement): boolean {
  const markerSelector =
    '.docx-drawing, [data-anchor], [data-wrap], [style*="position:absolute"]';
  if (container.querySelector(markerSelector)) {
    return true;
  }

  const descendants = container.querySelectorAll<HTMLElement>('.docx *');
  return Array.from(descendants).some((node) => {
    const isAbsolute = window.getComputedStyle(node).position === 'absolute';
    if (!isAbsolute) {
      return false;
    }
    return (
      Boolean(node.querySelector('svg, canvas, img')) || node.tagName.toLowerCase() === 'svg'
    );
  });
}
/**
 * Builds a `<div>` host, appends it to `document.body`, and parks it far off-screen so
 * html2canvas can capture its contents without the tree being hidden through `display`,
 * `visibility` or `opacity` (PDF_converter.md §7 Rule 2).
 *
 * The host uses the shared preview width and padding constants, is marked `aria-hidden` and
 * `inert`, and ignores pointer events. This function does not remove it again.
 *
 * @returns The newly attached host element.
 */
export function createOffScreenDocxCaptureHost(): HTMLDivElement {
  const shellWidth = `${DOCX_PDF_PREVIEW_SHELL_MAX_WIDTH_PX}px`;
  const innerPadding = `${DOCX_PDF_PREVIEW_INNER_PADDING_PX}px`;

  const captureHost = document.createElement('div');
  captureHost.setAttribute('aria-hidden', 'true');
  captureHost.setAttribute('inert', '');

  const hostStyle = captureHost.style;
  hostStyle.position = 'fixed';
  hostStyle.left = '-100000px';
  hostStyle.top = '0';
  hostStyle.boxSizing = 'border-box';
  hostStyle.width = shellWidth;
  hostStyle.maxWidth = shellWidth;
  hostStyle.padding = innerPadding;
  hostStyle.pointerEvents = 'none';
  hostStyle.overflow = 'visible';
  hostStyle.backgroundColor = '#ffffff';

  document.body.appendChild(captureHost);
  return captureHost;
}
/**
 * Resolves on the next animation frame, or after `fallbackMs` when no frame arrives first.
 * Browsers pause animation frames for hidden/background tabs; the timer keeps the caller moving.
 */
function waitForNextFrameOrTimeout(fallbackMs = 100): Promise<void> {
  return new Promise<void>((resolve) => {
    let finished = false;
    const finish = (): void => {
      if (finished) return;
      finished = true;
      cancelAnimationFrame(frameHandle);
      clearTimeout(timerHandle);
      resolve();
    };
    const frameHandle = requestAnimationFrame(finish);
    const timerHandle = setTimeout(finish, fallbackMs);
  });
}

/**
 * Renders a .docx into `container`, then rasterises each `section` page to a multi-page PDF.
 * Clears `container.innerHTML` before rendering. The element must be attached to the document.
 *
 * Sequence: `onPhaseRendering` → docx-preview render → style injection and letterhead
 * normalisation → wait for fonts, images, two frames and `layoutSettleExtraMs` →
 * `onLayoutAnalysed` → `onPhaseCapturing` → one html2canvas capture per page, each followed by
 * `onCaptureProgress`. Every PDF page is sized to its captured section (in mm).
 *
 * @param source The .docx data to convert.
 * @param container Attached element that receives the rendered document; its previous content
 *   is discarded and the rendered output is left in place afterwards.
 * @param options Optional quality settings and progress hooks. A `renderScale` that is not a
 *   finite positive number falls back to the default of 2.
 * @returns The generated PDF as a `Blob`.
 * @throws Error when docx-preview produced no `<section>` page elements. Errors from
 *   docx-preview, html2canvas or jsPDF are not caught here.
 */
export async function convertDocxToPdfBlob(
  source: Blob | File,
  container: HTMLElement,
  options: ConvertDocxToPdfOptions = {},
): Promise<Blob> {
  const requestedScale = options.renderScale ?? 2;
  // Page sizes are derived as canvas size / scale, so zero, negative or NaN would poison them.
  const renderScale = Number.isFinite(requestedScale) && requestedScale > 0 ? requestedScale : 2;
  const imageQuality = options.imageQuality ?? 0.95;
  const losslessImages = options.losslessImages ?? false;
  const layoutSettleExtraMs = options.layoutSettleExtraMs ?? 120;

  options.onPhaseRendering?.();
  container.innerHTML = '';
  // The injected capture CSS is scoped to this data attribute.
  container.dataset.docxCaptureRoot = '1';
  await renderAsync(source, container, undefined, {
    ...SHARED_DOCX_OFFICIAL_FORM_RENDER_OPTIONS,
    ignoreLastRenderedPageBreak: false,
    useBase64URL: true,
    renderHeaders: true,
    renderFooters: true,
    renderFootnotes: true,
  });

  injectDocxTableReflowStyles(container, { pdfPreviewChrome: true });
  injectCaptureTypographyStyles(container);
  injectDocxJustifyMitigationStyles(container);
  normalizeOfficialFormCoverForPdfCapture(container);
  const appliedTimesFallbackOverride = !isTimesNewRomanAvailable();

  try {
    await (document.fonts?.ready ?? Promise.resolve());
  } catch {
    /* Best effort: capture proceeds with whatever fonts are available. */
  }
  await waitForRenderableAssets(container);
  // Two frames plus a short delay give the browser time to apply the injected styles.
  await waitForNextFrameOrTimeout();
  await waitForNextFrameOrTimeout();
  await new Promise<void>((r) => setTimeout(r, layoutSettleExtraMs));
  // Second pass after the settle delay, immediately before capture.
  normalizeOfficialFormCoverForPdfCapture(container);

  options.onLayoutAnalysed?.({
    hasFloatingShapeCandidates: hasFloatingShapeCandidates(container),
    appliedTimesFallbackOverride,
  });

  let pages = Array.from(container.querySelectorAll<HTMLElement>('section.docx'));
  if (pages.length === 0) {
    // Fall back to any <section> when the expected `docx` class is absent.
    pages = Array.from(container.querySelectorAll<HTMLElement>('section'));
  }
  if (pages.length === 0) {
    throw new Error(
      'docx-preview rendered the document but produced no <section> page elements.',
    );
  }
  options.onPhaseCapturing?.(pages.length);

  const imgType = losslessImages ? 'PNG' : 'JPEG';
  const mimeType = losslessImages ? 'image/png' : 'image/jpeg';
  let pdf: jsPDF | null = null;

  for (const [index, page] of pages.entries()) {
    const canvas = await html2canvas(page, {
      scale: renderScale,
      useCORS: true,
      backgroundColor: '#ffffff',
      logging: false,
      windowWidth: page.offsetWidth,
      windowHeight: page.offsetHeight,
    });

    // Convert the canvas back to CSS px, then to mm, so the PDF page mirrors the section size.
    const pageWidthMm = (canvas.width / renderScale) * PX_TO_MM;
    const pageHeightMm = (canvas.height / renderScale) * PX_TO_MM;
    // Orientation follows the page's own aspect so it agrees with the explicit format below.
    const orientation = pageWidthMm > pageHeightMm ? 'landscape' : 'portrait';
    const imgData = canvas.toDataURL(mimeType, losslessImages ? undefined : imageQuality);

    if (!pdf) {
      pdf = new jsPDF({
        orientation,
        unit: 'mm',
        format: [pageWidthMm, pageHeightMm],
        compress: true,
      });
    } else {
      pdf.addPage([pageWidthMm, pageHeightMm], orientation);
    }
    pdf.addImage(imgData, imgType, 0, 0, pageWidthMm, pageHeightMm, undefined, 'FAST');
    options.onCaptureProgress?.(index + 1, pages.length);
  }

  if (pdf === null) {
    // Unreachable while the empty-page guard above holds; kept instead of a non-null assertion.
    throw new Error('No PDF pages were produced.');
  }
  return pdf.output('blob');
}