sciagent code + Gitea Actions CI/CD
Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
This commit is contained in:
@@ -0,0 +1,288 @@
|
||||
import { renderAsync } from 'docx-preview';
|
||||
import html2canvas from 'html2canvas';
|
||||
import jsPDF from 'jspdf';
|
||||
import { injectDocxJustifyMitigationStyles } from '../lib/docxJustifyMitigationCss';
|
||||
import { injectDocxTableReflowStyles } from '../lib/docxTableReflow';
|
||||
import { SHARED_DOCX_OFFICIAL_FORM_RENDER_OPTIONS } from '../lib/sharedDocxOfficialFormRenderOptions';
|
||||
|
||||
/** Millimetres per CSS pixel (25.4 mm per inch over 96 px per inch); multiplied by CSS pixel sizes to get PDF page dimensions in mm. */
const PX_TO_MM = 25.4 / 96;
|
||||
|
||||
/**
 * Resolves once every `<img>` inside `container` has finished loading, whether it
 * succeeded or failed. A broken image never rejects the returned promise.
 *
 * For each image that is not yet `complete`, `img.decode()` is tried first when the
 * method exists; if it rejects, the function falls back to waiting for a `load` or
 * `error` event.
 *
 * @param container Element whose descendant images are awaited.
 * @returns A promise that settles when no image is still pending.
 */
async function waitForRenderableAssets(container: HTMLElement): Promise<void> {
  const images = Array.from(container.querySelectorAll<HTMLImageElement>('img'));
  if (images.length === 0) return;

  await Promise.all(
    images.map(async (img) => {
      if (img.complete) return;
      if (typeof img.decode === 'function') {
        try {
          await img.decode();
          return;
        } catch {
          // decode() commonly rejects because the request has already failed. In that
          // case `load`/`error` have already been dispatched and would never reach the
          // listeners registered below, so re-check `complete` to avoid waiting forever.
          if (img.complete) return;
        }
      }

      // Still pending (or decode() is unavailable): wait for whichever event comes first.
      await new Promise<void>((resolve) => {
        const done = () => resolve();
        img.addEventListener('load', done, { once: true });
        img.addEventListener('error', done, { once: true });
      });
    }),
  );
}
|
||||
|
||||
/**
 * Maximum width of the preview shell in CSS px. Matches `.shell { max-width }` in
 * `docx-to-pdf-demo.html`. The visible preview and the off-screen capture use the same
 * number so that line breaks and tables match.
 */
export const DOCX_PDF_PREVIEW_SHELL_MAX_WIDTH_PX = 980;

/**
 * Inner padding of the preview area in CSS px. Matches `.preview { padding }` in
 * `docx-to-pdf-demo.html`, and is likewise shared by the visible preview and the
 * off-screen capture.
 */
export const DOCX_PDF_PREVIEW_INNER_PADDING_PX = 28;
|
||||
|
||||
/** Optional tuning knobs and progress hooks for the DOCX → PDF conversion. */
export type ConvertDocxToPdfOptions = {
  /** Scale factor handed to html2canvas. Defaults to 2. */
  renderScale?: number;

  /** JPEG quality in the range 0–1; only used when images are not lossless. Defaults to 0.95. */
  imageQuality?: number;

  /** When true, page images are embedded in the PDF as PNG instead of JPEG. */
  losslessImages?: boolean;

  /** Invoked before docx-preview's `renderAsync` is called. */
  onPhaseRendering?: () => void;

  /** Invoked once layout has settled, before the per-page capture starts. */
  onPhaseCapturing?: (pageCount: number) => void;

  /** Invoked after each page has been rasterised; `current` is 1-based. */
  onCaptureProgress?: (current: number, total: number) => void;

  /**
   * Additional wait in ms after layout settles (tables/fonts).
   * Mirrors docx-to-pdf-demo.html (120).
   */
  layoutSettleExtraMs?: number;

  /** Optional hook that receives render-layout signals for QA/advisory UI. */
  onLayoutAnalysed?: (insights: ConvertDocxToPdfLayoutInsights) => void;
};
|
||||
|
||||
/** Layout signals reported to `onLayoutAnalysed` after the document has been rendered. */
export type ConvertDocxToPdfLayoutInsights = {
  /** Set when the rendered page appears to contain absolute-positioned drawing/shape elements. */
  hasFloatingShapeCandidates: boolean;

  /** Set when Times New Roman is unavailable; the capture CSS still applies a serif stack. */
  appliedTimesFallbackOverride: boolean;
};
|
||||
|
||||
/**
 * Asks the document's font set whether `16px "Times New Roman"` can be used.
 *
 * The answer is optimistic: `true` is returned when `document.fonts` or its `check`
 * method is missing, and also when `check` throws.
 *
 * NOTE(review): `FontFaceSet.check()` may report `true` for a family that is neither in
 * the set nor installed on the system — confirm this probe is reliable on target browsers.
 *
 * @returns `false` only when `check` explicitly reports the font as unavailable.
 */
function isTimesNewRomanAvailable(): boolean {
  const fontFaces = document.fonts;
  if (typeof fontFaces?.check !== 'function') return true;

  let available = true;
  try {
    available = fontFaces.check('16px "Times New Roman"');
  } catch {
    // Keep the optimistic default when the probe itself fails.
  }
  return available;
}
|
||||
|
||||
/**
 * Forces a Times-like serif stack on everything under `.docx-wrapper` so the rasterised
 * PDF keeps Word-accurate typography (html2canvas often used the system sans otherwise).
 *
 * The call is a no-op when `scope` already contains the marker `<style>` element.
 * Otherwise `scope` is tagged with `data-docx-capture-root` (unless it already has a
 * non-empty value) and a `<style data-docx-capture-typography="1">` is appended to it.
 *
 * @param scope Element that hosts the rendered document and receives the style element.
 */
function injectCaptureTypographyStyles(scope: HTMLElement): void {
  const existingStyle = scope.querySelector('style[data-docx-capture-typography="1"]');
  if (existingStyle) return;

  // The CSS below is keyed on this attribute, so make sure the scope carries it.
  if (!scope.dataset.docxCaptureRoot) {
    scope.dataset.docxCaptureRoot = '1';
  }

  const typographyStyle = document.createElement('style');
  typographyStyle.setAttribute('data-docx-capture-typography', '1');
  typographyStyle.textContent = `
    [data-docx-capture-root] .docx-wrapper,
    [data-docx-capture-root] .docx-wrapper * {
      font-family: "Times New Roman", Times, "Liberation Serif", "Noto Serif", serif !important;
    }
  `;
  scope.appendChild(typographyStyle);
}
|
||||
|
||||
/**
 * Normalises the letterhead typography of the first rendered page so the rasterised PDF
 * matches the official template.
 *
 * docx-preview can keep « BỘ Y TẾ » bold (style / strong) and may leak italic from
 * inherited styles even when the OOXML run is regular. For every `<p>` inside the first
 * `section.docx` under `root`, the whitespace-collapsed text is matched and inline
 * `!important` styles are applied to the paragraph and all of its descendants:
 *
 * - « BỘ Y TẾ » → centred, regular (400), upright
 * - « ĐẠI HỌC Y DƯỢC » / « THÀNH PHỐ HỒ CHÍ MINH » (alone, or both in one paragraph)
 *   → centred, bold (700), upright
 *
 * The source DOCX contains the typo « ĐẠI HỘC »; both spellings are matched so this keeps
 * working if the template is ever corrected.
 *
 * Text is compared in Unicode NFC on both sides, so a document that stores Vietnamese
 * letters as base letter + combining marks still matches.
 *
 * @param root Element containing the docx-preview output; nothing happens when it has no
 *   `section.docx` descendant.
 */
function normalizeOfficialFormCoverForPdfCapture(root: HTMLElement): void {
  const section = root.querySelector<HTMLElement>('section.docx');
  if (!section) return;

  const toNfc = (text: string): string => text.normalize('NFC');

  // Normalise the literals too, so matching does not depend on how this file was saved.
  const ministryLine = toNfc('BỘ Y TẾ');
  const universityNames = [toNfc('ĐẠI HỌC Y DƯỢC'), toNfc('ĐẠI HỘC Y DƯỢC')];
  const cityLine = toNfc('THÀNH PHỐ HỒ CHÍ MINH');

  const setLetterheadTypography = (el: HTMLElement, weight: '400' | '700'): void => {
    // Descendant runs carry their own weight/style, so override them individually as well.
    const targets = [el, ...Array.from(el.querySelectorAll<HTMLElement>('*'))];
    for (const target of targets) {
      target.style.setProperty('font-weight', weight, 'important');
      target.style.setProperty('font-style', 'normal', 'important');
    }
  };

  const hasUniversity = (line: string): boolean =>
    universityNames.some((name) => line.includes(name));
  const isUniversityOnly = (line: string): boolean => universityNames.includes(line);

  const paras = Array.from(section.querySelectorAll<HTMLElement>('p'));
  for (const el of paras) {
    const line = toNfc(el.textContent ?? '')
      .replace(/\s+/g, ' ')
      .trim();

    if (line === ministryLine) {
      el.style.setProperty('text-align', 'center', 'important');
      setLetterheadTypography(el, '400');
      continue;
    }

    const isUniversity =
      (hasUniversity(line) && line.includes(cityLine)) ||
      isUniversityOnly(line) ||
      line === cityLine;
    if (isUniversity) {
      el.style.setProperty('text-align', 'center', 'important');
      setLetterheadTypography(el, '700');
    }
  }
}
|
||||
|
||||
/**
 * Heuristically detects floating drawings/shapes in the rendered document.
 *
 * Two passes are made:
 * 1. A cheap selector probe for `.docx-drawing`, `[data-anchor]`, `[data-wrap]`, or an
 *    inline style containing `position:absolute`.
 * 2. A scan of every element under `.docx` whose computed `position` is `absolute` and
 *    that either is an `<svg>` or contains an `svg`, `canvas` or `img`.
 *
 * NOTE(review): the inline-style selector only matches the exact text `position:absolute`
 * (no space after the colon) — confirm whether `position: absolute` should count as well.
 *
 * @param container Element holding the rendered document.
 * @returns `true` as soon as either pass finds a candidate, otherwise `false`.
 */
function hasFloatingShapeCandidates(container: HTMLElement): boolean {
  const markerSelector =
    '.docx-drawing, [data-anchor], [data-wrap], [style*="position:absolute"]';
  if (container.querySelector(markerSelector)) return true;

  const descendants = Array.from(container.querySelectorAll<HTMLElement>('.docx *'));
  return descendants.some((node) => {
    const isAbsolute = window.getComputedStyle(node).position === 'absolute';
    if (!isAbsolute) return false;
    return Boolean(node.querySelector('svg, canvas, img')) || node.tagName.toLowerCase() === 'svg';
  });
}
|
||||
|
||||
/**
 * Builds a `<div>` appended to `document.body` and parked far off-screen, so html2canvas
 * can capture its content without the tree being hidden through `display`, `visibility`
 * or `opacity` (PDF_converter.md §7 Rule 2).
 *
 * The host is marked `aria-hidden` and `inert`, ignores pointer events, has a white
 * background, and uses the shared preview width and padding constants.
 *
 * @returns The newly created host element, already attached to the document body. This
 *   function does not remove it again.
 */
export function createOffScreenDocxCaptureHost(): HTMLDivElement {
  const shellWidth = `${DOCX_PDF_PREVIEW_SHELL_MAX_WIDTH_PX}px`;

  const captureHost = document.createElement('div');
  captureHost.setAttribute('aria-hidden', 'true');
  captureHost.setAttribute('inert', '');

  const hostStyle = captureHost.style;
  // Positioned far to the left instead of hidden, so the subtree still lays out and paints.
  hostStyle.position = 'fixed';
  hostStyle.left = '-100000px';
  hostStyle.top = '0';
  hostStyle.boxSizing = 'border-box';
  hostStyle.width = shellWidth;
  hostStyle.maxWidth = shellWidth;
  hostStyle.padding = `${DOCX_PDF_PREVIEW_INNER_PADDING_PX}px`;
  hostStyle.pointerEvents = 'none';
  hostStyle.overflow = 'visible';
  hostStyle.backgroundColor = '#ffffff';

  document.body.appendChild(captureHost);
  return captureHost;
}
|
||||
|
||||
/**
 * Renders a .docx into `container` with docx-preview, then rasterises each rendered
 * `section` page with html2canvas and assembles the images into a multi-page PDF.
 *
 * `container.innerHTML` is cleared before rendering, and the element must be attached to
 * the document. Each PDF page takes its size from the captured canvas (CSS px → mm).
 *
 * Callback order: `onPhaseRendering` → `onLayoutAnalysed` → `onPhaseCapturing` →
 * `onCaptureProgress` once per page.
 *
 * @param source The .docx file contents.
 * @param container Attached element that receives the rendered document.
 * @param options Rendering/encoding options and progress hooks; all optional.
 * @returns The generated PDF as a `Blob`.
 * @throws Error when docx-preview produced no `<section>` page elements.
 */
export async function convertDocxToPdfBlob(
  source: Blob | File,
  container: HTMLElement,
  options: ConvertDocxToPdfOptions = {},
): Promise<Blob> {
  const scale = options.renderScale ?? 2;
  const jpegQuality = options.imageQuality ?? 0.95;
  const usePng = options.losslessImages ?? false;
  const settleDelayMs = options.layoutSettleExtraMs ?? 120;

  const nextAnimationFrame = (): Promise<void> =>
    new Promise<void>((resolve) => requestAnimationFrame(() => resolve()));
  const pause = (ms: number): Promise<void> =>
    new Promise<void>((resolve) => setTimeout(resolve, ms));

  // --- Phase 1: render the document into the container -------------------------------
  options.onPhaseRendering?.();
  container.innerHTML = '';
  container.dataset.docxCaptureRoot = '1';
  await renderAsync(source, container, undefined, {
    ...SHARED_DOCX_OFFICIAL_FORM_RENDER_OPTIONS,
    ignoreLastRenderedPageBreak: false,
    useBase64URL: true,
    renderHeaders: true,
    renderFooters: true,
    renderFootnotes: true,
  });

  injectDocxTableReflowStyles(container, { pdfPreviewChrome: true });
  injectCaptureTypographyStyles(container);
  injectDocxJustifyMitigationStyles(container);
  normalizeOfficialFormCoverForPdfCapture(container);

  const appliedTimesFallbackOverride = !isTimesNewRomanAvailable();

  // --- Phase 2: let fonts, images and layout settle ----------------------------------
  try {
    await (document.fonts?.ready ?? Promise.resolve());
  } catch {
    /* ignore */
  }
  await waitForRenderableAssets(container);
  await nextAnimationFrame();
  await nextAnimationFrame();
  await pause(settleDelayMs);
  // Applied a second time once layout has settled.
  normalizeOfficialFormCoverForPdfCapture(container);

  options.onLayoutAnalysed?.({
    hasFloatingShapeCandidates: hasFloatingShapeCandidates(container),
    appliedTimesFallbackOverride,
  });

  // --- Phase 3: locate the page elements ---------------------------------------------
  const docxSections = Array.from(container.querySelectorAll<HTMLElement>('section.docx'));
  const pages =
    docxSections.length > 0
      ? docxSections
      : Array.from(container.querySelectorAll<HTMLElement>('section'));
  if (pages.length === 0) {
    throw new Error(
      'docx-preview rendered the document but produced no <section> page elements.',
    );
  }

  options.onPhaseCapturing?.(pages.length);

  // --- Phase 4: rasterise every page and append it to the PDF ------------------------
  const pdfImageFormat = usePng ? 'PNG' : 'JPEG';
  const canvasMimeType = usePng ? 'image/png' : 'image/jpeg';
  const canvasQuality = usePng ? undefined : jpegQuality;
  let pdf: jsPDF | null = null;

  for (const [pageIndex, page] of pages.entries()) {
    const canvas = await html2canvas(page, {
      scale,
      useCORS: true,
      backgroundColor: '#ffffff',
      logging: false,
      windowWidth: page.offsetWidth,
      windowHeight: page.offsetHeight,
    });

    // The canvas is `scale` times larger than the CSS box; convert back to CSS px, then mm.
    const pageWidthMm = (canvas.width / scale) * PX_TO_MM;
    const pageHeightMm = (canvas.height / scale) * PX_TO_MM;
    const orientation = pageWidthMm > pageHeightMm ? 'landscape' : 'portrait';

    const imgData = canvas.toDataURL(canvasMimeType, canvasQuality);

    if (pdf === null) {
      // The first page defines the document; later pages are added with their own size.
      pdf = new jsPDF({
        orientation,
        unit: 'mm',
        format: [pageWidthMm, pageHeightMm],
        compress: true,
      });
    } else {
      pdf.addPage([pageWidthMm, pageHeightMm], orientation);
    }
    pdf.addImage(imgData, pdfImageFormat, 0, 0, pageWidthMm, pageHeightMm, undefined, 'FAST');

    options.onCaptureProgress?.(pageIndex + 1, pages.length);
  }

  // `pages` is non-empty here, so the loop ran at least once and `pdf` has been created.
  return pdf!.output('blob');
}
|
||||
Reference in New Issue
Block a user