import { logger } from "./chunk-47DJ6YUB.js";
import { getTempDir } from "./chunk-FTVFWJFJ.js";
import { InternalError, ValidationError, resolveNotebookPath } from "./chunk-ER4KPD22.js";
import {
  NOTEBOOK_ROOT,
  PROJECT_ROOT,
  TEMP_ROOT,
  asyncHandler,
  createApiModule,
  defineApiModule,
  successResponse
} from "./chunk-74TMTGBG.js";

// api/modules/document-parser/index.ts
import express3 from "express";

// shared/modules/document-parser/index.ts
var DOCUMENT_PARSER_MODULE = defineApiModule({
  id: "document-parser",
  name: "Document Parser",
  basePath: "/document-parser",
  order: 60,
  version: "1.0.0",
  frontend: { enabled: false },
  backend: { enabled: true }
});

// api/modules/document-parser/blogRoutes.ts
import express from "express";
import path3 from "path";
import fs3 from "fs/promises";
import { existsSync as existsSync2 } from "fs";
import axios from "axios";

// api/utils/file.ts
import fs from "fs/promises";
import path from "path";

/**
 * Finds a filename that does not exist yet inside a directory.
 *
 * Tries `<baseName><ext>` first, then `<baseName>-2<ext>`, `<baseName>-3<ext>`, ...
 * for up to 1000 attempts.
 *
 * @param {string} imagesDirFullPath - Directory the file will be created in.
 * @param {string} baseName - Filename without extension.
 * @param {string} ext - Extension including the leading dot (may be empty).
 * @returns {Promise<string>} The first filename for which `fs.access` fails.
 * @throws {InternalError} When every attempted name is already taken.
 */
var getUniqueFilename = async (imagesDirFullPath, baseName, ext) => {
  const maxAttempts = 1e3;
  for (let i = 0; i < maxAttempts; i++) {
    // The first collision gets the suffix "-2", so the unsuffixed name counts as "1".
    const suffix = i === 0 ? "" : `-${i + 1}`;
    const filename = `${baseName}${suffix}${ext}`;
    const fullPath = path.join(imagesDirFullPath, filename);
    try {
      await fs.access(fullPath);
    } catch {
      // Any access failure is treated as "name is free".
      return filename;
    }
  }
  throw new InternalError("Failed to generate unique filename");
};

/** Maps supported image MIME types to the file extension used when saving. */
var mimeToExt = {
  "image/png": ".png",
  "image/jpeg": ".jpg",
  "image/jpg": ".jpg",
  "image/gif": ".gif",
  "image/webp": ".webp"
};

/** Leading signature bytes (and optional offset) per supported MIME type. */
var IMAGE_MAGIC_BYTES = {
  "image/png": { bytes: [137, 80, 78, 71, 13, 10, 26, 10] },
  "image/jpeg": { bytes: [255, 216, 255] },
  "image/gif": { bytes: [71, 73, 70, 56] },
  "image/webp": { bytes: [82, 73, 70, 70], offset: 0 }
};

/** ASCII "WEBP", expected at byte offset 8 of a RIFF container. */
var WEBP_WEBP_MARKER = [87, 69, 66, 80];
var MIN_IMAGE_SIZE = 16;
var MAX_IMAGE_SIZE = 8 * 1024 * 1024;

/** Returns true when `buffer` contains exactly `bytes` starting at `offset`. */
var hasBytesAt = (buffer, bytes, offset = 0) => bytes.every((byte, i) => buffer[offset + i] === byte);

/**
 * Checks that an image buffer has a plausible size and that its leading bytes
 * match the signature of the claimed MIME type.
 *
 * @param {Uint8Array} buffer - Raw image bytes (anything indexable with `byteLength`).
 * @param {string} claimedMimeType - MIME type the caller claims the data has.
 * @returns {void}
 * @throws {ValidationError} On size violations, unsupported types or signature mismatch.
 */
var validateImageBuffer = (buffer, claimedMimeType) => {
  if (buffer.byteLength < MIN_IMAGE_SIZE) {
    throw new ValidationError("Image file is too small or corrupted");
  }
  if (buffer.byteLength > MAX_IMAGE_SIZE) {
    throw new ValidationError("Image file is too large");
  }
  // "image/jpg" is accepted by mimeToExt, so validate it as JPEG instead of rejecting it.
  const mimeType = claimedMimeType === "image/jpg" ? "image/jpeg" : claimedMimeType;
  // Own-property lookup only: inherited keys such as "constructor" must not pass as a type.
  const magicInfo = Object.prototype.hasOwnProperty.call(IMAGE_MAGIC_BYTES, mimeType) ? IMAGE_MAGIC_BYTES[mimeType] : void 0;
  if (!magicInfo) {
    throw new ValidationError("Unsupported image type for content validation");
  }
  if (!hasBytesAt(buffer, magicInfo.bytes, magicInfo.offset || 0)) {
    throw new ValidationError("Image content does not match the claimed file type");
  }
  // MIN_IMAGE_SIZE already guarantees the 12 bytes needed for the RIFF/WEBP check.
  if (mimeType === "image/webp" && !hasBytesAt(buffer, WEBP_WEBP_MARKER, 8)) {
    throw new ValidationError("WebP image content is invalid");
  }
};

/**
 * Detects the MIME type of an image from its leading signature bytes.
 *
 * @param {Uint8Array} buffer - Raw image bytes.
 * @returns {string|null} One of "image/png", "image/jpeg", "image/gif",
 *   "image/webp", or null when the buffer is shorter than 8 bytes or unrecognised.
 */
var detectImageMimeType = (buffer) => {
  if (buffer.byteLength < 8) return null;
  if (hasBytesAt(buffer, IMAGE_MAGIC_BYTES["image/png"].bytes)) {
    return "image/png";
  }
  if (hasBytesAt(buffer, IMAGE_MAGIC_BYTES["image/jpeg"].bytes)) {
    return "image/jpeg";
  }
  if (hasBytesAt(buffer, IMAGE_MAGIC_BYTES["image/gif"].bytes)) {
    return "image/gif";
  }
  if (hasBytesAt(buffer, IMAGE_MAGIC_BYTES["image/webp"].bytes) && hasBytesAt(buffer, WEBP_WEBP_MARKER, 8)) {
    return "image/webp";
  }
  return null;
};

// shared/utils/date.ts
/** Left-pads a value with zeros to at least two characters. */
var pad2 = (n) => String(n).padStart(2, "0");
/** Left-pads a value with zeros to at least three characters. */
var pad3 = (n) => String(n).padStart(3, "0");

/**
 * Formats a date as `YYYYMMDD_HHmmss_SSS` using the Date's local-time getters.
 *
 * @param {Date} d - Date to format.
 * @returns {string} The formatted timestamp.
 */
var formatTimestamp = (d) => {
  const yyyy = d.getFullYear();
  const mm = pad2(d.getMonth() + 1);
  const dd = pad2(d.getDate());
  const hh = pad2(d.getHours());
  const mi = pad2(d.getMinutes());
  const ss = pad2(d.getSeconds());
  const ms = pad3(d.getMilliseconds());
  return `${yyyy}${mm}${dd}_${hh}${mi}${ss}_${ms}`;
};

// api/modules/document-parser/documentParser.ts
import path2 from "path";
import { spawn } from "child_process";
import fs2 from "fs/promises";
import { existsSync, mkdirSync } from "fs";

// Module-load side effect: make sure the temp root exists before any job runs.
if (!existsSync(TEMP_ROOT)) {
  mkdirSync(TEMP_ROOT, { recursive: true });
}

/**
 * Lexical containment check: true when `candidate` is `parentDir` itself or a
 * path below it. Symlinks are not resolved.
 *
 * @param {string} parentDir - Directory that must contain the candidate.
 * @param {string} candidate - Path to test.
 * @returns {boolean}
 */
var isPathInside = (parentDir, candidate) => {
  const rel = path2.relative(path2.resolve(parentDir), path2.resolve(candidate));
  if (rel === "") return true;
  // An absolute result means the two paths share no common root (e.g. different drives).
  if (path2.isAbsolute(rel)) return false;
  return rel !== ".." && !rel.startsWith(`..${path2.sep}`);
};

/**
 * Creates the working directories for one parsing job.
 *
 * Creates `<TEMP_ROOT>/<prefix>_<timestamp>` as the job directory and
 * `<NOTEBOOK_ROOT>/images/<year>/<month>/<day>` as the image destination.
 *
 * @param {string} prefix - Prefix for the job directory name.
 * @returns {Promise<{jobDir: string, now: Date, imagesSubDir: string, destImagesDir: string}>}
 */
var createJobContext = async (prefix) => {
  const now = /* @__PURE__ */ new Date();
  const jobDir = path2.join(TEMP_ROOT, `${prefix}_${formatTimestamp(now)}`);
  await fs2.mkdir(jobDir, { recursive: true });
  const year = now.getFullYear();
  const month = pad2(now.getMonth() + 1);
  const day = pad2(now.getDate());
  const imagesSubDir = `images/${year}/${month}/${day}`;
  const destImagesDir = path2.join(NOTEBOOK_ROOT, imagesSubDir);
  await fs2.mkdir(destImagesDir, { recursive: true });
  return { jobDir, now, imagesSubDir, destImagesDir };
};

/**
 * Runs a Python script with UTF-8 I/O forced and collects its stdout.
 *
 * @param {{scriptPath: string, args: string[], cwd?: string, inputContent?: string}} options
 *   `inputContent`, when provided, is written to the child's stdin, which is then closed.
 * @returns {Promise<string>} Resolves with stdout when the process exits with code 0.
 *   Rejects with the spawn error, or with an Error carrying stderr on a non-zero exit.
 */
var spawnPythonScript = async (options) => {
  const { scriptPath, args, cwd, inputContent } = options;
  return new Promise((resolve, reject) => {
    const pythonProcess = spawn("python", ["-X", "utf8", scriptPath, ...args], {
      cwd,
      env: {
        ...process.env,
        PYTHONIOENCODING: "utf-8",
        PYTHONUTF8: "1"
      }
    });
    let stdout = "";
    let stderr = "";
    // A failed spawn emits "error" and may also emit "close"; report only the first outcome.
    let settled = false;
    pythonProcess.stdout.on("data", (data) => {
      stdout += data.toString();
    });
    pythonProcess.stderr.on("data", (data) => {
      stderr += data.toString();
    });
    pythonProcess.on("close", (code) => {
      if (settled) return;
      settled = true;
      if (code !== 0) {
        logger.error("Python script error:", stderr);
        reject(new Error(`Process exited with code ${code}. Error: ${stderr}`));
      } else {
        resolve(stdout);
      }
    });
    pythonProcess.on("error", (err) => {
      if (settled) return;
      settled = true;
      reject(err);
    });
    if (inputContent !== void 0) {
      // Without a listener a stream "error" event (e.g. EPIPE) is thrown as an
      // uncaught exception; the job outcome is still decided by "close"/"error".
      pythonProcess.stdin.on("error", (err) => {
        logger.error("Python stdin error:", err);
      });
      pythonProcess.stdin.write(inputContent);
      pythonProcess.stdin.end();
    }
  });
};

/**
 * Locates the destination part of every `![alt](destination)` in a markdown string.
 *
 * Parentheses inside the destination are balanced. Scanning stops at the first
 * image whose `]` or closing `)` cannot be found.
 *
 * @param {string} md - Markdown text.
 * @returns {Array<{url: string, start: number, end: number}>} `start`/`end` are the
 *   offsets of the text between the parentheses (`end` is exclusive).
 */
var findImageDestinations = (md) => {
  const results = [];
  let i = 0;
  while (i < md.length) {
    const bang = md.indexOf("![", i);
    if (bang === -1) break;
    const closeBracket = md.indexOf("]", bang + 2);
    if (closeBracket === -1) break;
    if (md[closeBracket + 1] !== "(") {
      // "![...]" without a following "(" is not an inline image; keep scanning after it.
      i = closeBracket + 1;
      continue;
    }
    const urlStart = closeBracket + 2;
    let depth = 1;
    let j = urlStart;
    for (; j < md.length; j++) {
      const ch = md[j];
      if (ch === "(") depth++;
      else if (ch === ")") {
        depth--;
        if (depth === 0) break;
      }
    }
    if (depth !== 0) break;
    results.push({ url: md.slice(urlStart, j), start: urlStart, end: j });
    i = j + 1;
  }
  return results;
};

/**
 * Applies `{start, end, replacement}` edits to a string.
 *
 * Edits are applied from the highest `start` to the lowest so that earlier
 * offsets stay valid. The input array is not mutated.
 *
 * @param {string} md - Original text.
 * @param {Array<{start: number, end: number, replacement: string}>} replacements
 * @returns {string} The text with all replacements applied.
 */
var applyReplacements = (md, replacements) => {
  const sorted = [...replacements].sort((a, b) => b.start - a.start);
  let result = md;
  for (const r of sorted) {
    result = `${result.slice(0, r.start)}${r.replacement}${result.slice(r.end)}`;
  }
  return result;
};

/**
 * Copies a locally referenced image into the notebook image directory.
 *
 * The reference is trimmed, stripped of surrounding `<`/`>`, URI-decoded when
 * possible and normalised to forward slashes, then looked up relative to
 * `htmlDir` and `jobDir`. Candidates that resolve outside both directories are
 * ignored so a crafted reference (e.g. `../../secret`) cannot pull arbitrary
 * files into the notebook.
 *
 * @param {string} src - Image destination as written in the markdown.
 * @param {string} jobDir - Temporary job directory.
 * @param {string} htmlDir - Directory of the HTML file being converted.
 * @param {string} destImagesDir - Absolute directory the image is copied into.
 * @param {string} imagesSubDir - Notebook-relative form of `destImagesDir`, used in the link.
 * @param {Date} now - Timestamp used to build the new filename.
 * @returns {Promise<{newLink: string}|null>} The new link, or null when no file was found.
 */
var copyLocalImage = async (src, jobDir, htmlDir, destImagesDir, imagesSubDir, now) => {
  const s0 = src.trim().replace(/^<|>$/g, "");
  if (!s0) return null;
  let decoded = s0;
  try {
    decoded = decodeURI(s0);
  } catch {
    // Malformed percent-encoding: fall back to the raw reference.
  }
  const s1 = decoded.replace(/\\/g, "/");
  const s2 = s1.startsWith("./") ? s1.slice(2) : s1;
  const candidates = s2.startsWith("/") ? [path2.join(jobDir, s2.slice(1)), path2.join(htmlDir, s2.slice(1))] : [path2.resolve(htmlDir, s2), path2.resolve(jobDir, s2)];
  const allowedRoots = [jobDir, htmlDir];
  const foundFile = candidates.find(
    (candidate) => allowedRoots.some((root) => isPathInside(root, candidate)) && existsSync(candidate)
  );
  if (!foundFile) return null;
  const ext = path2.extname(foundFile) || ".jpg";
  const baseName = formatTimestamp(now);
  const newFilename = await getUniqueFilename(destImagesDir, baseName, ext);
  const newPath = path2.join(destImagesDir, newFilename);
  await fs2.copyFile(foundFile, newPath);
  return { newLink: `/${imagesSubDir}/${newFilename}` };
};

/**
 * Best-effort removal of a job directory and any additional files.
 * Failures are ignored on purpose; this never rejects.
 *
 * @param {string} jobDir - Directory to remove recursively.
 * @param {string[]} [additionalPaths] - Extra files to unlink.
 * @returns {Promise<void>}
 */
var cleanupJob = async (jobDir, additionalPaths = []) => {
  await fs2.rm(jobDir, { recursive: true, force: true }).catch(() => {
  });
  for (const p of additionalPaths) {
    await fs2.unlink(p).catch(() => {
    });
  }
};

/** Builds `<PROJECT_ROOT>/tools/<toolName>/<scriptName>`. */
var getScriptPath = (toolName, scriptName) => {
  return path2.join(PROJECT_ROOT, "tools", toolName, scriptName);
};

/** Returns whether the given script path exists on disk. */
var ensureScriptExists = (scriptPath) => {
  return existsSync(scriptPath);
};

// api/modules/document-parser/blogRoutes.ts
var router = express.Router();
// Not read in this module; the call is kept in case getTempDir() has side effects.
var tempDir = getTempDir();

/** Extensions accepted when taking the file extension from a remote image URL. */
var KNOWN_IMAGE_EXTENSIONS = /* @__PURE__ */ new Set([
  ".png",
  ".jpg",
  ".jpeg",
  ".jfif",
  ".gif",
  ".webp",
  ".svg",
  ".bmp",
  ".ico",
  ".avif",
  ".tif",
  ".tiff"
]);

/**
 * Chooses the extension for a downloaded image.
 *
 * A recognised image extension in the URL path wins; otherwise the extension
 * is derived from the Content-Type header, defaulting to ".jpg". Unrecognised
 * URL "extensions" (e.g. ".com" from a bare host, ".php") are ignored.
 */
var pickRemoteImageExtension = (url, contentType) => {
  let ext = ".jpg";
  if (contentType) {
    if (contentType.includes("png")) ext = ".png";
    else if (contentType.includes("gif")) ext = ".gif";
    else if (contentType.includes("webp")) ext = ".webp";
    else if (contentType.includes("svg")) ext = ".svg";
    else if (contentType.includes("jpeg") || contentType.includes("jpg")) ext = ".jpg";
  }
  const urlExt = path3.extname(url.split(/[?#]/)[0]);
  if (urlExt && KNOWN_IMAGE_EXTENSIONS.has(urlExt.toLowerCase())) ext = urlExt;
  return ext;
};

/**
 * Downloads a remote image into the notebook image directory.
 * Best effort: failures are logged and reported as null so the original link is kept.
 *
 * @returns {Promise<string|null>} The new notebook link, or null on failure.
 */
var downloadRemoteImage = async (url, ctx) => {
  try {
    const response = await axios.get(url, {
      responseType: "arraybuffer",
      timeout: 1e4
    });
    const ext = pickRemoteImageExtension(url, response.headers["content-type"]);
    const baseName = formatTimestamp(ctx.now);
    const newFilename = await getUniqueFilename(ctx.destImagesDir, baseName, ext);
    const newPath = path3.join(ctx.destImagesDir, newFilename);
    await fs3.writeFile(newPath, response.data);
    return `/${ctx.imagesSubDir}/${newFilename}`;
  } catch (err) {
    logger.error("Failed to download remote image:", url, err);
    return null;
  }
};

/**
 * Looks an image up in the original (un-copied) assets directory, first by its
 * relative path and then by its basename, and copies the first hit into the
 * notebook image directory. Candidates outside the assets directory are skipped.
 *
 * @returns {Promise<{newLink: string}|null>}
 */
var copyFromOriginalAssets = async (originalSrc, originalHtmlDir, originalAssetsDirName, ctx) => {
  const assetsRoot = path3.join(originalHtmlDir, originalAssetsDirName);
  const srcWithFiles = originalSrc.replace(/^\.\//, "").replace(/^\//, "");
  const possiblePaths = [
    path3.join(assetsRoot, srcWithFiles),
    path3.join(assetsRoot, path3.basename(srcWithFiles))
  ];
  for (const p of possiblePaths) {
    if (!isPathInside(assetsRoot, p) || !existsSync2(p)) continue;
    const ext = path3.extname(p) || ".jpg";
    const baseName = formatTimestamp(ctx.now);
    const newFilename = await getUniqueFilename(ctx.destImagesDir, baseName, ext);
    const newPath = path3.join(ctx.destImagesDir, newFilename);
    await fs3.copyFile(p, newPath);
    return { newLink: `/${ctx.imagesSubDir}/${newFilename}` };
  }
  return null;
};

// POST /parse-local
// Copies a local HTML file (plus optional asset files) into a temp job directory,
// responds immediately, and converts the HTML to markdown in the background.
router.post(
  "/parse-local",
  asyncHandler(async (req, res) => {
    const { htmlPath, htmlDir, assetsDirName, assetsFiles, targetPath } = req.body;
    if (!htmlPath || !htmlDir || !targetPath) {
      throw new ValidationError("htmlPath, htmlDir and targetPath are required");
    }
    const fullTargetPath = resolveNotebookPath(targetPath).fullPath;
    const scriptPath = getScriptPath("blog", "parse_blog.py");
    if (!ensureScriptExists(scriptPath)) {
      throw new InternalError("Parser script not found");
    }
    const jobContext = await createJobContext("blog");
    let htmlPathInJob = "";
    try {
      htmlPathInJob = path3.join(jobContext.jobDir, "input.html");
      await fs3.copyFile(htmlPath, htmlPathInJob);
      if (assetsDirName && Array.isArray(assetsFiles) && assetsFiles.length > 0) {
        const assetsDirPath = path3.join(htmlDir, assetsDirName);
        for (const relPath of assetsFiles) {
          if (typeof relPath !== "string") {
            throw new ValidationError("assetsFiles must contain only relative file paths");
          }
          const destPath = path3.join(jobContext.jobDir, assetsDirName, relPath);
          // Request-supplied names must never make us write outside the job directory.
          if (!isPathInside(jobContext.jobDir, destPath)) {
            throw new ValidationError("assetsDirName and assetsFiles must not escape the job directory");
          }
          const srcPath = path3.join(assetsDirPath, relPath);
          if (existsSync2(srcPath)) {
            await fs3.mkdir(path3.dirname(destPath), { recursive: true });
            await fs3.copyFile(srcPath, destPath);
          }
        }
      }
    } catch (err) {
      await cleanupJob(jobContext.jobDir);
      throw err;
    }
    processHtmlInBackground({
      jobDir: jobContext.jobDir,
      htmlPath: htmlPathInJob,
      targetPath: fullTargetPath,
      cwd: path3.dirname(scriptPath),
      jobContext,
      originalHtmlDir: htmlDir,
      originalAssetsDirName: assetsDirName
    }).catch((err) => {
      logger.error("Background HTML processing failed:", err);
      // Surface the failure to the user by writing it into the target note.
      // NOTE(review): the heading and the "> ..." part are separated by a single space;
      // this looks like it may have been line breaks originally - confirm before changing.
      fs3.writeFile(fullTargetPath, `# \u89E3\u6790\u5931\u8D25 > \u9519\u8BEF\u4FE1\u606F: ${err.message}`, "utf-8").catch(() => {
      });
      cleanupJob(jobContext.jobDir).catch(() => {
      });
    });
    successResponse(res, {
      message: "HTML parsing started in background.",
      status: "processing"
    });
  })
);

/**
 * Converts the staged HTML file to markdown, localises its images and writes
 * the result to `targetPath`. The job directory is always removed afterwards.
 *
 * Image handling per markdown image destination:
 *  - http(s) URLs are downloaded (best effort),
 *  - `data:` URIs are left untouched,
 *  - everything else is looked up in the job directory, then in the original
 *    assets directory.
 *
 * @param {{jobDir: string, htmlPath: string, targetPath: string, cwd: string,
 *   jobContext: object, originalHtmlDir?: string, originalAssetsDirName?: string}} args
 * @returns {Promise<void>}
 */
async function processHtmlInBackground(args) {
  const { jobDir, htmlPath, targetPath, cwd, jobContext, originalHtmlDir, originalAssetsDirName } = args;
  try {
    // The script name is relative; it is resolved against `cwd` (the script's directory).
    await spawnPythonScript({
      scriptPath: "parse_blog.py",
      args: [htmlPath],
      cwd
    });
    // The markdown is expected next to the input HTML, with the same base name.
    const parsedPathObj = path3.parse(htmlPath);
    const markdownPath = path3.join(parsedPathObj.dir, `${parsedPathObj.name}.md`);
    if (!existsSync2(markdownPath)) {
      throw new Error("Markdown result file not found");
    }
    let mdContent = await fs3.readFile(markdownPath, "utf-8");
    const ctx = await jobContext;
    const htmlDir = path3.dirname(htmlPath);
    const replacements = [];
    for (const dest of findImageDestinations(mdContent)) {
      const originalSrc = dest.url;
      if (!originalSrc) continue;
      if (originalSrc.startsWith("http://") || originalSrc.startsWith("https://")) {
        const newLink = await downloadRemoteImage(originalSrc, ctx);
        if (newLink) {
          replacements.push({
            start: dest.start,
            end: dest.end,
            original: originalSrc,
            replacement: newLink
          });
        }
        continue;
      }
      if (originalSrc.startsWith("data:")) continue;
      let result = await copyLocalImage(
        originalSrc,
        jobDir,
        htmlDir,
        ctx.destImagesDir,
        ctx.imagesSubDir,
        ctx.now
      );
      if (!result && originalHtmlDir && originalAssetsDirName) {
        result = await copyFromOriginalAssets(originalSrc, originalHtmlDir, originalAssetsDirName, ctx);
      }
      if (result) {
        replacements.push({
          start: dest.start,
          end: dest.end,
          original: originalSrc,
          replacement: result.newLink
        });
      }
    }
    mdContent = applyReplacements(mdContent, replacements);
    await fs3.writeFile(targetPath, mdContent, "utf-8");
    await fs3.unlink(markdownPath).catch(() => {
    });
  } finally {
    await cleanupJob(jobDir);
  }
}
var blogRoutes_default = router;

// api/modules/document-parser/mineruRoutes.ts
import express2 from "express";
import multer from "multer";
import path4 from "path";
import fs4 from "fs/promises";
import { existsSync as existsSync3 } from "fs";
var router2 = express2.Router();
var tempDir2 = getTempDir();
// Uploads are stored in the temp directory; files above 50 MiB are rejected by multer.
var upload = multer({
  dest: tempDir2,
  limits: { fileSize: 50 * 1024 * 1024 }
});

// POST /parse
// Accepts an uploaded file, responds immediately, and parses it in the background.
router2.post(
  "/parse",
  upload.single("file"),
  asyncHandler(async (req, res) => {
    if (!req.file) {
      throw new ValidationError("File is required");
    }
    const uploadedPath = req.file.path;
    let fullTargetPath;
    let scriptPath;
    try {
      const { targetPath } = req.body;
      if (!targetPath) {
        throw new ValidationError("Target path is required");
      }
      fullTargetPath = resolveNotebookPath(targetPath).fullPath;
      scriptPath = getScriptPath("mineru", "mineru_parser.py");
      if (!ensureScriptExists(scriptPath)) {
        throw new InternalError("Parser script not found");
      }
    } catch (error) {
      // The request is rejected, so the uploaded temp file must not be left behind.
      await fs4.unlink(uploadedPath).catch(() => {
      });
      throw error;
    }
    processPdfInBackground(uploadedPath, fullTargetPath, path4.dirname(scriptPath)).catch((err) => {
      logger.error("Background PDF processing failed:", err);
      // NOTE(review): same single-space separator as in the blog route's failure note - confirm.
      fs4.writeFile(fullTargetPath, `# \u89E3\u6790\u5931\u8D25 > \u9519\u8BEF\u4FE1\u606F: ${err.message}`, "utf-8").catch(() => {
      });
    });
    successResponse(res, {
      message: "PDF upload successful. Parsing started in background.",
      status: "processing"
    });
  })
);

/**
 * Runs the parser script on an uploaded file, moves referenced images into the
 * notebook image directory and writes the resulting markdown to `targetPath`.
 *
 * The script is expected to print a line `JSON_RESULT:<json>` whose JSON has
 * `markdown_file` and `output_dir` fields. The uploaded file and the temporary
 * image-job directory are always removed afterwards.
 *
 * @param {string} filePath - Path of the uploaded file.
 * @param {string} targetPath - Absolute path the markdown is written to.
 * @param {string} cwd - Working directory for the script (its own directory).
 * @returns {Promise<void>}
 */
async function processPdfInBackground(filePath, targetPath, cwd) {
  let imageJobDir = null;
  try {
    const output = await spawnPythonScript({
      scriptPath: "mineru_parser.py",
      args: [filePath],
      cwd
    });
    const match = output.match(/JSON_RESULT:(.*)/);
    if (!match) {
      throw new Error("Failed to parse Python script output: JSON_RESULT not found");
    }
    const result = JSON.parse(match[1]);
    const markdownPath = result.markdown_file;
    const outputDir = result.output_dir;
    if (!existsSync3(markdownPath)) {
      throw new Error("Markdown result file not found");
    }
    let mdContent = await fs4.readFile(markdownPath, "utf-8");
    const imagesDir = path4.join(outputDir, "images");
    if (existsSync3(imagesDir)) {
      const jobContext = await createJobContext("pdf_images");
      // createJobContext also creates a temp job directory; remember it so it gets removed.
      imageJobDir = jobContext.jobDir;
      const replacements = [];
      for (const dest of findImageDestinations(mdContent)) {
        const originalSrc = dest.url;
        if (!originalSrc) continue;
        // Lookup order: images/<src>, <outputDir>/<src>, images/<basename(src)>.
        const candidates = [
          path4.join(imagesDir, originalSrc),
          path4.join(outputDir, originalSrc),
          path4.join(imagesDir, path4.basename(originalSrc))
        ];
        // Only files inside the parser's output directory may be copied into the notebook.
        const foundFile = candidates.find(
          (candidate) => isPathInside(outputDir, candidate) && existsSync3(candidate)
        );
        if (!foundFile) continue;
        const ext = path4.extname(foundFile);
        const baseName = formatTimestamp(jobContext.now);
        const newFilename = await getUniqueFilename(jobContext.destImagesDir, baseName, ext);
        const newPath = path4.join(jobContext.destImagesDir, newFilename);
        await fs4.copyFile(foundFile, newPath);
        replacements.push({
          start: dest.start,
          end: dest.end,
          original: originalSrc,
          // NOTE(review): no leading "/" here, unlike the links produced for blog images - confirm intended.
          replacement: `${jobContext.imagesSubDir}/${newFilename}`
        });
      }
      mdContent = applyReplacements(mdContent, replacements);
    }
    await fs4.writeFile(targetPath, mdContent, "utf-8");
    await fs4.unlink(markdownPath).catch(() => {
    });
    // NOTE(review): the output directory is removed only when its path contains "temp";
    // presumably a guard against deleting a non-temporary directory - confirm.
    if (outputDir && outputDir.includes("temp")) {
      await fs4.rm(outputDir, { recursive: true, force: true }).catch(() => {
      });
    }
  } finally {
    await fs4.unlink(filePath).catch(() => {
    });
    if (imageJobDir) {
      await cleanupJob(imageJobDir);
    }
  }
}
var mineruRoutes_default = router2;

// api/modules/document-parser/index.ts
/**
 * Builds the document-parser API module, mounting the blog routes under
 * `/blog` and the PDF routes under `/mineru`.
 */
var createDocumentParserModule = () => {
  return createApiModule(DOCUMENT_PARSER_MODULE, {
    routes: (_container) => {
      const router3 = express3.Router();
      router3.use("/blog", blogRoutes_default);
      router3.use("/mineru", mineruRoutes_default);
      return router3;
    }
  });
};
var document_parser_default = createDocumentParserModule;

export {
  pad2,
  formatTimestamp,
  getUniqueFilename,
  mimeToExt,
  validateImageBuffer,
  detectImageMimeType,
  createJobContext,
  spawnPythonScript,
  findImageDestinations,
  applyReplacements,
  copyLocalImage,
  cleanupJob,
  getScriptPath,
  ensureScriptExists,
  blogRoutes_default,
  mineruRoutes_default,
  createDocumentParserModule,
  document_parser_default
};