// Source: XCDesktop/dist-api/chunk-R5LQJNQE.js (587 lines, 19 KiB, JavaScript)

import {
logger
} from "./chunk-47DJ6YUB.js";
import {
getTempDir
} from "./chunk-FTVFWJFJ.js";
import {
InternalError,
ValidationError,
resolveNotebookPath
} from "./chunk-ER4KPD22.js";
import {
NOTEBOOK_ROOT,
PROJECT_ROOT,
TEMP_ROOT,
asyncHandler,
createApiModule,
defineApiModule,
successResponse
} from "./chunk-74TMTGBG.js";
// api/modules/document-parser/index.ts
import express3 from "express";
// shared/modules/document-parser/index.ts
// Descriptor for the document-parser API module, built with defineApiModule.
// Only the backend half is enabled; the frontend flag is explicitly false.
var DOCUMENT_PARSER_MODULE = defineApiModule({
id: "document-parser",
name: "Document Parser",
basePath: "/document-parser",
// NOTE(review): `order` presumably controls module load/mount ordering — confirm in defineApiModule.
order: 60,
version: "1.0.0",
frontend: {
enabled: false
},
backend: {
enabled: true
}
});
// api/modules/document-parser/blogRoutes.ts
import express from "express";
import path3 from "path";
import fs3 from "fs/promises";
import { existsSync as existsSync2 } from "fs";
import axios from "axios";
// api/utils/file.ts
import fs from "fs/promises";
import path from "path";
/**
 * Finds a filename inside `imagesDirFullPath` that is not currently taken.
 *
 * The first candidate is `${baseName}${ext}`; later candidates append a numeric
 * suffix that starts at `-2` (`name-2.ext`, `name-3.ext`, ...). A candidate is
 * treated as free as soon as `fs.access` rejects for it, whatever the reason.
 *
 * @param {string} imagesDirFullPath - Directory the file is going to live in.
 * @param {string} baseName - Filename without extension.
 * @param {string} ext - Extension, including the leading dot.
 * @returns {Promise<string>} The first free filename (a bare name, not a full path).
 * @throws {InternalError} When all 1000 candidates are already taken.
 */
var getUniqueFilename = async (imagesDirFullPath, baseName, ext) => {
  const attemptLimit = 1e3;
  let attempt = 0;
  while (attempt < attemptLimit) {
    const candidate = attempt === 0 ? `${baseName}${ext}` : `${baseName}-${attempt + 1}${ext}`;
    const candidatePath = path.join(imagesDirFullPath, candidate);
    // access() resolving means the path exists; rejecting means the name is usable.
    const isTaken = await fs.access(candidatePath).then(
      () => true,
      () => false
    );
    if (!isTaken) {
      return candidate;
    }
    attempt += 1;
  }
  throw new InternalError("Failed to generate unique filename");
};
// Maps an image MIME type to the file extension (leading dot included) used when saving it.
// "image/jpg" and "image/jpeg" both map to ".jpg".
var mimeToExt = {
"image/png": ".png",
"image/jpeg": ".jpg",
"image/jpg": ".jpg",
"image/gif": ".gif",
"image/webp": ".webp"
};
// Leading signature bytes expected for each supported MIME type.
// `offset` (default 0) is where the signature starts inside the buffer.
var IMAGE_MAGIC_BYTES = {
  "image/png": { bytes: [137, 80, 78, 71, 13, 10, 26, 10] },
  "image/jpeg": { bytes: [255, 216, 255] },
  // Alias kept in step with mimeToExt, which also accepts "image/jpg".
  "image/jpg": { bytes: [255, 216, 255] },
  "image/gif": { bytes: [71, 73, 70, 56] },
  "image/webp": { bytes: [82, 73, 70, 70], offset: 0 }
};
// ASCII "WEBP", expected at byte 8 of a RIFF container holding a WebP image.
var WEBP_WEBP_MARKER = [87, 69, 66, 80];
var MIN_IMAGE_SIZE = 16;
var MAX_IMAGE_SIZE = 8 * 1024 * 1024;
/**
 * Checks that `buffer` has a plausible size and starts with the signature
 * bytes of the MIME type the caller claims it has.
 *
 * @param {Uint8Array|Buffer} buffer - Image bytes; must expose `byteLength` and numeric indexing.
 * @param {string} claimedMimeType - MIME type to validate against (a key of IMAGE_MAGIC_BYTES).
 * @returns {void}
 * @throws {ValidationError} When the size is out of range, the type is not
 *   supported, or the leading bytes do not match the claimed type.
 */
var validateImageBuffer = (buffer, claimedMimeType) => {
  if (buffer.byteLength < MIN_IMAGE_SIZE) {
    throw new ValidationError("Image file is too small or corrupted");
  }
  if (buffer.byteLength > MAX_IMAGE_SIZE) {
    throw new ValidationError("Image file is too large");
  }
  // Own-property lookup only: inherited keys such as "constructor" or "__proto__"
  // must be reported as unsupported instead of crashing on a missing `bytes` array.
  const isKnownType = Object.prototype.hasOwnProperty.call(IMAGE_MAGIC_BYTES, claimedMimeType);
  const magicInfo = isKnownType ? IMAGE_MAGIC_BYTES[claimedMimeType] : void 0;
  if (!magicInfo) {
    throw new ValidationError("Unsupported image type for content validation");
  }
  const offset = magicInfo.offset || 0;
  const signatureMismatch = magicInfo.bytes.some((expected, i) => buffer[offset + i] !== expected);
  if (signatureMismatch) {
    throw new ValidationError("Image content does not match the claimed file type");
  }
  if (claimedMimeType === "image/webp") {
    // RIFF is a generic container; the "WEBP" marker at byte 8 tells WebP apart.
    if (buffer.byteLength < 12) {
      throw new ValidationError("WebP image is corrupted");
    }
    const markerMismatch = WEBP_WEBP_MARKER.some((expected, i) => buffer[8 + i] !== expected);
    if (markerMismatch) {
      throw new ValidationError("WebP image content is invalid");
    }
  }
};
/**
 * Sniffs the image type from the leading bytes of `buffer`.
 *
 * Signatures are tried in the order PNG, JPEG, GIF, WebP. Buffers shorter than
 * 8 bytes are never classified, even if they start with a shorter signature.
 *
 * @param {Uint8Array|Buffer} buffer - Image bytes; must expose `byteLength` and numeric indexing.
 * @returns {string|null} "image/png", "image/jpeg", "image/gif", "image/webp", or null when nothing matches.
 */
var detectImageMimeType = (buffer) => {
  if (buffer.byteLength < 8) return null;
  // True when every signature byte matches the buffer starting at `offset`.
  const matchesAt = (offset, signature) => signature.every((byte, idx) => buffer[offset + idx] === byte);
  if (matchesAt(0, [137, 80, 78, 71, 13, 10, 26, 10])) {
    return "image/png";
  }
  if (matchesAt(0, [255, 216, 255])) {
    return "image/jpeg";
  }
  if (matchesAt(0, [71, 73, 70, 56])) {
    return "image/gif";
  }
  // WebP: "RIFF" at byte 0 and "WEBP" at byte 8.
  if (matchesAt(0, [82, 73, 70, 70]) && matchesAt(8, [87, 69, 66, 80])) {
    return "image/webp";
  }
  return null;
};
// shared/utils/date.ts
/** Left-pads the string form of `n` with zeros to at least 2 characters. */
var pad2 = (n) => {
  return String(n).padStart(2, "0");
};
/** Left-pads the string form of `n` with zeros to at least 3 characters. */
var pad3 = (n) => {
  return String(n).padStart(3, "0");
};
/**
 * Formats a date as `YYYYMMDD_HHMMSS_mmm` using the local-time getters of `d`.
 *
 * @param {Date} d - Date to format.
 * @returns {string} Timestamp string, e.g. "20240105_030409_007".
 */
var formatTimestamp = (d) => {
  const datePart = [d.getFullYear(), pad2(d.getMonth() + 1), pad2(d.getDate())].join("");
  const timePart = [pad2(d.getHours()), pad2(d.getMinutes()), pad2(d.getSeconds())].join("");
  const millisPart = pad3(d.getMilliseconds());
  return [datePart, timePart, millisPart].join("_");
};
// api/modules/document-parser/documentParser.ts
import path2 from "path";
import { spawn } from "child_process";
import fs2 from "fs/promises";
import { existsSync, mkdirSync } from "fs";
// Runs at module load: make sure the shared temp root exists, creating any
// missing parent directories along the way.
if (existsSync(TEMP_ROOT) === false) {
  mkdirSync(TEMP_ROOT, { recursive: true });
}
/**
 * Prepares the directories used by one parsing job.
 *
 * Creates a scratch directory `<TEMP_ROOT>/<prefix>_<timestamp>` and a dated
 * image directory `<NOTEBOOK_ROOT>/images/<YYYY>/<MM>/<DD>`; both are created
 * recursively and both use the same `now` instant.
 *
 * @param {string} prefix - Label placed in front of the timestamp in the scratch directory name.
 * @returns {Promise<{jobDir: string, now: Date, imagesSubDir: string, destImagesDir: string}>}
 *   `imagesSubDir` is the "/"-separated path relative to NOTEBOOK_ROOT;
 *   `destImagesDir` is that same directory joined onto NOTEBOOK_ROOT.
 */
var createJobContext = async (prefix) => {
  const now = new Date();
  const jobDirName = `${prefix}_${formatTimestamp(now)}`;
  const jobDir = path2.join(TEMP_ROOT, jobDirName);
  await fs2.mkdir(jobDir, { recursive: true });
  // Always "/"-separated, independent of the platform path separator.
  const imagesSubDir = ["images", now.getFullYear(), pad2(now.getMonth() + 1), pad2(now.getDate())].join("/");
  const destImagesDir = path2.join(NOTEBOOK_ROOT, imagesSubDir);
  await fs2.mkdir(destImagesDir, { recursive: true });
  return { jobDir, now, imagesSubDir, destImagesDir };
};
/**
 * Runs `python -X utf8 <scriptPath> ...args` and collects its output.
 *
 * @param {object} options
 * @param {string} options.scriptPath - Script to run (resolved by Python relative to `cwd`).
 * @param {string[]} options.args - Extra command-line arguments passed after the script.
 * @param {string} [options.cwd] - Working directory of the child process.
 * @param {string|Buffer} [options.inputContent] - When not undefined, written to the child's stdin, which is then closed.
 * @returns {Promise<string>} Everything the child wrote to stdout, decoded as UTF-8.
 * @throws {Error} Rejects when the process cannot be spawned or exits with a
 *   non-zero code (the message then includes the captured stderr).
 */
var spawnPythonScript = async (options) => {
  const { scriptPath, args, cwd, inputContent } = options;
  return new Promise((resolve, reject) => {
    const pythonProcess = spawn("python", ["-X", "utf8", scriptPath, ...args], {
      cwd,
      env: { ...process.env, PYTHONIOENCODING: "utf-8", PYTHONUTF8: "1" }
    });
    // Decode through the stream's own decoder so a multi-byte UTF-8 character
    // split across two pipe chunks is not turned into replacement characters.
    pythonProcess.stdout.setEncoding("utf8");
    pythonProcess.stderr.setEncoding("utf8");
    let stdout = "";
    let stderr = "";
    pythonProcess.stdout.on("data", (chunk) => {
      stdout += chunk;
    });
    pythonProcess.stderr.on("data", (chunk) => {
      stderr += chunk;
    });
    pythonProcess.on("close", (code) => {
      if (code !== 0) {
        logger.error("Python script error:", stderr);
        reject(new Error(`Process exited with code ${code}. Error: ${stderr}`));
      } else {
        resolve(stdout);
      }
    });
    pythonProcess.on("error", (err) => {
      reject(err);
    });
    if (inputContent !== void 0) {
      // Without a listener, an EPIPE from a child that exits before reading
      // stdin would surface as an uncaught exception. EPIPE itself is ignored
      // here because the "close" handler reports the real outcome.
      pythonProcess.stdin.on("error", (err) => {
        if (err.code !== "EPIPE") {
          reject(err);
        }
      });
      pythonProcess.stdin.write(inputContent);
      pythonProcess.stdin.end();
    }
  });
};
/**
 * Locates the destination part of every Markdown image (`![alt](destination)`).
 *
 * Parentheses inside the destination are balanced, so `![a](f(1).png)` yields
 * `f(1).png`. Scanning stops at the first image whose `]` or closing `)` is
 * missing. A `![...]` that is not immediately followed by `(` is skipped.
 *
 * @param {string} md - Markdown text.
 * @returns {{url: string, start: number, end: number}[]} One entry per image;
 *   `start`/`end` delimit the destination inside `md` (`end` is the index of the closing `)`).
 */
var findImageDestinations = (md) => {
  const found = [];
  let cursor = 0;
  while (cursor < md.length) {
    const marker = md.indexOf("![", cursor);
    if (marker === -1) {
      return found;
    }
    const altEnd = md.indexOf("]", marker + 2);
    if (altEnd === -1) {
      return found;
    }
    if (md[altEnd + 1] !== "(") {
      // Not an inline image; resume right after the bracket.
      cursor = altEnd + 1;
      continue;
    }
    const urlStart = altEnd + 2;
    let openParens = 1;
    let pos = urlStart;
    while (pos < md.length) {
      const ch = md[pos];
      if (ch === ")") {
        openParens -= 1;
        if (openParens === 0) {
          break;
        }
      } else if (ch === "(") {
        openParens += 1;
      }
      pos += 1;
    }
    if (openParens !== 0) {
      // Ran off the end of the text without closing the destination.
      return found;
    }
    found.push({ url: md.slice(urlStart, pos), start: urlStart, end: pos });
    cursor = pos + 1;
  }
  return found;
};
/**
 * Applies a set of range replacements to `md`.
 *
 * Replacements are applied from the highest `start` to the lowest, so offsets
 * of the ranges still to be processed stay valid. The input array is not mutated.
 *
 * @param {string} md - Text to patch.
 * @param {{start: number, end: number, replacement: string}[]} replacements -
 *   Ranges (`end` exclusive) with the text that replaces them.
 * @returns {string} The patched text.
 */
var applyReplacements = (md, replacements) => {
  const lastToFirst = Array.from(replacements).sort((left, right) => right.start - left.start);
  return lastToFirst.reduce(
    (text, { start, end, replacement }) => `${text.slice(0, start)}${replacement}${text.slice(end)}`,
    md
  );
};
/**
 * Copies a locally referenced image into the notebook image directory.
 *
 * The reference is trimmed, stripped of a leading `<` / trailing `>`,
 * URI-decoded when possible, and normalised to forward slashes. References
 * starting with "/" are looked up under `jobDir` first and then `htmlDir`;
 * all others are resolved against `htmlDir` first and then `jobDir`.
 *
 * NOTE(review): the reference is not confined to those directories — `..`
 * segments can resolve to a file outside them. Confirm callers only pass
 * trusted Markdown.
 *
 * @param {string} src - Image destination as written in the Markdown.
 * @param {string} jobDir - Scratch directory of the current job.
 * @param {string} htmlDir - Directory of the HTML file being converted.
 * @param {string} destImagesDir - Directory the image is copied into.
 * @param {string} imagesSubDir - Path of `destImagesDir` used to build the returned link.
 * @param {Date} now - Instant used to derive the new filename.
 * @returns {Promise<{newLink: string}|null>} The link to the copy, or null
 *   when the reference is empty or no candidate file exists.
 */
var copyLocalImage = async (src, jobDir, htmlDir, destImagesDir, imagesSubDir, now) => {
  const unwrapped = src.trim().replace(/^<|>$/g, "");
  if (unwrapped === "") {
    return null;
  }
  let decoded;
  try {
    decoded = decodeURI(unwrapped);
  } catch {
    // Malformed escape sequence: fall back to the raw reference.
    decoded = unwrapped;
  }
  const forwardSlashed = decoded.replace(/\\/g, "/");
  const reference = forwardSlashed.startsWith("./") ? forwardSlashed.slice(2) : forwardSlashed;
  let candidates;
  if (reference.startsWith("/")) {
    const rootless = reference.slice(1);
    candidates = [path2.join(jobDir, rootless), path2.join(htmlDir, rootless)];
  } else {
    candidates = [path2.resolve(htmlDir, reference), path2.resolve(jobDir, reference)];
  }
  const sourceFile = candidates.find((candidate) => existsSync(candidate));
  if (!sourceFile) {
    return null;
  }
  const ext = path2.extname(sourceFile) || ".jpg";
  const newFilename = await getUniqueFilename(destImagesDir, formatTimestamp(now), ext);
  await fs2.copyFile(sourceFile, path2.join(destImagesDir, newFilename));
  return { newLink: `/${imagesSubDir}/${newFilename}` };
};
/**
 * Best-effort removal of a job's scratch directory and any extra files.
 *
 * Every failure is ignored, so the returned promise never rejects because a
 * path is missing or cannot be removed.
 *
 * @param {string} jobDir - Directory removed recursively.
 * @param {string[]} [additionalPaths=[]] - Individual files unlinked afterwards, one at a time.
 * @returns {Promise<void>}
 */
var cleanupJob = async (jobDir, additionalPaths = []) => {
  const ignoreFailure = () => {
  };
  await fs2.rm(jobDir, { recursive: true, force: true }).catch(ignoreFailure);
  for (const extraPath of additionalPaths) {
    await fs2.unlink(extraPath).catch(ignoreFailure);
  }
};
/**
 * Builds the path `<PROJECT_ROOT>/tools/<toolName>/<scriptName>`.
 *
 * @param {string} toolName - Sub-directory of `tools`.
 * @param {string} scriptName - Script filename inside that directory.
 * @returns {string} The joined path.
 */
var getScriptPath = (toolName, scriptName) => path2.join(PROJECT_ROOT, "tools", toolName, scriptName);
/**
 * Reports whether `scriptPath` exists on disk. Despite the name it only
 * checks; it neither creates anything nor throws when the path is missing.
 *
 * @param {string} scriptPath - Path to test.
 * @returns {boolean} True when the path exists.
 */
var ensureScriptExists = (scriptPath) => existsSync(scriptPath);
// api/modules/document-parser/blogRoutes.ts
// Express router that carries the blog (local HTML) parsing endpoints.
var router = express.Router();
// NOTE(review): tempDir is not read anywhere in the visible code; getTempDir() may be
// called here for a side effect — confirm before removing.
var tempDir = getTempDir();
/**
 * POST /parse-local — converts a local HTML file to Markdown in the background.
 *
 * Body fields: `htmlPath`, `htmlDir` and `targetPath` are required;
 * `assetsDirName` and `assetsFiles` (paths relative to the assets directory)
 * are optional. The HTML file and the listed assets that exist are copied into
 * a fresh job directory, processing is started without being awaited, and the
 * response is sent immediately with status "processing". If background
 * processing fails, a failure note is written to the target path instead.
 *
 * Throws ValidationError for missing fields or an asset path that would land
 * outside the job directory, and InternalError when the parser script is missing.
 */
router.post(
  "/parse-local",
  asyncHandler(async (req, res) => {
    const { htmlPath, htmlDir, assetsDirName, assetsFiles, targetPath } = req.body;
    if (!htmlPath || !htmlDir || !targetPath) {
      throw new ValidationError("htmlPath, htmlDir and targetPath are required");
    }
    const { fullPath: fullTargetPath } = resolveNotebookPath(targetPath);
    const scriptPath = getScriptPath("blog", "parse_blog.py");
    if (!ensureScriptExists(scriptPath)) {
      throw new InternalError("Parser script not found");
    }
    const jobContext = await createJobContext("blog");
    const htmlPathInJob = path3.join(jobContext.jobDir, "input.html");
    try {
      await fs3.copyFile(htmlPath, htmlPathInJob);
      // Array check: a string value would otherwise be iterated character by character.
      if (assetsDirName && Array.isArray(assetsFiles) && assetsFiles.length > 0) {
        const assetsDirPath = path3.join(htmlDir, assetsDirName);
        for (const relPath of assetsFiles) {
          const srcPath = path3.join(assetsDirPath, relPath);
          if (!existsSync2(srcPath)) {
            continue;
          }
          const destPath = path3.join(jobContext.jobDir, assetsDirName, relPath);
          // assetsDirName and relPath come from the request body; refuse any
          // combination whose ".." segments would write outside the job directory.
          const fromJobDir = path3.relative(jobContext.jobDir, destPath);
          const escapesJobDir = fromJobDir === "" || fromJobDir === ".." || fromJobDir.startsWith(`..${path3.sep}`) || path3.isAbsolute(fromJobDir);
          if (escapesJobDir) {
            throw new ValidationError("Asset path must stay inside the job directory");
          }
          await fs3.mkdir(path3.dirname(destPath), { recursive: true });
          await fs3.copyFile(srcPath, destPath);
        }
      }
    } catch (err) {
      await cleanupJob(jobContext.jobDir);
      throw err;
    }
    // Deliberately not awaited: the client gets its answer while parsing runs.
    processHtmlInBackground({
      jobDir: jobContext.jobDir,
      htmlPath: htmlPathInJob,
      targetPath: fullTargetPath,
      cwd: path3.dirname(scriptPath),
      jobContext,
      originalHtmlDir: htmlDir,
      originalAssetsDirName: assetsDirName
    }).catch((err) => {
      logger.error("Background HTML processing failed:", err);
      // Leave a failure note where the Markdown would have been written.
      fs3.writeFile(fullTargetPath, `# \u89E3\u6790\u5931\u8D25
> \u9519\u8BEF\u4FE1\u606F: ${err.message}`, "utf-8").catch((writeErr) => {
        logger.error("Failed to write HTML parse failure note:", writeErr);
      });
      cleanupJob(jobContext.jobDir).catch(() => {
      });
    });
    successResponse(res, {
      message: "HTML parsing started in background.",
      status: "processing"
    });
  })
);
/**
 * Downloads a remote image into the notebook image directory.
 *
 * The extension defaults to ".jpg", is refined from the Content-Type header,
 * and is finally overridden by the extension of the URL path when it has one.
 * Failures are logged and reported as null so the original link is kept.
 *
 * @param {string} url - http(s) URL of the image.
 * @param {{now: Date, imagesSubDir: string, destImagesDir: string}} ctx - Job context.
 * @returns {Promise<string|null>} The new notebook link, or null on failure.
 */
async function downloadRemoteImage(url, ctx) {
  try {
    const response = await axios.get(url, { responseType: "arraybuffer", timeout: 1e4 });
    const contentType = response.headers["content-type"];
    let ext = ".jpg";
    if (contentType) {
      if (contentType.includes("png")) ext = ".png";
      else if (contentType.includes("gif")) ext = ".gif";
      else if (contentType.includes("webp")) ext = ".webp";
      else if (contentType.includes("svg")) ext = ".svg";
      else if (contentType.includes("jpeg") || contentType.includes("jpg")) ext = ".jpg";
    }
    // Only the URL *path* may contribute an extension; taking it from the whole
    // URL would turn "https://example.com" into ".com" and keep "#fragment" text.
    let urlExt;
    try {
      urlExt = path3.extname(new URL(url).pathname);
    } catch {
      urlExt = path3.extname(url.split("?")[0]);
    }
    if (urlExt) ext = urlExt;
    const newFilename = await getUniqueFilename(ctx.destImagesDir, formatTimestamp(ctx.now), ext);
    await fs3.writeFile(path3.join(ctx.destImagesDir, newFilename), response.data);
    return `/${ctx.imagesSubDir}/${newFilename}`;
  } catch (err) {
    // Best effort: a failed download must not abort the whole conversion.
    logger.error(`Failed to download remote image ${url}:`, err);
    return null;
  }
}
/**
 * Fallback lookup of an image in the original (pre-copy) assets directory,
 * first by its relative path and then by its bare filename.
 *
 * @param {string} originalSrc - Image destination as written in the Markdown.
 * @param {string} originalHtmlDir - Directory of the original HTML file.
 * @param {string} originalAssetsDirName - Name of the assets directory next to it.
 * @param {{now: Date, imagesSubDir: string, destImagesDir: string}} ctx - Job context.
 * @returns {Promise<string|null>} The new notebook link, or null when nothing was found.
 */
async function copyFromOriginalAssets(originalSrc, originalHtmlDir, originalAssetsDirName, ctx) {
  const srcWithFiles = originalSrc.replace(/^\.\//, "").replace(/^\//, "");
  const possiblePaths = [
    path3.join(originalHtmlDir, originalAssetsDirName, srcWithFiles),
    path3.join(originalHtmlDir, originalAssetsDirName, path3.basename(srcWithFiles))
  ];
  const sourceFile = possiblePaths.find((candidate) => existsSync2(candidate));
  if (!sourceFile) {
    return null;
  }
  const ext = path3.extname(sourceFile) || ".jpg";
  const newFilename = await getUniqueFilename(ctx.destImagesDir, formatTimestamp(ctx.now), ext);
  await fs3.copyFile(sourceFile, path3.join(ctx.destImagesDir, newFilename));
  return `/${ctx.imagesSubDir}/${newFilename}`;
}
/**
 * Runs the blog parser on the copied HTML file, localises every image the
 * resulting Markdown references, and writes the final Markdown to `targetPath`.
 *
 * Remote (http/https) images are downloaded, `data:` URIs are left untouched,
 * and local references are copied from the job directory or, failing that,
 * from the original assets directory. Images that cannot be localised keep
 * their original link. The job directory is always removed at the end.
 *
 * @param {object} args
 * @param {string} args.jobDir - Scratch directory of this job.
 * @param {string} args.htmlPath - HTML file inside the job directory.
 * @param {string} args.targetPath - Where the final Markdown is written.
 * @param {string} args.cwd - Working directory for the parser script.
 * @param {object} args.jobContext - Result of createJobContext for this job.
 * @param {string} [args.originalHtmlDir] - Directory of the original HTML file.
 * @param {string} [args.originalAssetsDirName] - Assets directory name next to it.
 * @returns {Promise<void>}
 * @throws {Error} When the script fails or produces no Markdown file.
 */
async function processHtmlInBackground(args) {
  const { jobDir, htmlPath, targetPath, cwd, jobContext: ctx, originalHtmlDir, originalAssetsDirName } = args;
  try {
    await spawnPythonScript({
      scriptPath: "parse_blog.py",
      args: [htmlPath],
      cwd
    });
    // The script is expected to write "<name>.md" next to the input file.
    const parsedPathObj = path3.parse(htmlPath);
    const markdownPath = path3.join(parsedPathObj.dir, `${parsedPathObj.name}.md`);
    if (!existsSync2(markdownPath)) {
      throw new Error("Markdown result file not found");
    }
    const mdContent = await fs3.readFile(markdownPath, "utf-8");
    const htmlDir = path3.dirname(htmlPath);
    const replacements = [];
    for (const dest of findImageDestinations(mdContent)) {
      const originalSrc = dest.url;
      if (!originalSrc || originalSrc.startsWith("data:")) continue;
      let newLink = null;
      if (originalSrc.startsWith("http://") || originalSrc.startsWith("https://")) {
        newLink = await downloadRemoteImage(originalSrc, ctx);
      } else {
        const copied = await copyLocalImage(
          originalSrc,
          jobDir,
          htmlDir,
          ctx.destImagesDir,
          ctx.imagesSubDir,
          ctx.now
        );
        if (copied) {
          newLink = copied.newLink;
        } else if (originalHtmlDir && originalAssetsDirName) {
          newLink = await copyFromOriginalAssets(originalSrc, originalHtmlDir, originalAssetsDirName, ctx);
        }
      }
      if (newLink) {
        replacements.push({
          start: dest.start,
          end: dest.end,
          original: originalSrc,
          replacement: newLink
        });
      }
    }
    await fs3.writeFile(targetPath, applyReplacements(mdContent, replacements), "utf-8");
    await fs3.unlink(markdownPath).catch(() => {
    });
  } finally {
    await cleanupJob(jobDir);
  }
}
// Alias under which the blog router is exported from this chunk and mounted at "/blog".
var blogRoutes_default = router;
// api/modules/document-parser/mineruRoutes.ts
import express2 from "express";
import multer from "multer";
import path4 from "path";
import fs4 from "fs/promises";
import { existsSync as existsSync3 } from "fs";
// Express router that carries the PDF parsing endpoint (mounted at "/mineru").
var router2 = express2.Router();
var tempDir2 = getTempDir();
// Multer upload middleware: files are stored under tempDir2 and limited to 50 MiB each.
var upload = multer({
dest: tempDir2,
limits: {
fileSize: 50 * 1024 * 1024
}
});
/**
 * POST /parse — accepts one uploaded file (form field "file") plus a
 * `targetPath` body field and starts PDF parsing in the background.
 *
 * The uploaded temp file is removed before any validation error is thrown.
 * Once processing has started the response is sent immediately with status
 * "processing"; if processing later fails, a failure note is written to the
 * target path.
 *
 * Throws ValidationError when the file or target path is missing, whatever
 * resolveNotebookPath throws for the target path, and InternalError when the
 * parser script is missing.
 */
router2.post(
  "/parse",
  upload.single("file"),
  asyncHandler(async (req, res) => {
    if (!req.file) {
      throw new ValidationError("File is required");
    }
    const uploadedPath = req.file.path;
    // Best-effort removal of the uploaded temp file on every early exit.
    const discardUpload = () => fs4.unlink(uploadedPath).catch(() => {
    });
    const { targetPath } = req.body;
    if (!targetPath) {
      await discardUpload();
      throw new ValidationError("Target path is required");
    }
    let fullTargetPath;
    try {
      fullTargetPath = resolveNotebookPath(targetPath).fullPath;
    } catch (error) {
      await discardUpload();
      throw error;
    }
    const scriptPath = getScriptPath("mineru", "mineru_parser.py");
    const scriptAvailable = ensureScriptExists(scriptPath);
    if (!scriptAvailable) {
      await discardUpload();
      throw new InternalError("Parser script not found");
    }
    // Deliberately not awaited: the client gets its answer while parsing runs.
    const background = processPdfInBackground(uploadedPath, fullTargetPath, path4.dirname(scriptPath));
    background.catch((err) => {
      logger.error("Background PDF processing failed:", err);
      fs4.writeFile(fullTargetPath, `# \u89E3\u6790\u5931\u8D25
> \u9519\u8BEF\u4FE1\u606F: ${err.message}`, "utf-8").catch(() => {
      });
    });
    successResponse(res, {
      message: "PDF upload successful. Parsing started in background.",
      status: "processing"
    });
  })
);
/**
 * Runs the MinerU parser on an uploaded file, moves the images referenced by
 * the resulting Markdown into the notebook, and writes the Markdown to
 * `targetPath`.
 *
 * The script must print a line `JSON_RESULT:<json>` whose JSON carries
 * `markdown_file` and `output_dir`. The uploaded file is always deleted at the
 * end, whether or not processing succeeded.
 *
 * @param {string} filePath - Uploaded file to parse.
 * @param {string} targetPath - Where the final Markdown is written.
 * @param {string} cwd - Working directory for the parser script.
 * @returns {Promise<void>}
 * @throws {Error} When the script fails, prints no JSON_RESULT line, or the
 *   Markdown file it names does not exist.
 */
async function processPdfInBackground(filePath, targetPath, cwd) {
  try {
    const output = await spawnPythonScript({
      scriptPath: "mineru_parser.py",
      args: [filePath],
      cwd
    });
    const match = output.match(/JSON_RESULT:(.*)/);
    if (!match) {
      throw new Error("Failed to parse Python script output: JSON_RESULT not found");
    }
    const result = JSON.parse(match[1]);
    const markdownPath = result.markdown_file;
    const outputDir = result.output_dir;
    if (!existsSync3(markdownPath)) {
      throw new Error("Markdown result file not found");
    }
    let mdContent = await fs4.readFile(markdownPath, "utf-8");
    const imagesDir = path4.join(outputDir, "images");
    if (existsSync3(imagesDir)) {
      const jobContext = await createJobContext("pdf_images");
      try {
        const replacements = [];
        for (const dest of findImageDestinations(mdContent)) {
          const originalSrc = dest.url;
          if (!originalSrc) continue;
          // Lookup order: as written under images/, as written under the output
          // directory, then by bare filename under images/.
          const candidates = [
            path4.join(imagesDir, originalSrc),
            path4.join(outputDir, originalSrc),
            path4.join(imagesDir, path4.basename(originalSrc))
          ];
          const foundFile = candidates.find((candidate) => existsSync3(candidate));
          if (!foundFile) continue;
          const ext = path4.extname(foundFile);
          const baseName = formatTimestamp(jobContext.now);
          const newFilename = await getUniqueFilename(jobContext.destImagesDir, baseName, ext);
          await fs4.copyFile(foundFile, path4.join(jobContext.destImagesDir, newFilename));
          replacements.push({
            start: dest.start,
            end: dest.end,
            original: originalSrc,
            // NOTE(review): no leading "/" here, unlike the links produced for
            // blog imports ("/images/..."). Confirm which form the notebook expects.
            replacement: `${jobContext.imagesSubDir}/${newFilename}`
          });
        }
        mdContent = applyReplacements(mdContent, replacements);
      } finally {
        // createJobContext also creates a scratch directory under TEMP_ROOT that
        // this flow never uses; remove it so it does not pile up per parsed PDF.
        await cleanupJob(jobContext.jobDir);
      }
    }
    await fs4.writeFile(targetPath, mdContent, "utf-8");
    await fs4.unlink(markdownPath).catch(() => {
    });
    // Only output directories whose path contains "temp" are deleted.
    if (outputDir && outputDir.includes("temp")) {
      await fs4.rm(outputDir, { recursive: true, force: true }).catch(() => {
      });
    }
  } finally {
    await fs4.unlink(filePath).catch(() => {
    });
  }
}
// Alias under which the PDF router is exported from this chunk and mounted at "/mineru".
var mineruRoutes_default = router2;
// api/modules/document-parser/index.ts
/**
 * Builds the document-parser API module from DOCUMENT_PARSER_MODULE.
 *
 * The `routes` factory handed to createApiModule returns a fresh router with
 * the blog routes mounted at "/blog" and the PDF routes at "/mineru"; its
 * container argument is not used.
 *
 * @returns {*} Whatever createApiModule returns for this definition.
 */
var createDocumentParserModule = () => {
  const routes = (_container) => {
    const router3 = express3.Router();
    const mounts = [
      ["/blog", blogRoutes_default],
      ["/mineru", mineruRoutes_default]
    ];
    for (const [mountPath, subRouter] of mounts) {
      router3.use(mountPath, subRouter);
    }
    return router3;
  };
  return createApiModule(DOCUMENT_PARSER_MODULE, { routes });
};
// Default export alias for the module factory.
var document_parser_default = createDocumentParserModule;
export {
pad2,
formatTimestamp,
getUniqueFilename,
mimeToExt,
validateImageBuffer,
detectImageMimeType,
createJobContext,
spawnPythonScript,
findImageDestinations,
applyReplacements,
copyLocalImage,
cleanupJob,
getScriptPath,
ensureScriptExists,
blogRoutes_default,
mineruRoutes_default,
createDocumentParserModule,
document_parser_default
};