import express, { type Request, type Response } from 'express'
import multer from 'multer'
import path from 'path'
import fs from 'fs/promises'
import { existsSync } from 'fs'
import { asyncHandler } from '../../utils/asyncHandler.js'
import { successResponse } from '../../utils/response.js'
import { resolveNotebookPath } from '../../utils/pathSafety.js'
import { getUniqueFilename } from '../../utils/file.js'
import { formatTimestamp } from '../../../shared/utils/date.js'
import { getTempDir } from '../../utils/tempDir.js'
import {
  createJobContext,
  spawnPythonScript,
  findImageDestinations,
  applyReplacements,
  cleanupJob,
  getScriptPath,
  ensureScriptExists,
} from './documentParser.js'
import type { ImageReplacement } from './documentParser.js'
import { ValidationError, InternalError } from '../../../shared/errors/index.js'
import { logger } from '../../utils/logger.js'

const router = express.Router()

// Uploads are written into the shared temp directory and capped at 50 MiB per file.
const tempDir = getTempDir()
const upload = multer({ dest: tempDir, limits: { fileSize: 50 * 1024 * 1024 } })

/**
 * Best-effort removal of an uploaded temp file.
 * Failures are ignored on purpose: the file may already be gone, and a cleanup
 * problem must not mask the error that is being reported to the client.
 */
async function discardUpload(uploadPath: string): Promise<void> {
  await fs.unlink(uploadPath).catch(() => {})
}

/**
 * POST /parse
 *
 * Accepts a multipart upload (field `file`) together with a `targetPath` body
 * field, validates the request, then starts the parser in the background and
 * responds immediately with a `processing` status. The parsed markdown (or a
 * failure notice) is written to the resolved target path later.
 */
router.post(
  '/parse',
  upload.single('file'),
  asyncHandler(async (req: Request, res: Response) => {
    if (!req.file) {
      throw new ValidationError('File is required')
    }
    const uploadedPath = req.file.path

    // Multipart bodies can carry non-string values for a field, so check the
    // runtime type instead of trusting a type assertion.
    const { targetPath } = (req.body ?? {}) as { targetPath?: unknown }
    if (typeof targetPath !== 'string' || !targetPath) {
      await discardUpload(uploadedPath)
      throw new ValidationError('Target path is required')
    }

    let fullTargetPath: string
    try {
      fullTargetPath = resolveNotebookPath(targetPath).fullPath
    } catch (error) {
      // Resolution failed: drop the upload before propagating the error.
      await discardUpload(uploadedPath)
      throw error
    }

    const scriptPath = getScriptPath('mineru', 'mineru_parser.py')
    if (!ensureScriptExists(scriptPath)) {
      await discardUpload(uploadedPath)
      throw new InternalError('Parser script not found')
    }

    // Fire-and-forget: the response does not wait for parsing to finish.
    void processPdfInBackground(uploadedPath, fullTargetPath, path.dirname(scriptPath)).catch(
      (err: unknown) => {
        logger.error('Background PDF processing failed:', err)
        // A rejection value is not guaranteed to be an Error instance.
        const reason = err instanceof Error ? err.message : String(err)
        // Leave a failure notice at the target so the outcome is visible there.
        fs.writeFile(fullTargetPath, `# 解析失败\n\n> 错误信息: ${reason}`, 'utf-8').catch(
          (writeErr: unknown) => {
            logger.error('Failed to write PDF failure notice:', writeErr)
          },
        )
      },
    )

    successResponse(res, {
      message: 'PDF upload successful. Parsing started in background.',
      status: 'processing',
    })
  }),
)

/**
 * Extracts the two fields this module reads from the parser's JSON payload.
 * A field that is absent or not a string is reported as `undefined`.
 *
 * @throws SyntaxError if `rawJson` is not valid JSON.
 */
function parseParserResult(rawJson: string): {
  markdownFile: string | undefined
  outputDir: string | undefined
} {
  const parsed: unknown = JSON.parse(rawJson)
  if (typeof parsed !== 'object' || parsed === null) {
    return { markdownFile: undefined, outputDir: undefined }
  }
  const { markdown_file: markdownFile, output_dir: outputDir } = parsed as {
    markdown_file?: unknown
    output_dir?: unknown
  }
  return {
    markdownFile: typeof markdownFile === 'string' ? markdownFile : undefined,
    outputDir: typeof outputDir === 'string' ? outputDir : undefined,
  }
}

/**
 * Finds the on-disk file for an image URL referenced by the parsed markdown.
 *
 * Lookup order: `<imagesDir>/<src>`, `<outputDir>/<src>`, `<imagesDir>/<basename(src)>`.
 * A candidate is accepted only if it resolves to a location inside `outputDir`
 * and is a regular file. The URL comes from markdown generated out of an
 * uploaded document, so a reference such as `../../secret` must not be able to
 * pull in files from elsewhere on the host.
 *
 * @returns the matching path, or `null` when no candidate qualifies.
 */
async function locateParsedImage(
  outputDir: string,
  imagesDir: string,
  src: string,
): Promise<string | null> {
  const root = path.resolve(outputDir)
  const candidates = [
    path.join(imagesDir, src),
    path.join(outputDir, src),
    path.join(imagesDir, path.basename(src)),
  ]

  for (const candidate of candidates) {
    const relative = path.relative(root, path.resolve(candidate))
    const escapesRoot =
      relative === '' ||
      relative === '..' ||
      relative.startsWith(`..${path.sep}`) ||
      path.isAbsolute(relative)
    if (escapesRoot) continue

    // An existing directory is not copyable as an image, so require a regular file.
    const isFile = await fs.stat(candidate).then(
      stats => stats.isFile(),
      () => false,
    )
    if (isFile) return candidate
  }
  return null
}

/**
 * Copies every image referenced by `mdContent` that can be found under
 * `outputDir` into the job's destination images directory and rewrites the
 * references to point at the copies. References that cannot be located are
 * left untouched. When `<outputDir>/images` does not exist the content is
 * returned unchanged.
 */
async function relocateImages(mdContent: string, outputDir: string): Promise<string> {
  const imagesDir = path.join(outputDir, 'images')
  if (!existsSync(imagesDir)) return mdContent

  const jobContext = await createJobContext('pdf_images')
  const replacements: ImageReplacement[] = []

  for (const dest of findImageDestinations(mdContent)) {
    const originalSrc = dest.url
    if (!originalSrc) continue

    const sourceFile = await locateParsedImage(outputDir, imagesDir, originalSrc)
    if (!sourceFile) continue

    // Every image shares the job timestamp as its base name; getUniqueFilename
    // is relied on to pick a non-colliding name (assumption, verify in file.js).
    const baseName = formatTimestamp(jobContext.now)
    const newFilename = await getUniqueFilename(
      jobContext.destImagesDir,
      baseName,
      path.extname(sourceFile),
    )
    await fs.copyFile(sourceFile, path.join(jobContext.destImagesDir, newFilename))

    replacements.push({
      start: dest.start,
      end: dest.end,
      original: originalSrc,
      replacement: `${jobContext.imagesSubDir}/${newFilename}`,
    })
  }

  return applyReplacements(mdContent, replacements)
}

/**
 * Runs the parser script on an uploaded file and writes the resulting markdown
 * to `targetPath`, relocating referenced images along the way.
 *
 * The parser is expected to print a line containing `JSON_RESULT:<json>` whose
 * payload has `markdown_file` and, optionally, `output_dir`.
 *
 * Cleanup happens in `finally`, so the uploaded file and the parser's
 * intermediate artifacts are removed whether parsing succeeds or fails.
 *
 * @param filePath   path of the uploaded temp file (always deleted on exit)
 * @param targetPath absolute path the final markdown is written to
 * @param cwd        working directory for the parser script
 * @throws Error when the parser output lacks `JSON_RESULT` or the markdown
 *         file it names does not exist; parser and filesystem errors propagate.
 */
async function processPdfInBackground(
  filePath: string,
  targetPath: string,
  cwd: string,
): Promise<void> {
  let markdownPath: string | undefined
  let outputDir: string | undefined

  try {
    const output = await spawnPythonScript({
      scriptPath: 'mineru_parser.py',
      args: [filePath],
      cwd,
    })

    const rawJson = output.match(/JSON_RESULT:(.*)/)?.[1]
    if (rawJson === undefined) {
      throw new Error('Failed to parse Python script output: JSON_RESULT not found')
    }

    const result = parseParserResult(rawJson)
    markdownPath = result.markdownFile
    outputDir = result.outputDir

    if (!markdownPath || !existsSync(markdownPath)) {
      throw new Error('Markdown result file not found')
    }

    let mdContent = await fs.readFile(markdownPath, 'utf-8')
    // Without an output directory there is nowhere to look for images.
    if (outputDir) {
      mdContent = await relocateImages(mdContent, outputDir)
    }

    await fs.writeFile(targetPath, mdContent, 'utf-8')
  } finally {
    if (markdownPath) {
      await fs.unlink(markdownPath).catch(() => {})
    }
    // NOTE(review): the substring check is only a coarse guard before a
    // recursive delete; consider verifying containment in a known temp root.
    if (outputDir && outputDir.includes('temp')) {
      await fs.rm(outputDir, { recursive: true, force: true }).catch(() => {})
    }
    await fs.unlink(filePath).catch(() => {})
  }
}

export default router