Files
XCDesktop/api/modules/document-parser/mineruRoutes.ts

159 lines
4.8 KiB
TypeScript
Raw Normal View History

2026-03-08 01:34:54 +08:00
import express, { type Request, type Response } from 'express'
import multer from 'multer'
import path from 'path'
import fs from 'fs/promises'
import { existsSync } from 'fs'
import { asyncHandler } from '../../utils/asyncHandler.js'
import { successResponse } from '../../utils/response.js'
import { resolveNotebookPath } from '../../utils/pathSafety.js'
import { getUniqueFilename } from '../../utils/file.js'
import { formatTimestamp } from '../../../shared/utils/date.js'
import { getTempDir } from '../../utils/tempDir.js'
import {
createJobContext,
spawnPythonScript,
findImageDestinations,
applyReplacements,
cleanupJob,
getScriptPath,
ensureScriptExists,
} from './documentParser.js'
import type { ImageReplacement } from './documentParser.js'
import { ValidationError, InternalError } from '../../../shared/errors/index.js'
import { logger } from '../../utils/logger.js'
// Express router carrying the MinerU document-parsing endpoints defined in this module.
const router = express.Router()

// Directory returned by getTempDir(); multer stages uploaded files here.
const tempDir = getTempDir()

// Largest single upload multer will accept: 50 MiB.
const MAX_UPLOAD_BYTES = 50 * 1024 * 1024

// Multipart upload middleware: files are written to disk under `tempDir`.
const upload = multer({
  dest: tempDir,
  limits: { fileSize: MAX_UPLOAD_BYTES },
})
/**
 * POST /parse
 *
 * Accepts one multipart file (field `file`) plus a `targetPath` form field,
 * validates the request, then starts PDF parsing in the background and
 * responds immediately with `status: 'processing'`.
 *
 * Throws ValidationError when the file or `targetPath` is missing, rethrows
 * whatever `resolveNotebookPath` throws, and throws InternalError when the
 * parser script cannot be found. Whenever validation fails after the upload
 * was stored, the uploaded temp file is removed before the error propagates.
 */
router.post(
  '/parse',
  upload.single('file'),
  asyncHandler(async (req: Request, res: Response) => {
    if (!req.file) {
      throw new ValidationError('File is required')
    }
    const uploadedPath = req.file.path

    let fullTargetPath: string
    let scriptDir: string
    try {
      // Multipart fields are not guaranteed to be strings (a repeated field
      // arrives as an array), so check the runtime type instead of casting.
      const { targetPath } = req.body as { targetPath?: unknown }
      if (typeof targetPath !== 'string' || targetPath.length === 0) {
        throw new ValidationError('Target path is required')
      }

      fullTargetPath = resolveNotebookPath(targetPath).fullPath

      const scriptPath = getScriptPath('mineru', 'mineru_parser.py')
      if (!ensureScriptExists(scriptPath)) {
        throw new InternalError('Parser script not found')
      }
      scriptDir = path.dirname(scriptPath)
    } catch (error) {
      // Single cleanup point: any failure before the background job takes
      // ownership of the upload must not leave the temp file behind.
      // Best-effort: the file may already be gone.
      await fs.unlink(uploadedPath).catch(() => {})
      throw error
    }

    // Fire-and-forget: the job removes the uploaded file itself when it ends.
    void processPdfInBackground(uploadedPath, fullTargetPath, scriptDir).catch(
      async (err: unknown) => {
        logger.error('Background PDF processing failed:', err)
        const reason = err instanceof Error ? err.message : String(err)
        // Leave a note at the target path so the failure is visible there
        // (the text reads "Parsing failed" / "Error message").
        await fs
          .writeFile(fullTargetPath, `# 解析失败\n\n> 错误信息: ${reason}`, 'utf-8')
          .catch((writeErr: unknown) => {
            logger.error('Failed to write parse failure note:', writeErr)
          })
      },
    )

    successResponse(res, {
      message: 'PDF upload successful. Parsing started in background.',
      status: 'processing',
    })
  }),
)
/** Locations reported by the parser script in its `JSON_RESULT:` line. */
interface ParserArtifacts {
  markdownPath: string
  outputDir: string
}

/** Extracts and validates the `JSON_RESULT:` payload printed by the parser script. */
function parseParserOutput(output: string): ParserArtifacts {
  const match = output.match(/JSON_RESULT:(.*)/)
  if (!match) {
    throw new Error('Failed to parse Python script output: JSON_RESULT not found')
  }
  const payload: unknown = JSON.parse(match[1] ?? '')
  if (typeof payload !== 'object' || payload === null) {
    throw new Error('Failed to parse Python script output: JSON_RESULT is not an object')
  }
  const { markdown_file: markdownPath, output_dir: outputDir } = payload as {
    markdown_file?: unknown
    output_dir?: unknown
  }
  if (typeof markdownPath !== 'string' || !existsSync(markdownPath)) {
    throw new Error('Markdown result file not found')
  }
  if (typeof outputDir !== 'string') {
    throw new Error('Failed to parse Python script output: output_dir is missing')
  }
  return { markdownPath, outputDir }
}

/**
 * Returns the first existing file matching a markdown image reference, or
 * null when none exists. Candidates that resolve outside `outputDir` are
 * rejected so a crafted reference (e.g. with `../` segments) cannot pull in
 * unrelated files.
 */
function findExtractedImage(src: string, outputDir: string, imagesDir: string): string | null {
  const root = path.resolve(outputDir)
  const candidates = [
    path.join(imagesDir, src),
    path.join(outputDir, src),
    path.join(imagesDir, path.basename(src)),
  ]
  for (const candidate of candidates) {
    const rel = path.relative(root, path.resolve(candidate))
    const escapesRoot =
      rel === '' || rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)
    if (!escapesRoot && existsSync(candidate)) {
      return candidate
    }
  }
  return null
}

/**
 * Runs the MinerU parser script on an uploaded file and writes the resulting
 * markdown to `targetPath`.
 *
 * Images referenced by the markdown that exist in the parser's output
 * directory are copied into the job's image directory and the references are
 * rewritten to point at the copies.
 *
 * @param filePath   Uploaded file to parse; always removed when the job ends.
 * @param targetPath Destination file for the final markdown.
 * @param cwd        Working directory for the script; `mineru_parser.py` is
 *                   passed by bare name, so it is expected to live here.
 * @throws Error when the script output has no usable `JSON_RESULT:` payload or
 *         the markdown file it names does not exist; other errors from the
 *         script, JSON parsing or file operations propagate unchanged.
 */
async function processPdfInBackground(
  filePath: string,
  targetPath: string,
  cwd: string,
): Promise<void> {
  let artifacts: ParserArtifacts | undefined
  try {
    const output = await spawnPythonScript({
      scriptPath: 'mineru_parser.py',
      args: [filePath],
      cwd,
    })
    artifacts = parseParserOutput(output)
    const { markdownPath, outputDir } = artifacts

    let mdContent = await fs.readFile(markdownPath, 'utf-8')

    const imagesDir = path.join(outputDir, 'images')
    if (existsSync(imagesDir)) {
      const jobContext = await createJobContext('pdf_images')
      const replacements: ImageReplacement[] = []

      // Kept sequential on purpose: every image shares the same timestamp
      // base name, so each copy should land on disk before the next unique
      // name is requested. NOTE(review): assumes getUniqueFilename checks the
      // destination directory — confirm.
      for (const dest of findImageDestinations(mdContent)) {
        const originalSrc = dest.url
        if (!originalSrc) continue

        const foundFile = findExtractedImage(originalSrc, outputDir, imagesDir)
        if (!foundFile) continue

        const newFilename = await getUniqueFilename(
          jobContext.destImagesDir,
          formatTimestamp(jobContext.now),
          path.extname(foundFile),
        )
        await fs.copyFile(foundFile, path.join(jobContext.destImagesDir, newFilename))
        replacements.push({
          start: dest.start,
          end: dest.end,
          original: originalSrc,
          replacement: `${jobContext.imagesSubDir}/${newFilename}`,
        })
      }
      mdContent = applyReplacements(mdContent, replacements)
    }

    await fs.writeFile(targetPath, mdContent, 'utf-8')
  } finally {
    // Clean up on success and on failure so a failed job does not leave
    // parser output behind. All removals are best-effort.
    if (artifacts) {
      await fs.unlink(artifacts.markdownPath).catch(() => {})
      // Only directories whose path contains 'temp' are removed.
      // NOTE(review): looks like a guard against deleting a non-temporary
      // directory — confirm the intended rule.
      if (artifacts.outputDir && artifacts.outputDir.includes('temp')) {
        await fs.rm(artifacts.outputDir, { recursive: true, force: true }).catch(() => {})
      }
    }
    await fs.unlink(filePath).catch(() => {})
  }
}
// Expose the router as this module's default export.
export default router