159 lines
4.8 KiB
TypeScript
159 lines
4.8 KiB
TypeScript
|
|
import express, { type Request, type Response } from 'express'
|
||
|
|
import multer from 'multer'
|
||
|
|
import path from 'path'
|
||
|
|
import fs from 'fs/promises'
|
||
|
|
import { existsSync } from 'fs'
|
||
|
|
import { asyncHandler } from '../../utils/asyncHandler.js'
|
||
|
|
import { successResponse } from '../../utils/response.js'
|
||
|
|
import { resolveNotebookPath } from '../../utils/pathSafety.js'
|
||
|
|
import { getUniqueFilename } from '../../utils/file.js'
|
||
|
|
import { formatTimestamp } from '../../../shared/utils/date.js'
|
||
|
|
import { getTempDir } from '../../utils/tempDir.js'
|
||
|
|
import {
|
||
|
|
createJobContext,
|
||
|
|
spawnPythonScript,
|
||
|
|
findImageDestinations,
|
||
|
|
applyReplacements,
|
||
|
|
cleanupJob,
|
||
|
|
getScriptPath,
|
||
|
|
ensureScriptExists,
|
||
|
|
} from './documentParser.js'
|
||
|
|
import type { ImageReplacement } from './documentParser.js'
|
||
|
|
import { ValidationError, InternalError } from '../../../shared/errors/index.js'
|
||
|
|
import { logger } from '../../utils/logger.js'
|
||
|
|
|
||
|
|
/** Largest upload multer will accept for a single file: 50 MiB. */
const MAX_UPLOAD_BYTES = 50 * 1024 * 1024

/** Router on which this module registers its routes. */
const router = express.Router()

/** Directory handed to multer as the destination for incoming uploads. */
const tempDir = getTempDir()

/** Multer instance that writes uploads into `tempDir`, capped at `MAX_UPLOAD_BYTES` per file. */
const upload = multer({ dest: tempDir, limits: { fileSize: MAX_UPLOAD_BYTES } })
|
||
|
|
|
||
|
|
/**
 * POST /parse
 *
 * Accepts a multipart upload (field `file`) together with a `targetPath` body field,
 * validates the request, then starts parsing in the background and responds
 * immediately with `status: 'processing'`.
 *
 * Failure handling:
 * - Any error raised while validating the request removes the uploaded temp file
 *   before the error is rethrown to `asyncHandler`.
 * - If background parsing rejects, the error is logged and a short failure note is
 *   written to the resolved target path (the response has already been sent).
 *
 * @throws ValidationError when the file or `targetPath` is missing.
 * @throws InternalError when the parser script cannot be found.
 */
router.post(
  '/parse',
  upload.single('file'),
  asyncHandler(async (req: Request, res: Response) => {
    const uploaded = req.file
    if (!uploaded) {
      throw new ValidationError('File is required')
    }

    let fullTargetPath: string
    let scriptPath: string
    try {
      // Body fields arrive from the client, so do not trust the declared type.
      const { targetPath } = (req.body ?? {}) as { targetPath?: unknown }
      if (typeof targetPath !== 'string' || targetPath === '') {
        throw new ValidationError('Target path is required')
      }

      fullTargetPath = resolveNotebookPath(targetPath).fullPath

      scriptPath = getScriptPath('mineru', 'mineru_parser.py')
      if (!ensureScriptExists(scriptPath)) {
        throw new InternalError('Parser script not found')
      }
    } catch (error) {
      // Single cleanup point: the request is rejected, so the upload must not linger.
      await fs.unlink(uploaded.path).catch(() => {})
      throw error
    }

    const notePath = fullTargetPath

    // Fire-and-forget on purpose: the client is answered before parsing finishes.
    void processPdfInBackground(uploaded.path, notePath, path.dirname(scriptPath)).catch(
      async (err: unknown) => {
        logger.error('Background PDF processing failed:', err)
        const reason = err instanceof Error ? err.message : String(err)
        try {
          await fs.writeFile(notePath, `# 解析失败\n\n> 错误信息: ${reason}`, 'utf-8')
        } catch (writeErr) {
          // Best effort only, but leave a trace instead of swallowing the failure.
          logger.error('Failed to write parse failure note:', writeErr)
        }
      },
    )

    successResponse(res, {
      message: 'PDF upload successful. Parsing started in background.',
      status: 'processing',
    })
  }),
)
|
||
|
|
|
||
|
|
/**
 * Pulls the two path fields out of the parser's stdout.
 *
 * The script is expected to print a line of the form `JSON_RESULT:<json>`. Fields that
 * are absent or not non-empty strings are returned as `undefined` so the caller can
 * decide how to fail.
 *
 * @throws Error when no `JSON_RESULT:` marker is present.
 * @throws SyntaxError when the text after the marker is not valid JSON.
 */
function extractParserPaths(output: string): {
  markdownPath: string | undefined
  outputDir: string | undefined
} {
  const payload = /JSON_RESULT:(.*)/.exec(output)?.[1]
  if (payload === undefined) {
    throw new Error('Failed to parse Python script output: JSON_RESULT not found')
  }

  const parsed: unknown = JSON.parse(payload)
  const fields =
    typeof parsed === 'object' && parsed !== null ? (parsed as Record<string, unknown>) : {}
  const asPath = (value: unknown): string | undefined =>
    typeof value === 'string' && value !== '' ? value : undefined

  return {
    markdownPath: asPath(fields['markdown_file']),
    outputDir: asPath(fields['output_dir']),
  }
}

/** True when `candidate` resolves to a location strictly inside `dir`. */
function isWithinDir(dir: string, candidate: string): boolean {
  const rel = path.relative(dir, candidate)
  return rel !== '' && rel !== '..' && !rel.startsWith(`..${path.sep}`) && !path.isAbsolute(rel)
}

/** True when `candidate` exists and is a regular file (directories do not count). */
async function isRegularFile(candidate: string): Promise<boolean> {
  try {
    return (await fs.stat(candidate)).isFile()
  } catch {
    return false
  }
}

/**
 * Finds the on-disk file behind an image URL taken from the generated markdown.
 *
 * Lookup order: `<imagesDir>/<src>`, `<outputDir>/<src>`, `<imagesDir>/<basename(src)>`.
 * Candidates that resolve outside `outputDir` are ignored, because `src` comes from
 * document content and must not be able to reach arbitrary files via `../`.
 */
async function locateImageFile(
  outputDir: string,
  imagesDir: string,
  src: string,
): Promise<string | null> {
  const candidates = [
    path.join(imagesDir, src),
    path.join(outputDir, src),
    path.join(imagesDir, path.basename(src)),
  ]
  for (const candidate of candidates) {
    if (isWithinDir(outputDir, candidate) && (await isRegularFile(candidate))) {
      return candidate
    }
  }
  return null
}

/**
 * Copies every image referenced by `mdContent` out of the parser's output directory
 * into the job's image directory and rewrites the markdown links accordingly.
 * Returns `mdContent` unchanged when the parser produced no `images` directory.
 */
async function relocateImages(mdContent: string, outputDir: string): Promise<string> {
  const imagesDir = path.join(outputDir, 'images')
  if (!existsSync(imagesDir)) {
    return mdContent
  }

  // NOTE(review): `cleanupJob` is imported by this file but never called for this
  // context — confirm whether the job context needs to be released.
  const jobContext = await createJobContext('pdf_images')
  const replacements: ImageReplacement[] = []

  // Sequential on purpose: every image shares the same timestamp base name, so each
  // copy must land on disk before the next unique filename is computed.
  for (const dest of findImageDestinations(mdContent)) {
    const originalSrc = dest.url
    if (!originalSrc) continue

    const foundFile = await locateImageFile(outputDir, imagesDir, originalSrc)
    if (!foundFile) continue

    const ext = path.extname(foundFile)
    const baseName = formatTimestamp(jobContext.now)
    const newFilename = await getUniqueFilename(jobContext.destImagesDir, baseName, ext)
    await fs.copyFile(foundFile, path.join(jobContext.destImagesDir, newFilename))

    replacements.push({
      start: dest.start,
      end: dest.end,
      original: originalSrc,
      replacement: `${jobContext.imagesSubDir}/${newFilename}`,
    })
  }

  return applyReplacements(mdContent, replacements)
}

/**
 * Runs the MinerU parser script on an uploaded file and writes the resulting markdown
 * to `targetPath`, relocating any extracted images along the way.
 *
 * Cleanup runs whether parsing succeeds or fails: the uploaded file, the parser's
 * markdown file and (when it looks like a temp location) the parser's output
 * directory are all removed on a best-effort basis.
 *
 * @param filePath   Path of the uploaded file; deleted when processing ends.
 * @param targetPath Destination file for the final markdown.
 * @param cwd        Working directory for the script (the directory containing it).
 * @throws Error when the script output lacks a usable result or the markdown file is
 *         missing; errors from the script, file copies, and the final write propagate.
 */
async function processPdfInBackground(
  filePath: string,
  targetPath: string,
  cwd: string,
): Promise<void> {
  let markdownPath: string | undefined
  let outputDir: string | undefined

  try {
    const output = await spawnPythonScript({
      scriptPath: 'mineru_parser.py',
      args: [filePath],
      cwd,
    })

    // Assign before validating so the finally block can clean up whatever was reported.
    ;({ markdownPath, outputDir } = extractParserPaths(output))

    if (!markdownPath || !existsSync(markdownPath)) {
      throw new Error('Markdown result file not found')
    }
    if (!outputDir) {
      throw new Error('Parser result is missing output_dir')
    }

    const rawMarkdown = await fs.readFile(markdownPath, 'utf-8')
    const finalMarkdown = await relocateImages(rawMarkdown, outputDir)
    await fs.writeFile(targetPath, finalMarkdown, 'utf-8')
  } finally {
    await fs.unlink(filePath).catch(() => {})
    if (markdownPath) {
      await fs.unlink(markdownPath).catch(() => {})
    }
    // NOTE(review): the 'temp' substring test is a heuristic guard against deleting a
    // non-temporary directory; consider comparing against the known temp root instead.
    if (outputDir?.includes('temp')) {
      await fs.rm(outputDir, { recursive: true, force: true }).catch(() => {})
    }
  }
}
|
||
|
|
|
||
|
|
// Default export: the router on which this module registers the POST /parse route.
export default router
|