Initial commit
This commit is contained in:
158
api/modules/document-parser/mineruRoutes.ts
Normal file
158
api/modules/document-parser/mineruRoutes.ts
Normal file
@@ -0,0 +1,158 @@
|
||||
import express, { type Request, type Response } from 'express'
|
||||
import multer from 'multer'
|
||||
import path from 'path'
|
||||
import fs from 'fs/promises'
|
||||
import { existsSync } from 'fs'
|
||||
import { asyncHandler } from '../../utils/asyncHandler.js'
|
||||
import { successResponse } from '../../utils/response.js'
|
||||
import { resolveNotebookPath } from '../../utils/pathSafety.js'
|
||||
import { getUniqueFilename } from '../../utils/file.js'
|
||||
import { formatTimestamp } from '../../../shared/utils/date.js'
|
||||
import { getTempDir } from '../../utils/tempDir.js'
|
||||
import {
|
||||
createJobContext,
|
||||
spawnPythonScript,
|
||||
findImageDestinations,
|
||||
applyReplacements,
|
||||
cleanupJob,
|
||||
getScriptPath,
|
||||
ensureScriptExists,
|
||||
} from './documentParser.js'
|
||||
import type { ImageReplacement } from './documentParser.js'
|
||||
import { ValidationError, InternalError } from '../../../shared/errors/index.js'
|
||||
import { logger } from '../../utils/logger.js'
|
||||
|
||||
// Per-file upload ceiling handed to multer: 50 MiB.
const MAX_UPLOAD_BYTES = 50 * 1024 * 1024

// Router that carries the document-parser endpoints defined in this module.
const router = express.Router()

// Directory (as reported by getTempDir) where multer stores incoming uploads.
const tempDir = getTempDir()

// Multipart middleware: files are written to `tempDir` on disk and rejected
// by multer once they exceed MAX_UPLOAD_BYTES.
const upload = multer({
  dest: tempDir,
  limits: { fileSize: MAX_UPLOAD_BYTES },
})
|
||||
|
||||
/**
 * POST /parse
 *
 * Accepts one uploaded file (multipart field `file`) and a `targetPath` body
 * field, validates both, then starts `processPdfInBackground` without waiting
 * for it and immediately answers with a "processing" payload.
 *
 * Validation failures are thrown (ValidationError / InternalError, or whatever
 * `resolveNotebookPath` throws); the uploaded temp file is removed first in
 * every such case.
 *
 * If the background job rejects, the error is logged and a short failure note
 * is written to the resolved target path instead of the parsed Markdown.
 */
router.post(
  '/parse',
  upload.single('file'),
  asyncHandler(async (req: Request, res: Response) => {
    if (!req.file) {
      throw new ValidationError('File is required')
    }
    const uploadedPath = req.file.path

    let fullTargetPath: string
    let scriptPath: string
    try {
      const { targetPath } = req.body as { targetPath?: string }
      if (!targetPath) {
        throw new ValidationError('Target path is required')
      }
      fullTargetPath = resolveNotebookPath(targetPath).fullPath

      scriptPath = getScriptPath('mineru', 'mineru_parser.py')
      if (!ensureScriptExists(scriptPath)) {
        throw new InternalError('Parser script not found')
      }
    } catch (error) {
      // Single cleanup point: whichever validation step failed, the upload
      // written by multer must not be left behind in the temp directory.
      await fs.unlink(uploadedPath).catch(() => {})
      throw error
    }

    // Deliberately not awaited: parsing continues after the response is sent.
    // From here on the background job owns (and deletes) the uploaded file.
    void processPdfInBackground(uploadedPath, fullTargetPath, path.dirname(scriptPath)).catch(
      (err: unknown) => {
        logger.error('Background PDF processing failed:', err)
        // A rejection is not guaranteed to be an Error instance.
        const reason = err instanceof Error ? err.message : String(err)
        fs.writeFile(fullTargetPath, `# 解析失败\n\n> 错误信息: ${reason}`, 'utf-8').catch(
          (writeErr: unknown) => {
            // Best effort only, but leave a trace when even the notice fails.
            logger.error('Failed to write PDF parse failure notice:', writeErr)
          },
        )
      },
    )

    successResponse(res, {
      message: 'PDF upload successful. Parsing started in background.',
      status: 'processing'
    })
  }),
)
|
||||
|
||||
/** The two fields this module reads from the parser's JSON_RESULT payload. */
interface ParserResult {
  markdownFile: string | undefined
  outputDir: string | undefined
}

/**
 * Decode the JSON text that follows `JSON_RESULT:` in the parser output.
 * Fields that are absent or not strings come back as `undefined` so callers
 * can decide how to react instead of crashing on an untyped value.
 *
 * @throws SyntaxError if `raw` is not valid JSON.
 * @throws Error if the JSON value is not an object.
 */
function readParserResult(raw: string): ParserResult {
  const parsed: unknown = JSON.parse(raw)
  if (typeof parsed !== 'object' || parsed === null) {
    throw new Error('Failed to parse Python script output: JSON_RESULT is not an object')
  }
  const { markdown_file: markdownFile, output_dir: outputDir } = parsed as Record<string, unknown>
  return {
    markdownFile: typeof markdownFile === 'string' ? markdownFile : undefined,
    outputDir: typeof outputDir === 'string' && outputDir !== '' ? outputDir : undefined,
  }
}

/** True when `candidate` resolves to a location strictly below `root`. */
function isInsideDir(root: string, candidate: string): boolean {
  const rel = path.relative(root, candidate)
  return rel !== '' && rel !== '..' && !rel.startsWith(`..${path.sep}`) && !path.isAbsolute(rel)
}

/**
 * Find the file on disk that a Markdown image reference points at.
 *
 * Probes, in order: `<imagesDir>/<src>`, `<outputDir>/<src>`, then
 * `<imagesDir>/<basename of src>`. Candidates that would escape `outputDir`
 * (e.g. via `../`) are ignored, because `src` comes from generated Markdown.
 *
 * @returns the first existing candidate, or `null` when none exists.
 */
function locateExtractedImage(src: string, imagesDir: string, outputDir: string): string | null {
  const candidates = [
    path.join(imagesDir, src),
    path.join(outputDir, src),
    path.join(imagesDir, path.basename(src)),
  ]
  for (const candidate of candidates) {
    if (isInsideDir(outputDir, candidate) && existsSync(candidate)) {
      return candidate
    }
  }
  return null
}

/**
 * Copy every image referenced by `mdContent` that can be found under
 * `outputDir` into the job's destination image directory and rewrite the
 * references to point at the copies.
 *
 * @returns the rewritten Markdown; the input unchanged when `outputDir` has
 *          no `images` sub-directory.
 */
async function relocateImages(mdContent: string, outputDir: string): Promise<string> {
  const imagesDir = path.join(outputDir, 'images')
  if (!existsSync(imagesDir)) {
    return mdContent
  }

  const jobContext = await createJobContext('pdf_images')
  const replacements: ImageReplacement[] = []

  // Sequential on purpose: every image shares the same timestamp base name,
  // so getUniqueFilename has to see the previous copy before naming the next.
  for (const dest of findImageDestinations(mdContent)) {
    const originalSrc = dest.url
    if (!originalSrc) continue

    const foundFile = locateExtractedImage(originalSrc, imagesDir, outputDir)
    if (!foundFile) continue

    const ext = path.extname(foundFile)
    const baseName = formatTimestamp(jobContext.now)
    const newFilename = await getUniqueFilename(jobContext.destImagesDir, baseName, ext)
    await fs.copyFile(foundFile, path.join(jobContext.destImagesDir, newFilename))

    replacements.push({
      start: dest.start,
      end: dest.end,
      original: originalSrc,
      replacement: `${jobContext.imagesSubDir}/${newFilename}`
    })
  }

  return applyReplacements(mdContent, replacements)
}

/**
 * Run `mineru_parser.py` on an uploaded file and write the resulting Markdown
 * to `targetPath`, relocating any extracted images along the way.
 *
 * @param filePath   Uploaded file to parse; always deleted when this returns or throws.
 * @param targetPath Destination file that receives the final Markdown.
 * @param cwd        Working directory for the Python process (the script's directory).
 * @throws Error when the script output has no `JSON_RESULT:` line, the payload
 *         is malformed, or the reported Markdown file does not exist; any
 *         filesystem or spawn error is propagated as well.
 */
async function processPdfInBackground(filePath: string, targetPath: string, cwd: string): Promise<void> {
  // Parser artefacts, remembered so `finally` can remove them on failure too.
  let markdownPath: string | undefined
  let outputDir: string | undefined

  try {
    const output = await spawnPythonScript({
      scriptPath: 'mineru_parser.py',
      args: [filePath],
      cwd,
    })

    const match = output.match(/JSON_RESULT:(.*)/)
    if (!match) {
      throw new Error('Failed to parse Python script output: JSON_RESULT not found')
    }

    const result = readParserResult(match[1])
    markdownPath = result.markdownFile
    outputDir = result.outputDir

    if (!markdownPath || !existsSync(markdownPath)) {
      throw new Error('Markdown result file not found')
    }

    let mdContent = await fs.readFile(markdownPath, 'utf-8')

    // Without a reported output directory there is nowhere to look for
    // images, so the Markdown is written with its references untouched.
    if (outputDir) {
      mdContent = await relocateImages(mdContent, outputDir)
    }

    await fs.writeFile(targetPath, mdContent, 'utf-8')
  } finally {
    await fs.unlink(filePath).catch(() => {})

    if (markdownPath) {
      await fs.unlink(markdownPath).catch(() => {})
    }

    // NOTE(review): the 'temp' substring test is only a heuristic guard against
    // deleting a non-temporary directory; confirm against what the parser emits.
    if (outputDir && outputDir.includes('temp')) {
      await fs.rm(outputDir, { recursive: true, force: true }).catch(() => {})
    }
  }
}
|
||||
|
||||
// Default export: the router carrying the POST /parse route registered above.
export default router
|
||||
Reference in New Issue
Block a user