Initial commit
This commit is contained in:
217
api/modules/document-parser/blogRoutes.ts
Normal file
217
api/modules/document-parser/blogRoutes.ts
Normal file
@@ -0,0 +1,217 @@
|
||||
import express, { type Request, type Response } from 'express'
|
||||
import path from 'path'
|
||||
import fs from 'fs/promises'
|
||||
import { existsSync } from 'fs'
|
||||
import axios from 'axios'
|
||||
import { asyncHandler } from '../../utils/asyncHandler.js'
|
||||
import { successResponse } from '../../utils/response.js'
|
||||
import { resolveNotebookPath } from '../../utils/pathSafety.js'
|
||||
import { getUniqueFilename } from '../../utils/file.js'
|
||||
import { formatTimestamp } from '../../../shared/utils/date.js'
|
||||
import { getTempDir } from '../../utils/tempDir.js'
|
||||
import {
|
||||
createJobContext,
|
||||
spawnPythonScript,
|
||||
findImageDestinations,
|
||||
applyReplacements,
|
||||
copyLocalImage,
|
||||
cleanupJob,
|
||||
getScriptPath,
|
||||
ensureScriptExists,
|
||||
} from './documentParser.js'
|
||||
import type { ImageReplacement } from './documentParser.js'
|
||||
import { ValidationError, InternalError } from '../../../shared/errors/index.js'
|
||||
import { logger } from '../../utils/logger.js'
|
||||
|
||||
const router = express.Router()
|
||||
|
||||
const tempDir = getTempDir()
|
||||
|
||||
router.post(
|
||||
'/parse-local',
|
||||
asyncHandler(async (req: Request, res: Response) => {
|
||||
const { htmlPath, htmlDir, assetsDirName, assetsFiles, targetPath } = req.body as {
|
||||
htmlPath?: string
|
||||
htmlDir?: string
|
||||
assetsDirName?: string
|
||||
assetsFiles?: string[]
|
||||
targetPath?: string
|
||||
}
|
||||
|
||||
if (!htmlPath || !htmlDir || !targetPath) {
|
||||
throw new ValidationError('htmlPath, htmlDir and targetPath are required')
|
||||
}
|
||||
|
||||
let fullTargetPath: string
|
||||
try {
|
||||
const resolved = resolveNotebookPath(targetPath)
|
||||
fullTargetPath = resolved.fullPath
|
||||
} catch (error) {
|
||||
throw error
|
||||
}
|
||||
|
||||
const scriptPath = getScriptPath('blog', 'parse_blog.py')
|
||||
if (!ensureScriptExists(scriptPath)) {
|
||||
throw new InternalError('Parser script not found')
|
||||
}
|
||||
|
||||
const jobContext = await createJobContext('blog')
|
||||
|
||||
let htmlPathInJob = ''
|
||||
try {
|
||||
htmlPathInJob = path.join(jobContext.jobDir, 'input.html')
|
||||
await fs.copyFile(htmlPath, htmlPathInJob)
|
||||
|
||||
if (assetsDirName && assetsFiles && assetsFiles.length > 0) {
|
||||
const assetsDirPath = path.join(htmlDir, assetsDirName)
|
||||
for (const relPath of assetsFiles) {
|
||||
const srcPath = path.join(assetsDirPath, relPath)
|
||||
if (existsSync(srcPath)) {
|
||||
const destPath = path.join(jobContext.jobDir, assetsDirName, relPath)
|
||||
await fs.mkdir(path.dirname(destPath), { recursive: true })
|
||||
await fs.copyFile(srcPath, destPath)
|
||||
}
|
||||
}
|
||||
}
|
||||
} catch (err) {
|
||||
await cleanupJob(jobContext.jobDir)
|
||||
throw err
|
||||
}
|
||||
|
||||
processHtmlInBackground({
|
||||
jobDir: jobContext.jobDir,
|
||||
htmlPath: htmlPathInJob,
|
||||
targetPath: fullTargetPath,
|
||||
cwd: path.dirname(scriptPath),
|
||||
jobContext,
|
||||
originalHtmlDir: htmlDir,
|
||||
originalAssetsDirName: assetsDirName,
|
||||
}).catch(err => {
|
||||
logger.error('Background HTML processing failed:', err)
|
||||
fs.writeFile(fullTargetPath, `# 解析失败\n\n> 错误信息: ${err.message}`, 'utf-8').catch(() => { })
|
||||
cleanupJob(jobContext.jobDir).catch(() => { })
|
||||
})
|
||||
|
||||
successResponse(res, {
|
||||
message: 'HTML parsing started in background.',
|
||||
status: 'processing'
|
||||
})
|
||||
}),
|
||||
)
|
||||
|
||||
interface ProcessHtmlArgs {
|
||||
jobDir: string
|
||||
htmlPath: string
|
||||
targetPath: string
|
||||
cwd: string
|
||||
jobContext: ReturnType<typeof createJobContext> extends Promise<infer T> ? T : never
|
||||
originalHtmlDir?: string
|
||||
originalAssetsDirName?: string
|
||||
}
|
||||
|
||||
/**
 * Converts the staged HTML file to markdown via the Python parser, localizes
 * every image link (remote download, job-local copy, or original-assets
 * fallback), writes the result to `targetPath`, and always removes the job
 * directory — even on failure (the caller handles error reporting).
 */
async function processHtmlInBackground(args: ProcessHtmlArgs) {
  const { jobDir, htmlPath, targetPath, cwd, jobContext, originalHtmlDir, originalAssetsDirName } = args
  try {
    // The script is expected to write `<input name>.md` next to the input HTML.
    await spawnPythonScript({
      scriptPath: 'parse_blog.py',
      args: [htmlPath],
      cwd,
    })

    const parsedPathObj = path.parse(htmlPath)
    const markdownPath = path.join(parsedPathObj.dir, `${parsedPathObj.name}.md`)

    if (!existsSync(markdownPath)) {
      throw new Error('Markdown result file not found')
    }

    let mdContent = await fs.readFile(markdownPath, 'utf-8')
    // The caller already resolved the context; awaiting a plain value is a no-op.
    const ctx = await jobContext

    const htmlDir = path.dirname(htmlPath)
    const replacements: ImageReplacement[] = []

    const destinations = findImageDestinations(mdContent)
    for (const dest of destinations) {
      const originalSrc = dest.url
      if (!originalSrc) continue

      // Remote image: download it into the notebook images folder.
      if (originalSrc.startsWith('http://') || originalSrc.startsWith('https://')) {
        try {
          const response = await axios.get(originalSrc, { responseType: 'arraybuffer', timeout: 10000 })
          const contentType = response.headers['content-type']
          // Pick an extension from Content-Type, but let the URL's own
          // extension (query string stripped) override it when present.
          let ext = '.jpg'
          if (contentType) {
            if (contentType.includes('png')) ext = '.png'
            else if (contentType.includes('gif')) ext = '.gif'
            else if (contentType.includes('webp')) ext = '.webp'
            else if (contentType.includes('svg')) ext = '.svg'
            else if (contentType.includes('jpeg') || contentType.includes('jpg')) ext = '.jpg'
          }
          const urlExt = path.extname(originalSrc.split('?')[0])
          if (urlExt) ext = urlExt

          const baseName = formatTimestamp(ctx.now)
          const newFilename = await getUniqueFilename(ctx.destImagesDir, baseName, ext)
          const newPath = path.join(ctx.destImagesDir, newFilename)
          await fs.writeFile(newPath, response.data)
          replacements.push({
            start: dest.start,
            end: dest.end,
            original: originalSrc,
            replacement: `/${ctx.imagesSubDir}/${newFilename}`
          })
        } catch { } // best-effort: a failed download leaves the remote URL untouched
        continue
      }

      // Inline data URIs are left as-is.
      if (originalSrc.startsWith('data:')) continue

      // Local image: first look inside the job dir / HTML dir…
      let result = await copyLocalImage(
        originalSrc,
        jobDir,
        htmlDir,
        ctx.destImagesDir,
        ctx.imagesSubDir,
        ctx.now
      )

      // …then fall back to the original page's assets directory, trying both
      // the full relative path and just the basename.
      if (!result && originalHtmlDir && originalAssetsDirName) {
        const srcWithFiles = originalSrc.replace(/^\.\//, '').replace(/^\//, '')
        const possiblePaths = [
          path.join(originalHtmlDir, originalAssetsDirName, srcWithFiles),
          path.join(originalHtmlDir, originalAssetsDirName, path.basename(srcWithFiles)),
        ]
        for (const p of possiblePaths) {
          if (existsSync(p)) {
            const ext = path.extname(p) || '.jpg'
            const baseName = formatTimestamp(ctx.now)
            const newFilename = await getUniqueFilename(ctx.destImagesDir, baseName, ext)
            const newPath = path.join(ctx.destImagesDir, newFilename)
            await fs.copyFile(p, newPath)
            result = { newLink: `/${ctx.imagesSubDir}/${newFilename}` }
            break
          }
        }
      }

      if (result) {
        replacements.push({
          start: dest.start,
          end: dest.end,
          original: originalSrc,
          replacement: result.newLink
        })
      }
    }

    mdContent = applyReplacements(mdContent, replacements)

    await fs.writeFile(targetPath, mdContent, 'utf-8')
    // The intermediate .md lives inside the job dir; unlink is best-effort.
    await fs.unlink(markdownPath).catch(() => { })
  } finally {
    await cleanupJob(jobDir)
  }
}

export default router
|
||||
184
api/modules/document-parser/documentParser.ts
Normal file
184
api/modules/document-parser/documentParser.ts
Normal file
@@ -0,0 +1,184 @@
|
||||
import path from 'path'
|
||||
import { spawn } from 'child_process'
|
||||
import fs from 'fs/promises'
|
||||
import { existsSync, mkdirSync } from 'fs'
|
||||
import { PROJECT_ROOT, NOTEBOOK_ROOT, TEMP_ROOT } from '../../config/paths.js'
|
||||
import { getUniqueFilename } from '../../utils/file.js'
|
||||
import { formatTimestamp, pad2 } from '../../../shared/utils/date.js'
|
||||
import { logger } from '../../utils/logger.js'
|
||||
|
||||
// Ensure the shared temp root exists once at module load; job directories
// are created beneath it by createJobContext.
if (!existsSync(TEMP_ROOT)) {
  mkdirSync(TEMP_ROOT, { recursive: true })
}
|
||||
|
||||
/** Per-job state shared by the document-parser routes. */
export interface JobContext {
  /** Scratch directory for this job under TEMP_ROOT. */
  jobDir: string
  /** Job creation time; used to derive image file names and the dated folder. */
  now: Date
  /** Notebook-relative images folder, e.g. `images/<yyyy>/<mm>/<dd>`. */
  imagesSubDir: string
  /** Absolute path of `imagesSubDir` inside NOTEBOOK_ROOT. */
  destImagesDir: string
}
|
||||
|
||||
export const createJobContext = async (prefix: string): Promise<JobContext> => {
|
||||
const now = new Date()
|
||||
const jobDir = path.join(TEMP_ROOT, `${prefix}_${formatTimestamp(now)}`)
|
||||
await fs.mkdir(jobDir, { recursive: true })
|
||||
|
||||
const year = now.getFullYear()
|
||||
const month = pad2(now.getMonth() + 1)
|
||||
const day = pad2(now.getDate())
|
||||
const imagesSubDir = `images/${year}/${month}/${day}`
|
||||
const destImagesDir = path.join(NOTEBOOK_ROOT, imagesSubDir)
|
||||
await fs.mkdir(destImagesDir, { recursive: true })
|
||||
|
||||
return { jobDir, now, imagesSubDir, destImagesDir }
|
||||
}
|
||||
|
||||
/** Options for {@link spawnPythonScript}. */
export interface SpawnPythonOptions {
  /** Script path or name, passed to the interpreter (resolved relative to `cwd`). */
  scriptPath: string
  /** Positional arguments appended after the script path. */
  args: string[]
  /** Working directory for the child process. */
  cwd: string
  /** Optional payload written to the script's stdin, then closed. */
  inputContent?: string
}
|
||||
|
||||
/**
 * Runs a Python script and resolves with its captured stdout.
 *
 * UTF-8 is forced three ways (`-X utf8`, PYTHONIOENCODING, PYTHONUTF8) so
 * non-ASCII output survives regardless of platform locale. Rejects with the
 * captured stderr on a non-zero exit code, or with the spawn error itself
 * (e.g. interpreter not found).
 *
 * NOTE(review): invokes the bare `python` executable — on systems that only
 * expose `python3` this rejects with a spawn error; confirm the deploy env.
 */
export const spawnPythonScript = async (options: SpawnPythonOptions): Promise<string> => {
  const { scriptPath, args, cwd, inputContent } = options

  return new Promise<string>((resolve, reject) => {
    const pythonProcess = spawn('python', ['-X', 'utf8', scriptPath, ...args], {
      cwd,
      env: { ...process.env, PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' },
    })

    // Accumulate both streams so the full output/diagnostics are available on close.
    let stdout = ''
    let stderr = ''

    pythonProcess.stdout.on('data', (data) => {
      stdout += data.toString()
    })

    pythonProcess.stderr.on('data', (data) => {
      stderr += data.toString()
    })

    pythonProcess.on('close', (code) => {
      if (code !== 0) {
        logger.error('Python script error:', stderr)
        reject(new Error(`Process exited with code ${code}. Error: ${stderr}`))
      } else {
        resolve(stdout)
      }
    })

    // Spawn failures (ENOENT etc.) arrive here rather than via 'close'.
    pythonProcess.on('error', (err) => {
      reject(err)
    })

    // Optional stdin payload for scripts that read from standard input.
    if (inputContent !== undefined) {
      pythonProcess.stdin.write(inputContent)
      pythonProcess.stdin.end()
    }
  })
}
|
||||
|
||||
/** One pending URL substitution inside the markdown text. */
export interface ImageReplacement {
  /** Start offset (inclusive) of the original URL in the markdown. */
  start: number
  /** End offset (exclusive) of the original URL. */
  end: number
  /** The URL text being replaced; not read by applyReplacements itself. */
  original: string
  /** New link text to splice in. */
  replacement: string
}
|
||||
|
||||
export const findImageDestinations = (md: string): Array<{ url: string; start: number; end: number }> => {
|
||||
const results: Array<{ url: string; start: number; end: number }> = []
|
||||
let i = 0
|
||||
while (i < md.length) {
|
||||
const bang = md.indexOf('![', i)
|
||||
if (bang === -1) break
|
||||
const closeBracket = md.indexOf(']', bang + 2)
|
||||
if (closeBracket === -1) break
|
||||
if (md[closeBracket + 1] !== '(') {
|
||||
i = closeBracket + 1
|
||||
continue
|
||||
}
|
||||
|
||||
const urlStart = closeBracket + 2
|
||||
let depth = 1
|
||||
let j = urlStart
|
||||
for (; j < md.length; j++) {
|
||||
const ch = md[j]
|
||||
if (ch === '(') depth++
|
||||
else if (ch === ')') {
|
||||
depth--
|
||||
if (depth === 0) break
|
||||
}
|
||||
}
|
||||
if (depth !== 0) break
|
||||
results.push({ url: md.slice(urlStart, j), start: urlStart, end: j })
|
||||
i = j + 1
|
||||
}
|
||||
return results
|
||||
}
|
||||
|
||||
export const applyReplacements = (md: string, replacements: ImageReplacement[]): string => {
|
||||
const sorted = [...replacements].sort((a, b) => b.start - a.start)
|
||||
let result = md
|
||||
for (const r of sorted) {
|
||||
result = `${result.slice(0, r.start)}${r.replacement}${result.slice(r.end)}`
|
||||
}
|
||||
return result
|
||||
}
|
||||
|
||||
export const copyLocalImage = async (
|
||||
src: string,
|
||||
jobDir: string,
|
||||
htmlDir: string,
|
||||
destImagesDir: string,
|
||||
imagesSubDir: string,
|
||||
now: Date
|
||||
): Promise<{ newLink: string } | null> => {
|
||||
const s0 = src.trim().replace(/^<|>$/g, '')
|
||||
if (!s0) return null
|
||||
|
||||
let decoded = s0
|
||||
try {
|
||||
decoded = decodeURI(s0)
|
||||
} catch {}
|
||||
|
||||
const s1 = decoded.replace(/\\/g, '/')
|
||||
const s2 = s1.startsWith('./') ? s1.slice(2) : s1
|
||||
const candidates = s2.startsWith('/')
|
||||
? [path.join(jobDir, s2.slice(1)), path.join(htmlDir, s2.slice(1))]
|
||||
: [path.resolve(htmlDir, s2), path.resolve(jobDir, s2)]
|
||||
|
||||
let foundFile: string | null = null
|
||||
for (const c of candidates) {
|
||||
if (existsSync(c)) {
|
||||
foundFile = c
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if (!foundFile) return null
|
||||
|
||||
const ext = path.extname(foundFile) || '.jpg'
|
||||
const baseName = formatTimestamp(now)
|
||||
const newFilename = await getUniqueFilename(destImagesDir, baseName, ext)
|
||||
const newPath = path.join(destImagesDir, newFilename)
|
||||
await fs.copyFile(foundFile, newPath)
|
||||
|
||||
return { newLink: `/${imagesSubDir}/${newFilename}` }
|
||||
}
|
||||
|
||||
export const cleanupJob = async (jobDir: string, additionalPaths: string[] = []): Promise<void> => {
|
||||
await fs.rm(jobDir, { recursive: true, force: true }).catch(() => {})
|
||||
for (const p of additionalPaths) {
|
||||
await fs.unlink(p).catch(() => {})
|
||||
}
|
||||
}
|
||||
|
||||
export const getScriptPath = (toolName: string, scriptName: string): string => {
|
||||
return path.join(PROJECT_ROOT, 'tools', toolName, scriptName)
|
||||
}
|
||||
|
||||
export const ensureScriptExists = (scriptPath: string): boolean => {
|
||||
return existsSync(scriptPath)
|
||||
}
|
||||
23
api/modules/document-parser/index.ts
Normal file
23
api/modules/document-parser/index.ts
Normal file
@@ -0,0 +1,23 @@
|
||||
import express, { type Router } from 'express'
|
||||
import type { ServiceContainer } from '../../infra/container.js'
|
||||
import { createApiModule } from '../../infra/createModule.js'
|
||||
import { DOCUMENT_PARSER_MODULE } from '../../../shared/modules/document-parser/index.js'
|
||||
import blogRoutes from './blogRoutes.js'
|
||||
import mineruRoutes from './mineruRoutes.js'
|
||||
|
||||
export * from './documentParser.js'
|
||||
export { default as blogRoutes } from './blogRoutes.js'
|
||||
export { default as mineruRoutes } from './mineruRoutes.js'
|
||||
|
||||
/**
 * Builds the document-parser API module: mounts the blog HTML parser under
 * `/blog` and the MinerU PDF parser under `/mineru`. The service container is
 * accepted for interface parity but not used by these routes.
 */
export const createDocumentParserModule = () => {
  return createApiModule(DOCUMENT_PARSER_MODULE, {
    routes: (_container: ServiceContainer): Router => {
      const router = express.Router()
      router.use('/blog', blogRoutes)
      router.use('/mineru', mineruRoutes)
      return router
    },
  })
}

export default createDocumentParserModule
|
||||
158
api/modules/document-parser/mineruRoutes.ts
Normal file
158
api/modules/document-parser/mineruRoutes.ts
Normal file
@@ -0,0 +1,158 @@
|
||||
import express, { type Request, type Response } from 'express'
|
||||
import multer from 'multer'
|
||||
import path from 'path'
|
||||
import fs from 'fs/promises'
|
||||
import { existsSync } from 'fs'
|
||||
import { asyncHandler } from '../../utils/asyncHandler.js'
|
||||
import { successResponse } from '../../utils/response.js'
|
||||
import { resolveNotebookPath } from '../../utils/pathSafety.js'
|
||||
import { getUniqueFilename } from '../../utils/file.js'
|
||||
import { formatTimestamp } from '../../../shared/utils/date.js'
|
||||
import { getTempDir } from '../../utils/tempDir.js'
|
||||
import {
|
||||
createJobContext,
|
||||
spawnPythonScript,
|
||||
findImageDestinations,
|
||||
applyReplacements,
|
||||
cleanupJob,
|
||||
getScriptPath,
|
||||
ensureScriptExists,
|
||||
} from './documentParser.js'
|
||||
import type { ImageReplacement } from './documentParser.js'
|
||||
import { ValidationError, InternalError } from '../../../shared/errors/index.js'
|
||||
import { logger } from '../../utils/logger.js'
|
||||
|
||||
const router = express.Router()

// Uploads are staged in the shared temp directory before parsing.
const tempDir = getTempDir()

// Multer writes the raw upload to disk; cap file size at 50 MB.
const upload = multer({
  dest: tempDir,
  limits: {
    fileSize: 50 * 1024 * 1024
  }
})
|
||||
|
||||
/**
 * POST /parse
 * Accepts a single uploaded file (multipart field `file`) plus a
 * notebook-relative `targetPath`, then parses the PDF in the background.
 * Responds immediately with `status: 'processing'`. Every early-exit path
 * removes the uploaded temp file before throwing.
 */
router.post(
  '/parse',
  upload.single('file'),
  asyncHandler(async (req: Request, res: Response) => {
    if (!req.file) {
      throw new ValidationError('File is required')
    }

    const { targetPath } = req.body as { targetPath?: string }
    if (!targetPath) {
      await fs.unlink(req.file.path).catch(() => {})
      throw new ValidationError('Target path is required')
    }

    // Path-safety check: resolveNotebookPath throws for targets that escape
    // the notebook root; discard the upload before rethrowing.
    let fullTargetPath: string
    try {
      const resolved = resolveNotebookPath(targetPath)
      fullTargetPath = resolved.fullPath
    } catch (error) {
      await fs.unlink(req.file.path).catch(() => {})
      throw error
    }

    const scriptPath = getScriptPath('mineru', 'mineru_parser.py')
    if (!ensureScriptExists(scriptPath)) {
      await fs.unlink(req.file.path).catch(() => {})
      throw new InternalError('Parser script not found')
    }

    // Fire-and-forget; failures are surfaced by writing an error note to the
    // target markdown file.
    processPdfInBackground(req.file.path, fullTargetPath, path.dirname(scriptPath))
      .catch(err => {
        logger.error('Background PDF processing failed:', err)
        fs.writeFile(fullTargetPath, `# 解析失败\n\n> 错误信息: ${err.message}`, 'utf-8').catch(() => {})
      })

    successResponse(res, {
      message: 'PDF upload successful. Parsing started in background.',
      status: 'processing'
    })
  }),
)
|
||||
|
||||
async function processPdfInBackground(filePath: string, targetPath: string, cwd: string) {
|
||||
try {
|
||||
const output = await spawnPythonScript({
|
||||
scriptPath: 'mineru_parser.py',
|
||||
args: [filePath],
|
||||
cwd,
|
||||
})
|
||||
|
||||
const match = output.match(/JSON_RESULT:(.*)/)
|
||||
if (!match) {
|
||||
throw new Error('Failed to parse Python script output: JSON_RESULT not found')
|
||||
}
|
||||
|
||||
const result = JSON.parse(match[1])
|
||||
const markdownPath = result.markdown_file
|
||||
const outputDir = result.output_dir
|
||||
|
||||
if (!existsSync(markdownPath)) {
|
||||
throw new Error('Markdown result file not found')
|
||||
}
|
||||
|
||||
let mdContent = await fs.readFile(markdownPath, 'utf-8')
|
||||
|
||||
const imagesDir = path.join(outputDir, 'images')
|
||||
if (existsSync(imagesDir)) {
|
||||
const jobContext = await createJobContext('pdf_images')
|
||||
|
||||
const destinations = findImageDestinations(mdContent)
|
||||
const replacements: ImageReplacement[] = []
|
||||
|
||||
for (const dest of destinations) {
|
||||
const originalSrc = dest.url
|
||||
if (!originalSrc) continue
|
||||
|
||||
const possibleFilenames = [originalSrc, path.basename(originalSrc)]
|
||||
let foundFile: string | null = null
|
||||
|
||||
for (const fname of possibleFilenames) {
|
||||
const localPath = path.join(imagesDir, fname)
|
||||
if (existsSync(localPath)) {
|
||||
foundFile = localPath
|
||||
break
|
||||
}
|
||||
|
||||
const directPath = path.join(outputDir, originalSrc)
|
||||
if (existsSync(directPath)) {
|
||||
foundFile = directPath
|
||||
break
|
||||
}
|
||||
}
|
||||
|
||||
if (foundFile) {
|
||||
const ext = path.extname(foundFile)
|
||||
const baseName = formatTimestamp(jobContext.now)
|
||||
const newFilename = await getUniqueFilename(jobContext.destImagesDir, baseName, ext)
|
||||
const newPath = path.join(jobContext.destImagesDir, newFilename)
|
||||
await fs.copyFile(foundFile, newPath)
|
||||
replacements.push({
|
||||
start: dest.start,
|
||||
end: dest.end,
|
||||
original: originalSrc,
|
||||
replacement: `${jobContext.imagesSubDir}/${newFilename}`
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
mdContent = applyReplacements(mdContent, replacements)
|
||||
}
|
||||
|
||||
await fs.writeFile(targetPath, mdContent, 'utf-8')
|
||||
await fs.unlink(markdownPath).catch(() => {})
|
||||
|
||||
if (outputDir && outputDir.includes('temp')) {
|
||||
await fs.rm(outputDir, { recursive: true, force: true }).catch(() => {})
|
||||
}
|
||||
} finally {
|
||||
await fs.unlink(filePath).catch(() => {})
|
||||
}
|
||||
}
|
||||
|
||||
export default router
|
||||
Reference in New Issue
Block a user