Initial commit
This commit is contained in:
217
api/modules/document-parser/blogRoutes.ts
Normal file
217
api/modules/document-parser/blogRoutes.ts
Normal file
@@ -0,0 +1,217 @@
|
||||
import express, { type Request, type Response } from 'express'
|
||||
import path from 'path'
|
||||
import fs from 'fs/promises'
|
||||
import { existsSync } from 'fs'
|
||||
import axios from 'axios'
|
||||
import { asyncHandler } from '../../utils/asyncHandler.js'
|
||||
import { successResponse } from '../../utils/response.js'
|
||||
import { resolveNotebookPath } from '../../utils/pathSafety.js'
|
||||
import { getUniqueFilename } from '../../utils/file.js'
|
||||
import { formatTimestamp } from '../../../shared/utils/date.js'
|
||||
import { getTempDir } from '../../utils/tempDir.js'
|
||||
import {
|
||||
createJobContext,
|
||||
spawnPythonScript,
|
||||
findImageDestinations,
|
||||
applyReplacements,
|
||||
copyLocalImage,
|
||||
cleanupJob,
|
||||
getScriptPath,
|
||||
ensureScriptExists,
|
||||
} from './documentParser.js'
|
||||
import type { ImageReplacement } from './documentParser.js'
|
||||
import { ValidationError, InternalError } from '../../../shared/errors/index.js'
|
||||
import { logger } from '../../utils/logger.js'
|
||||
|
||||
// Express router that carries the blog/HTML import endpoints registered in this module.
const router = express.Router()
|
||||
|
||||
// NOTE(review): `tempDir` is not referenced anywhere in the visible part of this file.
// getTempDir() may create the directory as a side effect — confirm before removing this line.
const tempDir = getTempDir()
|
||||
|
||||
/**
 * Reports whether `candidate` resolves to a location strictly inside `parentDir`.
 *
 * Both paths are resolved first, so `..` segments are normalised before comparing.
 * Returns false when `candidate` is `parentDir` itself or lies on another drive.
 */
function isPathInside(parentDir: string, candidate: string): boolean {
  const rel = path.relative(path.resolve(parentDir), path.resolve(candidate))
  return rel !== '' && rel !== '..' && !rel.startsWith(`..${path.sep}`) && !path.isAbsolute(rel)
}

/**
 * POST /parse-local
 *
 * Stages a local HTML file (and, optionally, files from its assets folder) into a fresh
 * job directory, starts the markdown conversion in the background and answers immediately
 * with `status: 'processing'`.
 *
 * Request body:
 * - `htmlPath`      (required) path of the HTML file to convert
 * - `htmlDir`       (required) directory that contains the HTML file
 * - `targetPath`    (required) notebook path the markdown is written to; resolved through
 *                   `resolveNotebookPath`, whose errors propagate unchanged
 * - `assetsDirName` (optional) name of the assets folder next to the HTML file
 * - `assetsFiles`   (optional) paths, relative to the assets folder, of the files to stage
 *
 * Throws `ValidationError` for missing or malformed fields and for asset entries that
 * would be written outside the job directory; throws `InternalError` when the parser
 * script is missing. If the background conversion fails, an error note is written to the
 * target file instead of the converted markdown.
 */
router.post(
  '/parse-local',
  asyncHandler(async (req: Request, res: Response) => {
    // req.body is untrusted JSON: the cast only names the expected shape, the checks
    // below are what actually enforce it.
    const { htmlPath, htmlDir, assetsDirName, assetsFiles, targetPath } = (req.body ?? {}) as {
      htmlPath?: string
      htmlDir?: string
      assetsDirName?: string
      assetsFiles?: string[]
      targetPath?: string
    }

    if (
      typeof htmlPath !== 'string' || !htmlPath ||
      typeof htmlDir !== 'string' || !htmlDir ||
      typeof targetPath !== 'string' || !targetPath
    ) {
      throw new ValidationError('htmlPath, htmlDir and targetPath are required')
    }
    if (assetsDirName != null && typeof assetsDirName !== 'string') {
      throw new ValidationError('assetsDirName must be a string')
    }
    if (
      assetsFiles != null &&
      (!Array.isArray(assetsFiles) || assetsFiles.some(entry => typeof entry !== 'string'))
    ) {
      throw new ValidationError('assetsFiles must be an array of strings')
    }

    // Any error raised by resolveNotebookPath is deliberately left to propagate.
    const { fullPath: fullTargetPath } = resolveNotebookPath(targetPath)

    const scriptPath = getScriptPath('blog', 'parse_blog.py')
    if (!ensureScriptExists(scriptPath)) {
      throw new InternalError('Parser script not found')
    }

    const jobContext = await createJobContext('blog')
    const { jobDir } = jobContext
    const htmlPathInJob = path.join(jobDir, 'input.html')

    try {
      await fs.copyFile(htmlPath, htmlPathInJob)

      if (assetsDirName && assetsFiles && assetsFiles.length > 0) {
        const assetsDirPath = path.join(htmlDir, assetsDirName)
        for (const relPath of assetsFiles) {
          const destPath = path.join(jobDir, assetsDirName, relPath)
          // `assetsDirName` / `relPath` come from the client; `..` segments must not let
          // the copy land outside the job directory or replace the staged HTML file.
          if (!isPathInside(jobDir, destPath) || path.resolve(destPath) === path.resolve(htmlPathInJob)) {
            throw new ValidationError(`Invalid asset path: ${relPath}`)
          }

          const srcPath = path.join(assetsDirPath, relPath)
          // Assets listed by the client but missing on disk are skipped, not treated as errors.
          if (!existsSync(srcPath)) continue

          await fs.mkdir(path.dirname(destPath), { recursive: true })
          await fs.copyFile(srcPath, destPath)
        }
      }
    } catch (err) {
      // Staging failed: drop the half-filled job directory before surfacing the error.
      await cleanupJob(jobDir)
      throw err
    }

    // Fire-and-forget: the HTTP response does not wait for the conversion to finish.
    void processHtmlInBackground({
      jobDir,
      htmlPath: htmlPathInJob,
      targetPath: fullTargetPath,
      cwd: path.dirname(scriptPath),
      jobContext,
      originalHtmlDir: htmlDir,
      originalAssetsDirName: assetsDirName,
    }).catch((err: unknown) => {
      logger.error('Background HTML processing failed:', err)
      const reason = err instanceof Error ? err.message : String(err)
      // Best effort: leave a visible failure note in the target file, then clean up.
      fs.writeFile(fullTargetPath, `# 解析失败\n\n> 错误信息: ${reason}`, 'utf-8').catch(() => { })
      cleanupJob(jobDir).catch(() => { })
    })

    successResponse(res, {
      message: 'HTML parsing started in background.',
      status: 'processing'
    })
  }),
)
|
||||
|
||||
/** Arguments accepted by `processHtmlInBackground`. */
interface ProcessHtmlArgs {
  /** Job working directory; it is removed with `cleanupJob` once processing ends. */
  jobDir: string
  /** HTML file passed to the parser script; the markdown result is expected next to it. */
  htmlPath: string
  /** File the final markdown is written to. */
  targetPath: string
  /** Working directory used when spawning the parser script. */
  cwd: string
  /** Resolved value of `createJobContext()`. */
  jobContext: Awaited<ReturnType<typeof createJobContext>>
  /** Directory of the HTML file as supplied by the client; used as an image lookup fallback. */
  originalHtmlDir?: string
  /** Name of the assets folder inside `originalHtmlDir`; used together with it. */
  originalAssetsDirName?: string
}
|
||||
|
||||
/** Extensions that are taken verbatim (lower-cased) from a remote image URL's path. */
const REMOTE_IMAGE_EXTENSIONS: ReadonlySet<string> = new Set([
  '.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.bmp', '.ico', '.avif', '.tif', '.tiff',
])

/** Content-Type fragments and the extension each maps to, checked in this order. */
const CONTENT_TYPE_EXTENSIONS: ReadonlyArray<readonly [string, string]> = [
  ['png', '.png'],
  ['gif', '.gif'],
  ['webp', '.webp'],
  ['svg', '.svg'],
  ['jpeg', '.jpg'],
  ['jpg', '.jpg'],
]

/**
 * Chooses the file extension for a downloaded remote image.
 *
 * Order of preference: a recognised image extension in the URL *path* (query string,
 * fragment and host name are ignored), then the Content-Type header, then `.jpg`.
 */
function pickRemoteImageExtension(url: string, contentType: unknown): string {
  let urlExt = ''
  try {
    urlExt = path.posix.extname(new URL(url).pathname).toLowerCase()
  } catch {
    // Unparseable URL: fall through to the Content-Type header.
  }
  if (REMOTE_IMAGE_EXTENSIONS.has(urlExt)) return urlExt

  if (typeof contentType === 'string') {
    const normalized = contentType.toLowerCase()
    const match = CONTENT_TYPE_EXTENSIONS.find(([fragment]) => normalized.includes(fragment))
    if (match) return match[1]
  }
  return '.jpg'
}

/**
 * Converts the staged HTML file to markdown and writes the result to `targetPath`.
 *
 * Steps:
 * 1. Run `parse_blog.py` on `htmlPath`; it must leave `<name>.md` next to the HTML file.
 * 2. For every image destination found in the markdown:
 *    - http(s) URLs are downloaded into `jobContext.destImagesDir`; a failed download is
 *      logged and the original link is kept.
 *    - `data:` URIs are left untouched.
 *    - anything else is copied with `copyLocalImage`, falling back to the client's
 *      original assets folder when that yields nothing.
 * 3. Rewrite the image links and write the markdown to `targetPath`.
 *
 * The job directory is removed in every case, success or failure.
 *
 * @throws Error when the parser script leaves no markdown file; other errors from the
 *         script or the file system propagate unchanged.
 */
async function processHtmlInBackground(args: ProcessHtmlArgs): Promise<void> {
  const { jobDir, htmlPath, targetPath, cwd, jobContext: ctx, originalHtmlDir, originalAssetsDirName } = args
  try {
    await spawnPythonScript({
      scriptPath: 'parse_blog.py',
      args: [htmlPath],
      cwd,
    })

    const parsedPathObj = path.parse(htmlPath)
    const markdownPath = path.join(parsedPathObj.dir, `${parsedPathObj.name}.md`)
    if (!existsSync(markdownPath)) {
      throw new Error('Markdown result file not found')
    }

    let mdContent = await fs.readFile(markdownPath, 'utf-8')
    const htmlDir = path.dirname(htmlPath)
    const replacements: ImageReplacement[] = []

    // Images are handled one at a time: every new file name is derived from the same
    // timestamp and made unique by getUniqueFilename against what is already on disk.
    for (const dest of findImageDestinations(mdContent)) {
      const originalSrc = dest.url
      if (!originalSrc) continue

      if (originalSrc.startsWith('http://') || originalSrc.startsWith('https://')) {
        // NOTE(review): the URL comes from the imported document and is fetched as-is.
        try {
          const response = await axios.get(originalSrc, { responseType: 'arraybuffer', timeout: 10000 })
          const ext = pickRemoteImageExtension(originalSrc, response.headers['content-type'])
          const newFilename = await getUniqueFilename(ctx.destImagesDir, formatTimestamp(ctx.now), ext)
          await fs.writeFile(path.join(ctx.destImagesDir, newFilename), response.data)
          replacements.push({
            start: dest.start,
            end: dest.end,
            original: originalSrc,
            replacement: `/${ctx.imagesSubDir}/${newFilename}`
          })
        } catch (err) {
          // Best effort: one unreachable image must not fail the whole import.
          logger.error(`Failed to download image ${originalSrc}; keeping the original link:`, err)
        }
        continue
      }

      if (originalSrc.startsWith('data:')) continue

      let result = await copyLocalImage(
        originalSrc,
        jobDir,
        htmlDir,
        ctx.destImagesDir,
        ctx.imagesSubDir,
        ctx.now
      )

      if (!result && originalHtmlDir && originalAssetsDirName) {
        // Fallback: look in the client's original assets folder, first under the same
        // relative path and then by bare file name.
        const srcWithFiles = originalSrc.replace(/^\.\//, '').replace(/^\//, '')
        const possiblePaths = [
          path.join(originalHtmlDir, originalAssetsDirName, srcWithFiles),
          path.join(originalHtmlDir, originalAssetsDirName, path.basename(srcWithFiles)),
        ]
        for (const candidate of possiblePaths) {
          if (!existsSync(candidate)) continue
          const ext = path.extname(candidate) || '.jpg'
          const newFilename = await getUniqueFilename(ctx.destImagesDir, formatTimestamp(ctx.now), ext)
          await fs.copyFile(candidate, path.join(ctx.destImagesDir, newFilename))
          result = { newLink: `/${ctx.imagesSubDir}/${newFilename}` }
          break
        }
      }

      if (result) {
        replacements.push({
          start: dest.start,
          end: dest.end,
          original: originalSrc,
          replacement: result.newLink
        })
      }
    }

    mdContent = applyReplacements(mdContent, replacements)

    await fs.writeFile(targetPath, mdContent, 'utf-8')
    await fs.unlink(markdownPath).catch(() => { })
  } finally {
    await cleanupJob(jobDir)
  }
}
|
||||
|
||||
// The router is this module's default export.
export default router
|
||||
Reference in New Issue
Block a user