Files
XCDesktop/api/modules/document-parser/blogRoutes.ts
2026-03-08 01:34:54 +08:00

218 lines
7.0 KiB
TypeScript

import express, { type Request, type Response } from 'express'
import path from 'path'
import fs from 'fs/promises'
import { existsSync } from 'fs'
import axios from 'axios'
import { asyncHandler } from '../../utils/asyncHandler.js'
import { successResponse } from '../../utils/response.js'
import { resolveNotebookPath } from '../../utils/pathSafety.js'
import { getUniqueFilename } from '../../utils/file.js'
import { formatTimestamp } from '../../../shared/utils/date.js'
import { getTempDir } from '../../utils/tempDir.js'
import {
createJobContext,
spawnPythonScript,
findImageDestinations,
applyReplacements,
copyLocalImage,
cleanupJob,
getScriptPath,
ensureScriptExists,
} from './documentParser.js'
import type { ImageReplacement } from './documentParser.js'
import { ValidationError, InternalError } from '../../../shared/errors/index.js'
import { logger } from '../../utils/logger.js'
// Express router that carries the blog/HTML parsing endpoints registered below.
const router = express.Router()
// Temp directory resolved once at module load. It is not referenced in the
// visible part of this file; the call is kept because getTempDir() may have
// side effects — NOTE(review): confirm before removing.
const tempDir = getTempDir()
/**
 * Reports whether `candidate` resolves to `parent` itself or to a location beneath it.
 * Used to stop request-supplied relative paths from escaping the job directory.
 */
function isInsideDir(parent: string, candidate: string): boolean {
  const rel = path.relative(path.resolve(parent), path.resolve(candidate))
  // An absolute result means the two paths live on different roots/drives.
  const escapes = rel === '..' || rel.startsWith(`..${path.sep}`) || path.isAbsolute(rel)
  return !escapes
}

/**
 * POST /parse-local
 *
 * Stages a local HTML file (plus optional asset files) into a fresh job directory,
 * starts the Markdown conversion in the background and answers immediately with a
 * "processing" status.
 *
 * Body fields:
 * - `htmlPath`, `htmlDir`, `targetPath` — required, non-empty strings.
 * - `assetsDirName`, `assetsFiles` — optional; each listed file is looked up under
 *   `htmlDir/assetsDirName` and mirrored into the job directory when it exists.
 *
 * Throws `ValidationError` when a required field is missing or not a string, and
 * `InternalError` when the parser script cannot be found. Errors raised by
 * `resolveNotebookPath` propagate unchanged.
 */
router.post(
  '/parse-local',
  asyncHandler(async (req: Request, res: Response) => {
    // req.body is untrusted and may be absent when no body parser matched.
    const { htmlPath, htmlDir, assetsDirName, assetsFiles, targetPath } = (req.body ?? {}) as {
      htmlPath?: string
      htmlDir?: string
      assetsDirName?: string
      assetsFiles?: string[]
      targetPath?: string
    }
    if (
      typeof htmlPath !== 'string' || !htmlPath ||
      typeof htmlDir !== 'string' || !htmlDir ||
      typeof targetPath !== 'string' || !targetPath
    ) {
      throw new ValidationError('htmlPath, htmlDir and targetPath are required')
    }

    const fullTargetPath = resolveNotebookPath(targetPath).fullPath

    const scriptPath = getScriptPath('blog', 'parse_blog.py')
    if (!ensureScriptExists(scriptPath)) {
      throw new InternalError('Parser script not found')
    }

    const jobContext = await createJobContext('blog')
    const htmlPathInJob = path.join(jobContext.jobDir, 'input.html')
    try {
      await fs.copyFile(htmlPath, htmlPathInJob)
      if (typeof assetsDirName === 'string' && assetsDirName && Array.isArray(assetsFiles)) {
        const assetsDirPath = path.join(htmlDir, assetsDirName)
        for (const relPath of assetsFiles) {
          if (typeof relPath !== 'string' || !relPath) continue
          const destPath = path.join(jobContext.jobDir, assetsDirName, relPath)
          // Source and destination share the same relative tail, so one containment
          // check on the destination rejects every `..` escape.
          if (!isInsideDir(jobContext.jobDir, destPath)) {
            logger.error('Skipping asset path that escapes the job directory:', relPath)
            continue
          }
          const srcPath = path.join(assetsDirPath, relPath)
          // Files listed by the client but missing on disk are skipped.
          if (!existsSync(srcPath)) continue
          await fs.mkdir(path.dirname(destPath), { recursive: true })
          await fs.copyFile(srcPath, destPath)
        }
      }
    } catch (err) {
      // Staging failed: drop the half-populated job directory before reporting.
      await cleanupJob(jobContext.jobDir)
      throw err
    }

    // Fire-and-forget: the response below is sent before the conversion finishes.
    void processHtmlInBackground({
      jobDir: jobContext.jobDir,
      htmlPath: htmlPathInJob,
      targetPath: fullTargetPath,
      cwd: path.dirname(scriptPath),
      jobContext,
      originalHtmlDir: htmlDir,
      originalAssetsDirName: assetsDirName,
    }).catch((err: unknown) => {
      const reason = err instanceof Error ? err.message : String(err)
      logger.error('Background HTML processing failed:', err)
      // Leave a visible failure notice in the target note.
      fs.writeFile(fullTargetPath, `# 解析失败\n\n> 错误信息: ${reason}`, 'utf-8').catch((writeErr: unknown) => {
        logger.error('Failed to write parse failure notice:', writeErr)
      })
      // Best-effort: the background task already attempts cleanup in its own `finally`.
      cleanupJob(jobContext.jobDir).catch(() => { })
    })

    successResponse(res, {
      message: 'HTML parsing started in background.',
      status: 'processing'
    })
  }),
)
/**
 * Arguments for the background HTML-to-Markdown conversion.
 */
interface ProcessHtmlArgs {
  /** Job working directory; used for local image lookup and removed when processing ends. */
  jobDir: string
  /** HTML file handed to the parser script; the Markdown result is expected next to it. */
  htmlPath: string
  /** File that receives the final Markdown. */
  targetPath: string
  /** Working directory for the parser script process. */
  cwd: string
  /** Resolved job context (supplies `now`, `destImagesDir` and `imagesSubDir`). */
  jobContext: Awaited<ReturnType<typeof createJobContext>>
  /** Directory of the user's original HTML file; fallback location for images. */
  originalHtmlDir?: string
  /** Name of the assets folder inside `originalHtmlDir`. */
  originalAssetsDirName?: string
}
/** Image extensions that are trusted when found in a remote URL's path component. */
const REMOTE_IMAGE_EXTENSIONS = new Set([
  '.png', '.jpg', '.jpeg', '.gif', '.webp', '.svg', '.bmp', '.ico', '.avif',
])

/**
 * Chooses the file extension for a downloaded image.
 *
 * A recognised image extension in the URL's path wins (original casing kept);
 * otherwise the Content-Type header decides; `.jpg` is the final fallback.
 * Only the parsed pathname is inspected, so host names, query strings and
 * fragments can never be mistaken for an extension.
 */
function pickRemoteImageExtension(url: string, contentType: unknown): string {
  let urlExt = ''
  try {
    urlExt = path.posix.extname(new URL(url).pathname)
  } catch {
    // Unparseable URL: rely on the Content-Type header instead.
  }
  if (REMOTE_IMAGE_EXTENSIONS.has(urlExt.toLowerCase())) return urlExt

  const mime = typeof contentType === 'string' ? contentType.toLowerCase() : ''
  if (mime.includes('png')) return '.png'
  if (mime.includes('gif')) return '.gif'
  if (mime.includes('webp')) return '.webp'
  if (mime.includes('svg')) return '.svg'
  return '.jpg'
}

/**
 * Runs the blog parser script on the staged HTML file, localises every image the
 * resulting Markdown references, and writes the final Markdown to `targetPath`.
 *
 * - Remote (`http`/`https`) images are downloaded into the job's image directory;
 *   a failed download leaves the original link untouched.
 * - `data:` URIs are left as they are.
 * - Other links are resolved via `copyLocalImage`, then against the user's original
 *   assets folder when that yields nothing.
 *
 * The job directory is cleaned up whether processing succeeds or fails.
 *
 * @throws Error when the parser script produced no Markdown file; failures from
 *   the script run or from file operations propagate to the caller.
 */
async function processHtmlInBackground(args: ProcessHtmlArgs): Promise<void> {
  const { jobDir, htmlPath, targetPath, cwd, jobContext: ctx, originalHtmlDir, originalAssetsDirName } = args
  try {
    await spawnPythonScript({
      scriptPath: 'parse_blog.py',
      args: [htmlPath],
      cwd,
    })

    // The script is expected to write `<name>.md` beside the input HTML.
    const parsedPathObj = path.parse(htmlPath)
    const markdownPath = path.join(parsedPathObj.dir, `${parsedPathObj.name}.md`)
    if (!existsSync(markdownPath)) {
      throw new Error('Markdown result file not found')
    }

    let mdContent = await fs.readFile(markdownPath, 'utf-8')
    const htmlDir = path.dirname(htmlPath)
    const replacements: ImageReplacement[] = []

    // Images are handled one at a time so each unique-filename lookup sees the
    // files written by the previous iteration.
    for (const dest of findImageDestinations(mdContent)) {
      const originalSrc = dest.url
      if (!originalSrc) continue

      if (originalSrc.startsWith('http://') || originalSrc.startsWith('https://')) {
        try {
          const response = await axios.get(originalSrc, { responseType: 'arraybuffer', timeout: 10000 })
          const ext = pickRemoteImageExtension(originalSrc, response.headers['content-type'])
          const baseName = formatTimestamp(ctx.now)
          const newFilename = await getUniqueFilename(ctx.destImagesDir, baseName, ext)
          await fs.writeFile(path.join(ctx.destImagesDir, newFilename), response.data)
          replacements.push({
            start: dest.start,
            end: dest.end,
            original: originalSrc,
            replacement: `/${ctx.imagesSubDir}/${newFilename}`
          })
        } catch {
          // Best-effort: an image that cannot be downloaded or saved keeps its remote link.
        }
        continue
      }

      if (originalSrc.startsWith('data:')) continue

      let result = await copyLocalImage(
        originalSrc,
        jobDir,
        htmlDir,
        ctx.destImagesDir,
        ctx.imagesSubDir,
        ctx.now
      )

      if (!result && originalHtmlDir && originalAssetsDirName) {
        // Fallback: look in the user's original assets folder, first by relative
        // path and then by bare file name.
        const srcWithFiles = originalSrc.replace(/^\.\//, '').replace(/^\//, '')
        const possiblePaths = [
          path.join(originalHtmlDir, originalAssetsDirName, srcWithFiles),
          path.join(originalHtmlDir, originalAssetsDirName, path.basename(srcWithFiles)),
        ]
        for (const p of possiblePaths) {
          if (!existsSync(p)) continue
          const ext = path.extname(p) || '.jpg'
          const baseName = formatTimestamp(ctx.now)
          const newFilename = await getUniqueFilename(ctx.destImagesDir, baseName, ext)
          await fs.copyFile(p, path.join(ctx.destImagesDir, newFilename))
          result = { newLink: `/${ctx.imagesSubDir}/${newFilename}` }
          break
        }
      }

      if (result) {
        replacements.push({
          start: dest.start,
          end: dest.end,
          original: originalSrc,
          replacement: result.newLink
        })
      }
    }

    mdContent = applyReplacements(mdContent, replacements)
    await fs.writeFile(targetPath, mdContent, 'utf-8')
    // The intermediate Markdown is no longer needed; a failed delete is harmless
    // because the job directory is cleaned up below.
    await fs.unlink(markdownPath).catch(() => { })
  } finally {
    await cleanupJob(jobDir)
  }
}
// Expose the configured router as this module's default export.
export default router