import express, { type Request, type Response } from 'express'
|
|
import path from 'path'
|
|
import fs from 'fs/promises'
|
|
import { existsSync } from 'fs'
|
|
import axios from 'axios'
|
|
import { asyncHandler } from '../../utils/asyncHandler.js'
|
|
import { successResponse } from '../../utils/response.js'
|
|
import { resolveNotebookPath } from '../../utils/pathSafety.js'
|
|
import { getUniqueFilename } from '../../utils/file.js'
|
|
import { formatTimestamp } from '../../../shared/utils/date.js'
|
|
import { getTempDir } from '../../utils/tempDir.js'
|
|
import {
|
|
createJobContext,
|
|
spawnPythonScript,
|
|
findImageDestinations,
|
|
applyReplacements,
|
|
copyLocalImage,
|
|
cleanupJob,
|
|
getScriptPath,
|
|
ensureScriptExists,
|
|
} from './documentParser.js'
|
|
import type { ImageReplacement } from './documentParser.js'
|
|
import { ValidationError, InternalError } from '../../../shared/errors/index.js'
|
|
import { logger } from '../../utils/logger.js'
|
|
|
|
// Router for the blog HTML → Markdown parsing endpoints.
const router = express.Router()

// NOTE(review): tempDir appears unused anywhere in this file — confirm
// against other chunks/callers before removing it and its import.
const tempDir = getTempDir()
|
|
|
|
router.post(
|
|
'/parse-local',
|
|
asyncHandler(async (req: Request, res: Response) => {
|
|
const { htmlPath, htmlDir, assetsDirName, assetsFiles, targetPath } = req.body as {
|
|
htmlPath?: string
|
|
htmlDir?: string
|
|
assetsDirName?: string
|
|
assetsFiles?: string[]
|
|
targetPath?: string
|
|
}
|
|
|
|
if (!htmlPath || !htmlDir || !targetPath) {
|
|
throw new ValidationError('htmlPath, htmlDir and targetPath are required')
|
|
}
|
|
|
|
let fullTargetPath: string
|
|
try {
|
|
const resolved = resolveNotebookPath(targetPath)
|
|
fullTargetPath = resolved.fullPath
|
|
} catch (error) {
|
|
throw error
|
|
}
|
|
|
|
const scriptPath = getScriptPath('blog', 'parse_blog.py')
|
|
if (!ensureScriptExists(scriptPath)) {
|
|
throw new InternalError('Parser script not found')
|
|
}
|
|
|
|
const jobContext = await createJobContext('blog')
|
|
|
|
let htmlPathInJob = ''
|
|
try {
|
|
htmlPathInJob = path.join(jobContext.jobDir, 'input.html')
|
|
await fs.copyFile(htmlPath, htmlPathInJob)
|
|
|
|
if (assetsDirName && assetsFiles && assetsFiles.length > 0) {
|
|
const assetsDirPath = path.join(htmlDir, assetsDirName)
|
|
for (const relPath of assetsFiles) {
|
|
const srcPath = path.join(assetsDirPath, relPath)
|
|
if (existsSync(srcPath)) {
|
|
const destPath = path.join(jobContext.jobDir, assetsDirName, relPath)
|
|
await fs.mkdir(path.dirname(destPath), { recursive: true })
|
|
await fs.copyFile(srcPath, destPath)
|
|
}
|
|
}
|
|
}
|
|
} catch (err) {
|
|
await cleanupJob(jobContext.jobDir)
|
|
throw err
|
|
}
|
|
|
|
processHtmlInBackground({
|
|
jobDir: jobContext.jobDir,
|
|
htmlPath: htmlPathInJob,
|
|
targetPath: fullTargetPath,
|
|
cwd: path.dirname(scriptPath),
|
|
jobContext,
|
|
originalHtmlDir: htmlDir,
|
|
originalAssetsDirName: assetsDirName,
|
|
}).catch(err => {
|
|
logger.error('Background HTML processing failed:', err)
|
|
fs.writeFile(fullTargetPath, `# 解析失败\n\n> 错误信息: ${err.message}`, 'utf-8').catch(() => { })
|
|
cleanupJob(jobContext.jobDir).catch(() => { })
|
|
})
|
|
|
|
successResponse(res, {
|
|
message: 'HTML parsing started in background.',
|
|
status: 'processing'
|
|
})
|
|
}),
|
|
)
|
|
|
|
interface ProcessHtmlArgs {
|
|
jobDir: string
|
|
htmlPath: string
|
|
targetPath: string
|
|
cwd: string
|
|
jobContext: ReturnType<typeof createJobContext> extends Promise<infer T> ? T : never
|
|
originalHtmlDir?: string
|
|
originalAssetsDirName?: string
|
|
}
|
|
|
|
async function processHtmlInBackground(args: ProcessHtmlArgs) {
|
|
const { jobDir, htmlPath, targetPath, cwd, jobContext, originalHtmlDir, originalAssetsDirName } = args
|
|
try {
|
|
await spawnPythonScript({
|
|
scriptPath: 'parse_blog.py',
|
|
args: [htmlPath],
|
|
cwd,
|
|
})
|
|
|
|
const parsedPathObj = path.parse(htmlPath)
|
|
const markdownPath = path.join(parsedPathObj.dir, `${parsedPathObj.name}.md`)
|
|
|
|
if (!existsSync(markdownPath)) {
|
|
throw new Error('Markdown result file not found')
|
|
}
|
|
|
|
let mdContent = await fs.readFile(markdownPath, 'utf-8')
|
|
const ctx = await jobContext
|
|
|
|
const htmlDir = path.dirname(htmlPath)
|
|
const replacements: ImageReplacement[] = []
|
|
|
|
const destinations = findImageDestinations(mdContent)
|
|
for (const dest of destinations) {
|
|
const originalSrc = dest.url
|
|
if (!originalSrc) continue
|
|
|
|
if (originalSrc.startsWith('http://') || originalSrc.startsWith('https://')) {
|
|
try {
|
|
const response = await axios.get(originalSrc, { responseType: 'arraybuffer', timeout: 10000 })
|
|
const contentType = response.headers['content-type']
|
|
let ext = '.jpg'
|
|
if (contentType) {
|
|
if (contentType.includes('png')) ext = '.png'
|
|
else if (contentType.includes('gif')) ext = '.gif'
|
|
else if (contentType.includes('webp')) ext = '.webp'
|
|
else if (contentType.includes('svg')) ext = '.svg'
|
|
else if (contentType.includes('jpeg') || contentType.includes('jpg')) ext = '.jpg'
|
|
}
|
|
const urlExt = path.extname(originalSrc.split('?')[0])
|
|
if (urlExt) ext = urlExt
|
|
|
|
const baseName = formatTimestamp(ctx.now)
|
|
const newFilename = await getUniqueFilename(ctx.destImagesDir, baseName, ext)
|
|
const newPath = path.join(ctx.destImagesDir, newFilename)
|
|
await fs.writeFile(newPath, response.data)
|
|
replacements.push({
|
|
start: dest.start,
|
|
end: dest.end,
|
|
original: originalSrc,
|
|
replacement: `/${ctx.imagesSubDir}/${newFilename}`
|
|
})
|
|
} catch { }
|
|
continue
|
|
}
|
|
|
|
if (originalSrc.startsWith('data:')) continue
|
|
|
|
let result = await copyLocalImage(
|
|
originalSrc,
|
|
jobDir,
|
|
htmlDir,
|
|
ctx.destImagesDir,
|
|
ctx.imagesSubDir,
|
|
ctx.now
|
|
)
|
|
|
|
if (!result && originalHtmlDir && originalAssetsDirName) {
|
|
const srcWithFiles = originalSrc.replace(/^\.\//, '').replace(/^\//, '')
|
|
const possiblePaths = [
|
|
path.join(originalHtmlDir, originalAssetsDirName, srcWithFiles),
|
|
path.join(originalHtmlDir, originalAssetsDirName, path.basename(srcWithFiles)),
|
|
]
|
|
for (const p of possiblePaths) {
|
|
if (existsSync(p)) {
|
|
const ext = path.extname(p) || '.jpg'
|
|
const baseName = formatTimestamp(ctx.now)
|
|
const newFilename = await getUniqueFilename(ctx.destImagesDir, baseName, ext)
|
|
const newPath = path.join(ctx.destImagesDir, newFilename)
|
|
await fs.copyFile(p, newPath)
|
|
result = { newLink: `/${ctx.imagesSubDir}/${newFilename}` }
|
|
break
|
|
}
|
|
}
|
|
}
|
|
|
|
if (result) {
|
|
replacements.push({
|
|
start: dest.start,
|
|
end: dest.end,
|
|
original: originalSrc,
|
|
replacement: result.newLink
|
|
})
|
|
}
|
|
}
|
|
|
|
mdContent = applyReplacements(mdContent, replacements)
|
|
|
|
await fs.writeFile(targetPath, mdContent, 'utf-8')
|
|
await fs.unlink(markdownPath).catch(() => { })
|
|
} finally {
|
|
await cleanupJob(jobDir)
|
|
}
|
|
}
|
|
|
|
// NOTE(review): default export kept for existing callers; named exports are
// generally preferred for grep-ability.
export default router
|