import express, { type Request, type Response } from 'express'
import path from 'path'
import fs from 'fs/promises'
import { existsSync } from 'fs'
import axios from 'axios'
import { asyncHandler } from '../../utils/asyncHandler.js'
import { successResponse } from '../../utils/response.js'
import { resolveNotebookPath } from '../../utils/pathSafety.js'
import { getUniqueFilename } from '../../utils/file.js'
import { formatTimestamp } from '../../../shared/utils/date.js'
import { getTempDir } from '../../utils/tempDir.js'
import {
  createJobContext,
  spawnPythonScript,
  findImageDestinations,
  applyReplacements,
  copyLocalImage,
  cleanupJob,
  getScriptPath,
  ensureScriptExists,
} from './documentParser.js'
import type { ImageReplacement } from './documentParser.js'
import { ValidationError, InternalError } from '../../../shared/errors/index.js'
import { logger } from '../../utils/logger.js'

const router = express.Router()
// NOTE(review): not referenced anywhere in this file; kept because getTempDir()
// may have side effects at module load — confirm before removing.
const tempDir = getTempDir()

/** Resolved value of `createJobContext()` (job directory, image destination, timestamp). */
type JobContext = Awaited<ReturnType<typeof createJobContext>>

/** Timeout applied to each remote image download, in milliseconds. */
const REMOTE_IMAGE_TIMEOUT_MS = 10000

/** Extension used when neither the URL nor the Content-Type identifies the image format. */
const DEFAULT_IMAGE_EXT = '.jpg'

/** URL path extensions that are trusted as image file extensions. */
const KNOWN_IMAGE_EXTENSIONS = new Set([
  '.png',
  '.jpg',
  '.jpeg',
  '.gif',
  '.webp',
  '.svg',
  '.bmp',
  '.ico',
  '.avif',
  '.tif',
  '.tiff',
])

/** Content-Type substrings and the extension each maps to, checked in order. */
const CONTENT_TYPE_EXTENSIONS: ReadonlyArray<readonly [marker: string, ext: string]> = [
  ['png', '.png'],
  ['gif', '.gif'],
  ['webp', '.webp'],
  ['svg', '.svg'],
  ['jpeg', '.jpg'],
  ['jpg', '.jpg'],
]

/** Type guard: `value` is a string with at least one character. */
function isNonEmptyString(value: unknown): value is string {
  return typeof value === 'string' && value.length > 0
}

/** Returns true when `candidate` resolves to a location strictly inside `parentDir`. */
function isPathInside(parentDir: string, candidate: string): boolean {
  const rel = path.relative(path.resolve(parentDir), path.resolve(candidate))
  return rel !== '' && rel !== '..' && !rel.startsWith(`..${path.sep}`) && !path.isAbsolute(rel)
}

/**
 * Chooses a file extension for a downloaded image.
 *
 * Preference order: a known image extension on the URL *path* (query string and
 * fragment ignored), then the Content-Type header, then `.jpg`. Extensions that
 * are not image extensions (e.g. `.php`, or `.com` taken from a bare host name)
 * are never used.
 */
function pickImageExtension(url: string, contentType: unknown): string {
  let pathname: string
  try {
    pathname = new URL(url).pathname
  } catch {
    // Not parseable as a URL: fall back to stripping query/fragment by hand.
    pathname = url.split(/[?#]/)[0] ?? ''
  }
  const urlExt = path.posix.extname(pathname)
  if (urlExt && KNOWN_IMAGE_EXTENSIONS.has(urlExt.toLowerCase())) {
    return urlExt
  }

  if (typeof contentType === 'string') {
    const normalized = contentType.toLowerCase()
    const match = CONTENT_TYPE_EXTENSIONS.find(([marker]) => normalized.includes(marker))
    if (match) return match[1]
  }
  return DEFAULT_IMAGE_EXT
}

/**
 * Downloads a remote image into the job's image directory.
 *
 * Best-effort: a failed download is logged and reported as `undefined` so the
 * original link is left untouched in the Markdown.
 *
 * @returns the new site-relative link, or `undefined` if the download failed.
 */
async function downloadRemoteImage(url: string, ctx: JobContext): Promise<string | undefined> {
  try {
    const response = await axios.get(url, {
      responseType: 'arraybuffer',
      timeout: REMOTE_IMAGE_TIMEOUT_MS,
    })
    const ext = pickImageExtension(url, response.headers['content-type'])
    const newFilename = await getUniqueFilename(ctx.destImagesDir, formatTimestamp(ctx.now), ext)
    await fs.writeFile(path.join(ctx.destImagesDir, newFilename), response.data)
    return `/${ctx.imagesSubDir}/${newFilename}`
  } catch (err) {
    logger.error(`Failed to download image ${url}:`, err)
    return undefined
  }
}

/**
 * Looks for a local image in the original (pre-copy) assets directory, trying
 * the link's relative path first and then just its base name, and copies the
 * first match into the job's image directory.
 *
 * @returns the new site-relative link, or `undefined` if no candidate exists.
 */
async function copyFromOriginalAssets(
  src: string,
  originalHtmlDir: string,
  originalAssetsDirName: string,
  ctx: JobContext,
): Promise<string | undefined> {
  const relativeSrc = src.replace(/^\.\//, '').replace(/^\//, '')
  const candidates = [
    path.join(originalHtmlDir, originalAssetsDirName, relativeSrc),
    path.join(originalHtmlDir, originalAssetsDirName, path.basename(relativeSrc)),
  ]
  const found = candidates.find(candidate => existsSync(candidate))
  if (!found) return undefined

  const ext = path.extname(found) || DEFAULT_IMAGE_EXT
  const newFilename = await getUniqueFilename(ctx.destImagesDir, formatTimestamp(ctx.now), ext)
  await fs.copyFile(found, path.join(ctx.destImagesDir, newFilename))
  return `/${ctx.imagesSubDir}/${newFilename}`
}

/**
 * POST /parse-local
 *
 * Copies a local HTML file (and the listed asset files, when given) into a
 * fresh job directory, starts the conversion in the background and responds
 * immediately with `status: 'processing'`.
 *
 * Body: `htmlPath`, `htmlDir`, `targetPath` (required strings);
 * `assetsDirName`, `assetsFiles` (optional).
 *
 * Throws `ValidationError` for missing/invalid body fields or an asset path
 * that would land outside the job directory, and `InternalError` when the
 * parser script is missing. Errors thrown by `resolveNotebookPath` propagate.
 */
router.post(
  '/parse-local',
  asyncHandler(async (req: Request, res: Response) => {
    const { htmlPath, htmlDir, assetsDirName, assetsFiles, targetPath } = req.body as {
      htmlPath?: string
      htmlDir?: string
      assetsDirName?: string
      assetsFiles?: string[]
      targetPath?: string
    }

    // The cast above is unchecked, so verify the required fields at runtime.
    if (!isNonEmptyString(htmlPath) || !isNonEmptyString(htmlDir) || !isNonEmptyString(targetPath)) {
      throw new ValidationError('htmlPath, htmlDir and targetPath are required')
    }

    const { fullPath: fullTargetPath } = resolveNotebookPath(targetPath)

    const scriptPath = getScriptPath('blog', 'parse_blog.py')
    if (!ensureScriptExists(scriptPath)) {
      throw new InternalError('Parser script not found')
    }

    const jobContext = await createJobContext('blog')
    const htmlPathInJob = path.join(jobContext.jobDir, 'input.html')

    try {
      await fs.copyFile(htmlPath, htmlPathInJob)

      if (assetsDirName && Array.isArray(assetsFiles) && assetsFiles.length > 0) {
        const assetsDirPath = path.join(htmlDir, assetsDirName)
        for (const relPath of assetsFiles) {
          if (typeof relPath !== 'string') {
            throw new ValidationError('assetsFiles must contain only strings')
          }
          const destPath = path.join(jobContext.jobDir, assetsDirName, relPath)
          // Request-supplied paths must never write outside the job directory.
          if (!isPathInside(jobContext.jobDir, destPath)) {
            throw new ValidationError(`Asset path escapes the job directory: ${relPath}`)
          }
          const srcPath = path.join(assetsDirPath, relPath)
          if (existsSync(srcPath)) {
            await fs.mkdir(path.dirname(destPath), { recursive: true })
            await fs.copyFile(srcPath, destPath)
          }
        }
      }
    } catch (err) {
      await cleanupJob(jobContext.jobDir)
      throw err
    }

    // Fire-and-forget: the response is sent before the conversion finishes.
    void processHtmlInBackground({
      jobDir: jobContext.jobDir,
      htmlPath: htmlPathInJob,
      targetPath: fullTargetPath,
      cwd: path.dirname(scriptPath),
      jobContext,
      originalHtmlDir: htmlDir,
      originalAssetsDirName: assetsDirName,
    }).catch((err: unknown) => {
      logger.error('Background HTML processing failed:', err)
      const message = err instanceof Error ? err.message : String(err)
      // Leave a failure note in the target file (text reads "Parsing failed" / "Error message").
      fs.writeFile(fullTargetPath, `# 解析失败\n\n> 错误信息: ${message}`, 'utf-8').catch(() => { })
      cleanupJob(jobContext.jobDir).catch(() => { })
    })

    successResponse(res, {
      message: 'HTML parsing started in background.',
      status: 'processing',
    })
  }),
)

/** Arguments for {@link processHtmlInBackground}. */
interface ProcessHtmlArgs {
  /** Job working directory; removed when processing ends. */
  jobDir: string
  /** Path of the HTML copy inside the job directory. */
  htmlPath: string
  /** Absolute path the final Markdown is written to. */
  targetPath: string
  /** Working directory for the Python parser process. */
  cwd: string
  /** Resolved job context returned by `createJobContext`. */
  jobContext: Awaited<ReturnType<typeof createJobContext>>
  /** Directory of the original HTML file, used as a fallback image source. */
  originalHtmlDir?: string
  /** Name of the original assets directory, used as a fallback image source. */
  originalAssetsDirName?: string
}

/**
 * Runs the Python parser on the copied HTML, rewrites image links in the
 * resulting Markdown to point at copies stored in the job's image directory,
 * and writes the Markdown to `targetPath`.
 *
 * Remote (`http`/`https`) images are downloaded best-effort, `data:` URIs are
 * left as-is, and local images are copied via `copyLocalImage` with a fallback
 * to the original assets directory. The job directory is always cleaned up.
 *
 * @throws Error when the parser does not produce the expected `.md` file;
 *   errors from the parser process and file operations propagate.
 */
async function processHtmlInBackground(args: ProcessHtmlArgs) {
  const {
    jobDir,
    htmlPath,
    targetPath,
    cwd,
    jobContext: ctx,
    originalHtmlDir,
    originalAssetsDirName,
  } = args

  try {
    await spawnPythonScript({
      scriptPath: 'parse_blog.py',
      args: [htmlPath],
      cwd,
    })

    // The Markdown is expected next to the input HTML, with the same base name.
    const parsedPathObj = path.parse(htmlPath)
    const markdownPath = path.join(parsedPathObj.dir, `${parsedPathObj.name}.md`)
    if (!existsSync(markdownPath)) {
      throw new Error('Markdown result file not found')
    }

    let mdContent = await fs.readFile(markdownPath, 'utf-8')
    const htmlDir = path.dirname(htmlPath)
    const replacements: ImageReplacement[] = []

    // Images are handled one at a time: every file shares the same timestamp
    // base name, so unique-name selection must see the files written before it.
    for (const dest of findImageDestinations(mdContent)) {
      const originalSrc = dest.url
      if (!originalSrc) continue

      if (originalSrc.startsWith('http://') || originalSrc.startsWith('https://')) {
        const remoteLink = await downloadRemoteImage(originalSrc, ctx)
        if (remoteLink) {
          replacements.push({
            start: dest.start,
            end: dest.end,
            original: originalSrc,
            replacement: remoteLink,
          })
        }
        continue
      }

      // Inline data URIs need no file and are kept verbatim.
      if (originalSrc.startsWith('data:')) continue

      let result = await copyLocalImage(
        originalSrc,
        jobDir,
        htmlDir,
        ctx.destImagesDir,
        ctx.imagesSubDir,
        ctx.now,
      )

      if (!result && originalHtmlDir && originalAssetsDirName) {
        const fallbackLink = await copyFromOriginalAssets(
          originalSrc,
          originalHtmlDir,
          originalAssetsDirName,
          ctx,
        )
        if (fallbackLink) {
          result = { newLink: fallbackLink }
        }
      }

      if (result) {
        replacements.push({
          start: dest.start,
          end: dest.end,
          original: originalSrc,
          replacement: result.newLink,
        })
      }
    }

    mdContent = applyReplacements(mdContent, replacements)
    await fs.writeFile(targetPath, mdContent, 'utf-8')
    await fs.unlink(markdownPath).catch(() => { })
  } finally {
    await cleanupJob(jobDir)
  }
}

export default router