Initial commit

This commit is contained in:
2026-03-08 01:34:54 +08:00
commit 1f104f73c8
441 changed files with 64911 additions and 0 deletions

View File

@@ -0,0 +1,217 @@
import express, { type Request, type Response } from 'express'
import path from 'path'
import fs from 'fs/promises'
import { existsSync } from 'fs'
import axios from 'axios'
import { asyncHandler } from '../../utils/asyncHandler.js'
import { successResponse } from '../../utils/response.js'
import { resolveNotebookPath } from '../../utils/pathSafety.js'
import { getUniqueFilename } from '../../utils/file.js'
import { formatTimestamp } from '../../../shared/utils/date.js'
import { getTempDir } from '../../utils/tempDir.js'
import {
createJobContext,
spawnPythonScript,
findImageDestinations,
applyReplacements,
copyLocalImage,
cleanupJob,
getScriptPath,
ensureScriptExists,
} from './documentParser.js'
import type { ImageReplacement } from './documentParser.js'
import { ValidationError, InternalError } from '../../../shared/errors/index.js'
import { logger } from '../../utils/logger.js'
// Router that carries the blog/HTML parsing endpoints registered below.
const router = express.Router()
// NOTE(review): tempDir is not referenced again in the visible part of this file —
// confirm whether getTempDir() is called here for a side effect or can be dropped.
const tempDir = getTempDir()
/**
 * POST /parse-local
 *
 * Stages a local HTML file (and optionally its companion assets) in a fresh
 * job directory, starts the Python blog parser in the background and answers
 * immediately with a "processing" status.
 *
 * Body fields:
 * - htmlPath / htmlDir: location of the HTML file and its containing folder.
 * - assetsDirName / assetsFiles: optional assets folder (relative to htmlDir)
 *   and the files inside it that should be staged next to the HTML.
 * - targetPath: notebook path the resulting Markdown is written to.
 *
 * Throws ValidationError when a required field is missing and InternalError
 * when the parser script is absent; errors from resolveNotebookPath propagate
 * unchanged.
 */
router.post(
  '/parse-local',
  asyncHandler(async (req: Request, res: Response) => {
    const { htmlPath, htmlDir, assetsDirName, assetsFiles, targetPath } = req.body as {
      htmlPath?: string
      htmlDir?: string
      assetsDirName?: string
      assetsFiles?: string[]
      targetPath?: string
    }
    if (!htmlPath || !htmlDir || !targetPath) {
      throw new ValidationError('htmlPath, htmlDir and targetPath are required')
    }

    // Nothing needs cleaning up yet, so a failure here can simply propagate.
    const fullTargetPath: string = resolveNotebookPath(targetPath).fullPath

    const scriptPath = getScriptPath('blog', 'parse_blog.py')
    if (!ensureScriptExists(scriptPath)) {
      throw new InternalError('Parser script not found')
    }

    const jobContext = await createJobContext('blog')
    const htmlPathInJob = path.join(jobContext.jobDir, 'input.html')
    try {
      await fs.copyFile(htmlPath, htmlPathInJob)
      if (assetsDirName && assetsFiles && assetsFiles.length > 0) {
        const assetsDirPath = path.join(htmlDir, assetsDirName)
        for (const relPath of assetsFiles) {
          const destPath = path.join(jobContext.jobDir, assetsDirName, relPath)
          // Request-supplied names may contain "..": never write outside the job directory.
          const relToJob = path.relative(jobContext.jobDir, destPath)
          const escapesJobDir =
            relToJob === '' ||
            relToJob === '..' ||
            relToJob.startsWith(`..${path.sep}`) ||
            path.isAbsolute(relToJob)
          if (escapesJobDir) {
            logger.error('Skipping asset whose destination is outside the job directory:', relPath)
            continue
          }
          const srcPath = path.join(assetsDirPath, relPath)
          // Assets that are listed but missing on disk are skipped on purpose.
          if (existsSync(srcPath)) {
            await fs.mkdir(path.dirname(destPath), { recursive: true })
            await fs.copyFile(srcPath, destPath)
          }
        }
      }
    } catch (err) {
      // Staging failed: drop the half-filled job directory before reporting the error.
      await cleanupJob(jobContext.jobDir)
      throw err
    }

    // Fire-and-forget: the response below does not wait for the parser.
    processHtmlInBackground({
      jobDir: jobContext.jobDir,
      htmlPath: htmlPathInJob,
      targetPath: fullTargetPath,
      cwd: path.dirname(scriptPath),
      jobContext,
      originalHtmlDir: htmlDir,
      originalAssetsDirName: assetsDirName,
    }).catch((err: unknown) => {
      logger.error('Background HTML processing failed:', err)
      // Rejections are not guaranteed to be Error instances.
      const reason = err instanceof Error ? err.message : String(err)
      // Leave a failure note at the target so the outcome is visible in the notebook.
      fs.writeFile(fullTargetPath, `# 解析失败\n\n> 错误信息: ${reason}`, 'utf-8').catch(() => { })
      cleanupJob(jobContext.jobDir).catch(() => { })
    })

    successResponse(res, {
      message: 'HTML parsing started in background.',
      status: 'processing'
    })
  }),
)
/** Everything processHtmlInBackground needs to turn a staged HTML file into Markdown. */
interface ProcessHtmlArgs {
  /** Job scratch directory; removed once processing finishes. */
  jobDir: string
  /** Path of the staged HTML file handed to the parser script. */
  htmlPath: string
  /** Absolute path the final Markdown is written to. */
  targetPath: string
  /** Working directory for the Python process. */
  cwd: string
  /** Resolved job context (timestamp and image destination folders). */
  jobContext: Awaited<ReturnType<typeof createJobContext>>
  /** Original folder of the HTML file, used as a fallback when looking up images. */
  originalHtmlDir?: string
  /** Name of the assets folder inside originalHtmlDir, used by the same fallback. */
  originalAssetsDirName?: string
}
/**
 * Chooses the file extension for a downloaded image.
 *
 * The Content-Type header supplies a first guess (defaulting to `.jpg`); an
 * extension found in the URL's path overrides it. The URL is parsed first so
 * the host name, query string and fragment can never be mistaken for an
 * extension.
 */
function remoteImageExtension(url: string, contentType: unknown): string {
  let ext = '.jpg'
  if (typeof contentType === 'string') {
    if (contentType.includes('png')) ext = '.png'
    else if (contentType.includes('gif')) ext = '.gif'
    else if (contentType.includes('webp')) ext = '.webp'
    else if (contentType.includes('svg')) ext = '.svg'
  }
  try {
    const urlExt = path.extname(new URL(url).pathname)
    if (urlExt) ext = urlExt
  } catch {
    // Not a parseable URL: keep the Content-Type based guess.
  }
  return ext
}

/**
 * Runs the blog parser on a staged HTML file, relocates every image the
 * resulting Markdown references into the notebook image folder, rewrites the
 * links and writes the Markdown to `targetPath`.
 *
 * Remote images (http/https) are downloaded, `data:` URIs are left untouched,
 * and local images are resolved via copyLocalImage with a fallback to the
 * original assets folder. Images that cannot be fetched or found keep their
 * original link. The job directory is removed whether or not processing
 * succeeds.
 *
 * @throws when the Python script fails or produces no Markdown file.
 */
async function processHtmlInBackground(args: ProcessHtmlArgs): Promise<void> {
  const { jobDir, htmlPath, targetPath, cwd, jobContext: ctx, originalHtmlDir, originalAssetsDirName } = args
  try {
    await spawnPythonScript({
      scriptPath: 'parse_blog.py',
      args: [htmlPath],
      cwd,
    })

    // The Markdown is expected next to the input, with the same base name.
    const parsedPathObj = path.parse(htmlPath)
    const markdownPath = path.join(parsedPathObj.dir, `${parsedPathObj.name}.md`)
    if (!existsSync(markdownPath)) {
      throw new Error('Markdown result file not found')
    }
    let mdContent = await fs.readFile(markdownPath, 'utf-8')

    const htmlDir = path.dirname(htmlPath)
    const replacements: ImageReplacement[] = []
    const destinations = findImageDestinations(mdContent)
    for (const dest of destinations) {
      const originalSrc = dest.url
      if (!originalSrc) continue

      if (originalSrc.startsWith('http://') || originalSrc.startsWith('https://')) {
        try {
          const response = await axios.get(originalSrc, { responseType: 'arraybuffer', timeout: 10000 })
          const ext = remoteImageExtension(originalSrc, response.headers['content-type'])
          const baseName = formatTimestamp(ctx.now)
          const newFilename = await getUniqueFilename(ctx.destImagesDir, baseName, ext)
          const newPath = path.join(ctx.destImagesDir, newFilename)
          await fs.writeFile(newPath, response.data)
          replacements.push({
            start: dest.start,
            end: dest.end,
            original: originalSrc,
            replacement: `/${ctx.imagesSubDir}/${newFilename}`
          })
        } catch (err) {
          // Best effort: keep the remote link, but leave a trace of the failure.
          logger.error(`Failed to download remote image ${originalSrc}:`, err)
        }
        continue
      }

      // Inline data URIs are already self-contained.
      if (originalSrc.startsWith('data:')) continue

      let result = await copyLocalImage(
        originalSrc,
        jobDir,
        htmlDir,
        ctx.destImagesDir,
        ctx.imagesSubDir,
        ctx.now
      )

      // Fallback: look in the original assets folder, by relative path and then by bare file name.
      if (!result && originalHtmlDir && originalAssetsDirName) {
        const srcWithFiles = originalSrc.replace(/^\.\//, '').replace(/^\//, '')
        const possiblePaths = [
          path.join(originalHtmlDir, originalAssetsDirName, srcWithFiles),
          path.join(originalHtmlDir, originalAssetsDirName, path.basename(srcWithFiles)),
        ]
        for (const p of possiblePaths) {
          if (existsSync(p)) {
            const ext = path.extname(p) || '.jpg'
            const baseName = formatTimestamp(ctx.now)
            const newFilename = await getUniqueFilename(ctx.destImagesDir, baseName, ext)
            const newPath = path.join(ctx.destImagesDir, newFilename)
            await fs.copyFile(p, newPath)
            result = { newLink: `/${ctx.imagesSubDir}/${newFilename}` }
            break
          }
        }
      }

      if (result) {
        replacements.push({
          start: dest.start,
          end: dest.end,
          original: originalSrc,
          replacement: result.newLink
        })
      }
    }

    mdContent = applyReplacements(mdContent, replacements)
    await fs.writeFile(targetPath, mdContent, 'utf-8')
    await fs.unlink(markdownPath).catch(() => { })
  } finally {
    await cleanupJob(jobDir)
  }
}
export default router

View File

@@ -0,0 +1,184 @@
import path from 'path'
import { spawn } from 'child_process'
import fs from 'fs/promises'
import { existsSync, mkdirSync } from 'fs'
import { PROJECT_ROOT, NOTEBOOK_ROOT, TEMP_ROOT } from '../../config/paths.js'
import { getUniqueFilename } from '../../utils/file.js'
import { formatTimestamp, pad2 } from '../../../shared/utils/date.js'
import { logger } from '../../utils/logger.js'
// Module-load bootstrap: create the shared temp root (and any missing parents) once.
if (existsSync(TEMP_ROOT) === false) {
  mkdirSync(TEMP_ROOT, { recursive: true })
}
/** Per-job paths and timestamp produced by createJobContext. */
export interface JobContext {
/** Scratch directory for this job, located under TEMP_ROOT. */
jobDir: string
/** Time the context was created; the dated image folder is derived from it. */
now: Date
/** Image folder in the form `images/<year>/<month>/<day>`. */
imagesSubDir: string
/** imagesSubDir joined onto NOTEBOOK_ROOT; created together with the context. */
destImagesDir: string
}
/**
 * Creates a scratch directory for one parsing job plus the dated image folder
 * (`images/<year>/<month>/<day>` under NOTEBOOK_ROOT) that relocated images go to.
 *
 * The job directory is created with `fs.mkdtemp`, which appends a random
 * suffix to `<prefix>_<timestamp>_`. Two jobs started within the same
 * timestamp therefore never share a directory, and cleaning up one job can
 * never delete the files of another.
 *
 * @param prefix Label placed at the start of the job directory name.
 * @returns The job directory, the creation time and the image folder paths.
 */
export const createJobContext = async (prefix: string): Promise<JobContext> => {
  const now = new Date()

  // mkdtemp needs an existing parent; recreate the temp root if it was removed at runtime.
  await fs.mkdir(TEMP_ROOT, { recursive: true })
  const jobDir = await fs.mkdtemp(path.join(TEMP_ROOT, `${prefix}_${formatTimestamp(now)}_`))

  const year = now.getFullYear()
  const month = pad2(now.getMonth() + 1)
  const day = pad2(now.getDate())
  const imagesSubDir = `images/${year}/${month}/${day}`
  const destImagesDir = path.join(NOTEBOOK_ROOT, imagesSubDir)
  await fs.mkdir(destImagesDir, { recursive: true })

  return { jobDir, now, imagesSubDir, destImagesDir }
}
/** Options accepted by spawnPythonScript. */
export interface SpawnPythonOptions {
/** Script passed to the `python` executable (resolved relative to cwd when not absolute). */
scriptPath: string
/** Extra command-line arguments appended after the script path. */
args: string[]
/** Working directory of the child process. */
cwd: string
/** When defined, written to the child's stdin, which is then closed. */
inputContent?: string
}
/**
 * Runs a Python script in UTF-8 mode and resolves with everything it wrote to
 * stdout.
 *
 * @param options Script path, arguments, working directory and optional stdin content.
 * @returns The collected stdout once the process exits with code 0.
 * @throws (rejects) when the process cannot be started or exits with a non-zero
 *   code; in the latter case the message carries the collected stderr.
 */
export const spawnPythonScript = async (options: SpawnPythonOptions): Promise<string> => {
  const { scriptPath, args, cwd, inputContent } = options
  return new Promise<string>((resolve, reject) => {
    const pythonProcess = spawn('python', ['-X', 'utf8', scriptPath, ...args], {
      cwd,
      env: { ...process.env, PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' },
    })

    // Decode on the stream so a multi-byte UTF-8 character split across two
    // chunks is reassembled rather than turned into replacement characters.
    pythonProcess.stdout.setEncoding('utf8')
    pythonProcess.stderr.setEncoding('utf8')

    let stdout = ''
    let stderr = ''
    pythonProcess.stdout.on('data', (chunk: string) => {
      stdout += chunk
    })
    pythonProcess.stderr.on('data', (chunk: string) => {
      stderr += chunk
    })

    pythonProcess.on('close', (code) => {
      if (code !== 0) {
        logger.error('Python script error:', stderr)
        reject(new Error(`Process exited with code ${code}. Error: ${stderr}`))
      } else {
        resolve(stdout)
      }
    })
    pythonProcess.on('error', (err) => {
      reject(err)
    })

    if (inputContent !== undefined) {
      // The child may exit before reading its input; without a listener the
      // resulting stream error (e.g. EPIPE) would surface as an uncaught exception.
      pythonProcess.stdin.on('error', (err) => {
        logger.error('Failed to write to Python stdin:', err)
      })
      pythonProcess.stdin.write(inputContent)
      pythonProcess.stdin.end()
    }
  })
}
/** One text substitution to perform on a Markdown string (see applyReplacements). */
export interface ImageReplacement {
/** Offset of the first character to replace. */
start: number
/** Offset just past the last character to replace (exclusive). */
end: number
/** The text originally found in that range; not read by applyReplacements. */
original: string
/** Text inserted in place of the range. */
replacement: string
}
/**
 * Finds every inline Markdown image (`![alt](destination)`) and returns its
 * destination together with the offsets of that destination inside `md`.
 *
 * Parentheses inside the destination are balanced, so `![a](f(1).png)` yields
 * `f(1).png`. Whitespace padding and an optional quoted title
 * (`![a](x.png "Title")`) are excluded from both `url` and the `[start, end)`
 * range, so the URL can be used as-is and a later replacement keeps the title.
 *
 * Scanning stops at the first image whose brackets or parentheses are never
 * closed.
 *
 * @param md Markdown text to scan.
 * @returns One entry per image, in document order.
 */
export const findImageDestinations = (md: string): Array<{ url: string; start: number; end: number }> => {
  const results: Array<{ url: string; start: number; end: number }> = []
  // Whitespace followed by a single- or double-quoted string right before the closing parenthesis.
  const trailingTitle = /\s+(?:"[^"]*"|'[^']*')\s*$/
  let i = 0
  while (i < md.length) {
    const bang = md.indexOf('![', i)
    if (bang === -1) break
    const closeBracket = md.indexOf(']', bang + 2)
    if (closeBracket === -1) break
    if (md[closeBracket + 1] !== '(') {
      // `![...]` without a destination (e.g. a reference-style image): keep scanning.
      i = closeBracket + 1
      continue
    }
    const urlStart = closeBracket + 2
    let depth = 1
    let j = urlStart
    for (; j < md.length; j++) {
      const ch = md[j]
      if (ch === '(') depth++
      else if (ch === ')') {
        depth--
        if (depth === 0) break
      }
    }
    if (depth !== 0) break

    const inner = md.slice(urlStart, j)
    const withoutTitle = inner.replace(trailingTitle, '')
    const url = withoutTitle.trim()
    // Shift the start past any leading padding so the range covers exactly `url`.
    const start = urlStart + (withoutTitle.length - withoutTitle.trimStart().length)
    results.push({ url, start, end: start + url.length })
    i = j + 1
  }
  return results
}
/**
 * Returns `md` with every `[start, end)` range swapped for its replacement text.
 *
 * Ranges are applied from the highest start offset down so that earlier
 * offsets stay valid while the string changes length. The `replacements`
 * array itself is left untouched.
 */
export const applyReplacements = (md: string, replacements: ImageReplacement[]): string =>
  replacements
    .slice()
    .sort((left, right) => right.start - left.start)
    .reduce(
      (text, { start, end, replacement }) => text.slice(0, start) + replacement + text.slice(end),
      md,
    )
/**
 * Locates a locally referenced image and copies it into the notebook image folder.
 *
 * The reference is trimmed, stripped of a leading `<` / trailing `>`,
 * URI-decoded when possible, and normalised to forward slashes. References
 * starting with `/` are looked up under `jobDir` first and `htmlDir` second;
 * all others are resolved against `htmlDir` first and `jobDir` second.
 *
 * @returns The new link (`/<imagesSubDir>/<file>`), or null when the reference
 *   is empty or no candidate file exists.
 */
export const copyLocalImage = async (
  src: string,
  jobDir: string,
  htmlDir: string,
  destImagesDir: string,
  imagesSubDir: string,
  now: Date
): Promise<{ newLink: string } | null> => {
  const cleaned = src.trim().replace(/^<|>$/g, '')
  if (cleaned === '') {
    return null
  }

  let decoded: string
  try {
    decoded = decodeURI(cleaned)
  } catch {
    // Malformed escape sequence: fall back to the reference as written.
    decoded = cleaned
  }

  const forwardSlashed = decoded.replace(/\\/g, '/')
  const reference = forwardSlashed.startsWith('./') ? forwardSlashed.slice(2) : forwardSlashed

  let candidates: string[]
  if (reference.startsWith('/')) {
    const rooted = reference.slice(1)
    candidates = [path.join(jobDir, rooted), path.join(htmlDir, rooted)]
  } else {
    candidates = [path.resolve(htmlDir, reference), path.resolve(jobDir, reference)]
  }

  const foundFile = candidates.find((candidate) => existsSync(candidate))
  if (foundFile === undefined) {
    return null
  }

  const extension = path.extname(foundFile) || '.jpg'
  const newFilename = await getUniqueFilename(destImagesDir, formatTimestamp(now), extension)
  await fs.copyFile(foundFile, path.join(destImagesDir, newFilename))
  return { newLink: `/${imagesSubDir}/${newFilename}` }
}
/**
 * Best-effort cleanup: removes the job directory recursively, then unlinks each
 * additional path in order. Failures are ignored, so this never rejects because
 * a path is already gone.
 */
export const cleanupJob = async (jobDir: string, additionalPaths: string[] = []): Promise<void> => {
  const ignoreFailure = (): void => {}

  await fs.rm(jobDir, { recursive: true, force: true }).catch(ignoreFailure)
  for (const extraPath of additionalPaths) {
    await fs.unlink(extraPath).catch(ignoreFailure)
  }
}
/** Builds `<PROJECT_ROOT>/tools/<toolName>/<scriptName>`. */
export const getScriptPath = (toolName: string, scriptName: string): string =>
  path.join(PROJECT_ROOT, 'tools', toolName, scriptName)
/** Reports whether something exists at `scriptPath`; despite the name, nothing is created. */
export const ensureScriptExists = (scriptPath: string): boolean => existsSync(scriptPath)

View File

@@ -0,0 +1,23 @@
import express, { type Router } from 'express'
import type { ServiceContainer } from '../../infra/container.js'
import { createApiModule } from '../../infra/createModule.js'
import { DOCUMENT_PARSER_MODULE } from '../../../shared/modules/document-parser/index.js'
import blogRoutes from './blogRoutes.js'
import mineruRoutes from './mineruRoutes.js'
export * from './documentParser.js'
export { default as blogRoutes } from './blogRoutes.js'
export { default as mineruRoutes } from './mineruRoutes.js'
/**
 * Builds the document-parser API module: a router exposing the blog routes
 * under `/blog` and the MinerU routes under `/mineru`.
 */
export const createDocumentParserModule = () =>
  createApiModule(DOCUMENT_PARSER_MODULE, {
    routes: (_container: ServiceContainer): Router => {
      const moduleRouter = express.Router()
      moduleRouter.use('/blog', blogRoutes)
      moduleRouter.use('/mineru', mineruRoutes)
      return moduleRouter
    },
  })
export default createDocumentParserModule

View File

@@ -0,0 +1,158 @@
import express, { type Request, type Response } from 'express'
import multer from 'multer'
import path from 'path'
import fs from 'fs/promises'
import { existsSync } from 'fs'
import { asyncHandler } from '../../utils/asyncHandler.js'
import { successResponse } from '../../utils/response.js'
import { resolveNotebookPath } from '../../utils/pathSafety.js'
import { getUniqueFilename } from '../../utils/file.js'
import { formatTimestamp } from '../../../shared/utils/date.js'
import { getTempDir } from '../../utils/tempDir.js'
import {
createJobContext,
spawnPythonScript,
findImageDestinations,
applyReplacements,
cleanupJob,
getScriptPath,
ensureScriptExists,
} from './documentParser.js'
import type { ImageReplacement } from './documentParser.js'
import { ValidationError, InternalError } from '../../../shared/errors/index.js'
import { logger } from '../../utils/logger.js'
// Router carrying the PDF parsing endpoint registered below.
const router = express.Router()
const tempDir = getTempDir()

// Upload size cap: 50 MiB.
const MAX_UPLOAD_BYTES = 50 * 1024 * 1024

// Uploaded files are stored by multer inside the temp directory.
const upload = multer({
  dest: tempDir,
  limits: { fileSize: MAX_UPLOAD_BYTES },
})
/**
 * POST /parse
 *
 * Accepts a single uploaded file (field `file`) and a `targetPath`, validates
 * the request, then starts PDF parsing in the background and answers
 * immediately with a "processing" status. Whenever validation fails after the
 * upload was stored, the uploaded temp file is deleted before the error is thrown.
 */
router.post(
  '/parse',
  upload.single('file'),
  asyncHandler(async (req: Request, res: Response) => {
    const uploaded = req.file
    if (!uploaded) {
      throw new ValidationError('File is required')
    }
    // Shared by every rejection path below; unlink failures are ignored.
    const discardUpload = (): Promise<void> => fs.unlink(uploaded.path).catch(() => {})

    const { targetPath } = req.body as { targetPath?: string }
    if (!targetPath) {
      await discardUpload()
      throw new ValidationError('Target path is required')
    }

    let fullTargetPath: string
    try {
      fullTargetPath = resolveNotebookPath(targetPath).fullPath
    } catch (error) {
      await discardUpload()
      throw error
    }

    const scriptPath = getScriptPath('mineru', 'mineru_parser.py')
    const scriptAvailable = ensureScriptExists(scriptPath)
    if (!scriptAvailable) {
      await discardUpload()
      throw new InternalError('Parser script not found')
    }

    // Fire-and-forget: on failure a note is written to the target file instead.
    const scriptDir = path.dirname(scriptPath)
    processPdfInBackground(uploaded.path, fullTargetPath, scriptDir).catch(err => {
      logger.error('Background PDF processing failed:', err)
      fs.writeFile(fullTargetPath, `# 解析失败\n\n> 错误信息: ${err.message}`, 'utf-8').catch(() => {})
    })

    successResponse(res, {
      message: 'PDF upload successful. Parsing started in background.',
      status: 'processing'
    })
  }),
)
/**
 * Runs the MinerU parser on an uploaded PDF, relocates the images referenced
 * by the resulting Markdown into the notebook image folder, rewrites the links
 * and writes the Markdown to `targetPath`.
 *
 * The script is expected to print a line `JSON_RESULT:<json>` whose JSON has
 * `markdown_file` and, optionally, `output_dir`. The uploaded file is always
 * deleted at the end, and the temporary job directory created for image
 * handling is removed as well.
 *
 * @param filePath Path of the uploaded PDF.
 * @param targetPath Absolute path the final Markdown is written to.
 * @param cwd Working directory for the Python process.
 * @throws when the script fails, prints no JSON_RESULT line, or the reported
 *   Markdown file does not exist.
 */
async function processPdfInBackground(filePath: string, targetPath: string, cwd: string): Promise<void> {
  try {
    const output = await spawnPythonScript({
      scriptPath: 'mineru_parser.py',
      args: [filePath],
      cwd,
    })
    const match = output.match(/JSON_RESULT:(.*)/)
    if (!match) {
      throw new Error('Failed to parse Python script output: JSON_RESULT not found')
    }

    // The script output is external data: check the fields before using them as paths.
    const result = JSON.parse(match[1]) as { markdown_file?: unknown; output_dir?: unknown }
    const markdownPath = result.markdown_file
    const outputDir = typeof result.output_dir === 'string' ? result.output_dir : ''
    if (typeof markdownPath !== 'string' || !existsSync(markdownPath)) {
      throw new Error('Markdown result file not found')
    }

    let mdContent = await fs.readFile(markdownPath, 'utf-8')
    const imagesDir = outputDir ? path.join(outputDir, 'images') : ''
    if (imagesDir && existsSync(imagesDir)) {
      const jobContext = await createJobContext('pdf_images')
      try {
        const destinations = findImageDestinations(mdContent)
        const replacements: ImageReplacement[] = []
        for (const dest of destinations) {
          const originalSrc = dest.url
          if (!originalSrc) continue

          // Lookup order: images/<src>, <outputDir>/<src>, images/<basename>.
          const candidates = [
            path.join(imagesDir, originalSrc),
            path.join(outputDir, originalSrc),
            path.join(imagesDir, path.basename(originalSrc)),
          ]
          const foundFile = candidates.find((candidate) => existsSync(candidate))
          if (!foundFile) continue

          const ext = path.extname(foundFile)
          const baseName = formatTimestamp(jobContext.now)
          const newFilename = await getUniqueFilename(jobContext.destImagesDir, baseName, ext)
          const newPath = path.join(jobContext.destImagesDir, newFilename)
          await fs.copyFile(foundFile, newPath)
          replacements.push({
            start: dest.start,
            end: dest.end,
            original: originalSrc,
            // Leading slash keeps the link root-relative, matching the links
            // written for HTML imports.
            replacement: `/${jobContext.imagesSubDir}/${newFilename}`
          })
        }
        mdContent = applyReplacements(mdContent, replacements)
      } finally {
        // createJobContext also creates a scratch directory; it is not needed here.
        await cleanupJob(jobContext.jobDir)
      }
    }

    await fs.writeFile(targetPath, mdContent, 'utf-8')
    await fs.unlink(markdownPath).catch(() => {})
    // Only remove output folders that look like temp locations.
    if (outputDir && outputDir.includes('temp')) {
      await fs.rm(outputDir, { recursive: true, force: true }).catch(() => {})
    }
  } finally {
    await fs.unlink(filePath).catch(() => {})
  }
}
export default router