Files
XCDesktop/api/modules/document-parser/documentParser.ts

185 lines
5.0 KiB
TypeScript
Raw Normal View History

2026-03-08 01:34:54 +08:00
import path from 'path'
import { spawn } from 'child_process'
import fs from 'fs/promises'
import { existsSync, mkdirSync } from 'fs'
import { PROJECT_ROOT, NOTEBOOK_ROOT, TEMP_ROOT } from '../../config/paths.js'
import { getUniqueFilename } from '../../utils/file.js'
import { formatTimestamp, pad2 } from '../../../shared/utils/date.js'
import { logger } from '../../utils/logger.js'
// Module-load side effect: make sure the temp root directory exists before any
// job directory is created underneath it (see createJobContext).
if (!existsSync(TEMP_ROOT)) {
// `recursive: true` also creates any missing parent directories.
mkdirSync(TEMP_ROOT, { recursive: true })
}
/**
 * Paths and timestamp describing one document-parsing job, as produced by
 * `createJobContext`.
 */
export interface JobContext {
/** Per-job working directory created under `TEMP_ROOT`. */
jobDir: string
/** Moment the job context was created; also used to derive the dated image folder. */
now: Date
/** Image folder relative to the notebook root, in the form `images/<year>/<month>/<day>` (always `/`-separated). */
imagesSubDir: string
/** Absolute destination folder for images: `NOTEBOOK_ROOT` joined with `imagesSubDir`. */
destImagesDir: string
}
/**
 * Prepares the on-disk layout for a new parsing job.
 *
 * Creates a working directory named `<prefix>_<formatted timestamp>` under
 * `TEMP_ROOT`, then creates the dated image folder
 * `images/<year>/<month>/<day>` under `NOTEBOOK_ROOT`.
 *
 * @param prefix Leading part of the working-directory name.
 * @returns The created paths together with the timestamp they were derived from.
 */
export const createJobContext = async (prefix: string): Promise<JobContext> => {
  const createdAt = new Date()

  // Working directory first: if this fails, nothing is created in the notebook.
  const workDir = path.join(TEMP_ROOT, `${prefix}_${formatTimestamp(createdAt)}`)
  await fs.mkdir(workDir, { recursive: true })

  // The relative image folder is kept '/'-separated regardless of platform;
  // only the absolute destination goes through path.join.
  const yyyy = createdAt.getFullYear()
  const mm = pad2(createdAt.getMonth() + 1) // getMonth() is zero-based
  const dd = pad2(createdAt.getDate())
  const relativeImagesDir = `images/${yyyy}/${mm}/${dd}`
  const absoluteImagesDir = path.join(NOTEBOOK_ROOT, relativeImagesDir)
  await fs.mkdir(absoluteImagesDir, { recursive: true })

  return {
    jobDir: workDir,
    now: createdAt,
    imagesSubDir: relativeImagesDir,
    destImagesDir: absoluteImagesDir,
  }
}
/** Options accepted by `spawnPythonScript`. */
export interface SpawnPythonOptions {
/** Path of the Python script handed to the interpreter. */
scriptPath: string
/** Extra command-line arguments appended after the script path. */
args: string[]
/** Working directory for the child process. */
cwd: string
/** When not `undefined`, written to the child's stdin, after which stdin is closed. */
inputContent?: string
}
/**
 * Runs a Python script as a child process and resolves with everything it
 * wrote to stdout.
 *
 * The interpreter is started as `python -X utf8 <scriptPath> ...args` with
 * `PYTHONIOENCODING=utf-8` and `PYTHONUTF8=1` added to the inherited
 * environment, so the script's text I/O is UTF-8.
 *
 * @param options Script path, arguments, working directory and optional stdin payload.
 * @returns The complete stdout text of the script.
 * @throws Rejects with the spawn error if the process cannot be started, or
 *   with `Error("Process exited with code <code>. Error: <stderr>")` when the
 *   exit code is not 0.
 */
export const spawnPythonScript = async (options: SpawnPythonOptions): Promise<string> => {
  const { scriptPath, args, cwd, inputContent } = options
  return new Promise<string>((resolve, reject) => {
    const pythonProcess = spawn('python', ['-X', 'utf8', scriptPath, ...args], {
      cwd,
      env: { ...process.env, PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' },
    })

    // Decode at the stream level: the stream's decoder keeps an incomplete
    // multi-byte UTF-8 sequence until the next chunk arrives, whereas calling
    // toString() on each Buffer chunk corrupts characters split across chunks.
    pythonProcess.stdout.setEncoding('utf8')
    pythonProcess.stderr.setEncoding('utf8')

    let stdout = ''
    let stderr = ''
    pythonProcess.stdout.on('data', (chunk: string) => {
      stdout += chunk
    })
    pythonProcess.stderr.on('data', (chunk: string) => {
      stderr += chunk
    })

    pythonProcess.on('close', (code) => {
      if (code !== 0) {
        logger.error('Python script error:', stderr)
        reject(new Error(`Process exited with code ${code}. Error: ${stderr}`))
      } else {
        resolve(stdout)
      }
    })
    pythonProcess.on('error', (err) => {
      reject(err)
    })

    // Writing to stdin can fail asynchronously (e.g. EPIPE) when the process
    // could not be started or exits before consuming its input. Without a
    // listener that 'error' event would be thrown as an uncaught exception.
    // The outcome of the run is already reported through 'error' / 'close'
    // above, so the stdin failure itself is intentionally not propagated.
    pythonProcess.stdin.on('error', () => {})

    if (inputContent !== undefined) {
      pythonProcess.stdin.write(inputContent)
      pythonProcess.stdin.end()
    }
  })
}
/** One text substitution to perform on a Markdown string (see `applyReplacements`). */
export interface ImageReplacement {
/** Index of the first character to replace (inclusive). */
start: number
/** Index just past the last character to replace (exclusive). */
end: number
/** Text expected at `[start, end)`; informational only, `applyReplacements` does not read it. */
original: string
/** Text inserted in place of the `[start, end)` range. */
replacement: string
}
/**
 * Locates the destination part of every inline Markdown image (`![alt](dest)`).
 *
 * The destination is the raw text between the parentheses; nested, balanced
 * parentheses inside it are kept (e.g. `![x](a(1).png)` yields `a(1).png`).
 *
 * An image whose parentheses are never closed is skipped on its own; scanning
 * continues after it so that later, well-formed images are still reported.
 *
 * @param md Markdown source to scan.
 * @returns One entry per image, in document order. `start` (inclusive) and
 *   `end` (exclusive) are the offsets of `url` inside `md`.
 */
export const findImageDestinations = (md: string): Array<{ url: string; start: number; end: number }> => {
  const results: Array<{ url: string; start: number; end: number }> = []
  let i = 0
  while (i < md.length) {
    const bang = md.indexOf('![', i)
    if (bang === -1) break
    const closeBracket = md.indexOf(']', bang + 2)
    // No ']' anywhere after this point, so no further image can be complete.
    if (closeBracket === -1) break
    if (md[closeBracket + 1] !== '(') {
      // "![...]" not followed by "(" is not an inline image; move past it.
      i = closeBracket + 1
      continue
    }
    const urlStart = closeBracket + 2
    // Track parenthesis nesting so a ')' inside the destination does not end it early.
    let depth = 1
    let j = urlStart
    for (; j < md.length; j++) {
      const ch = md[j]
      if (ch === '(') depth++
      else if (ch === ')') {
        depth--
        if (depth === 0) break
      }
    }
    if (depth !== 0) {
      // Unterminated destination: drop only this opener and resume right after
      // its "(" instead of giving up on the rest of the document.
      i = urlStart
      continue
    }
    results.push({ url: md.slice(urlStart, j), start: urlStart, end: j })
    i = j + 1
  }
  return results
}
/**
 * Applies a set of range replacements to a Markdown string.
 *
 * Replacements are processed from the highest `start` offset to the lowest so
 * that substituting one range never shifts the offsets of the ranges still to
 * be processed. The `replacements` array itself is left untouched.
 *
 * @param md Original Markdown text the offsets refer to.
 * @param replacements Ranges (`start` inclusive, `end` exclusive) and their new text.
 * @returns The text with every replacement applied.
 */
export const applyReplacements = (md: string, replacements: ImageReplacement[]): string =>
  replacements
    .slice() // sort a copy, never the caller's array
    .sort((left, right) => right.start - left.start)
    .reduce(
      (text, { start, end, replacement }) => `${text.slice(0, start)}${replacement}${text.slice(end)}`,
      md,
    )
/**
 * Copies a locally referenced image into the notebook's image folder and
 * returns the link that should replace the original reference.
 *
 * The link text is trimmed, stripped of a leading `<` / trailing `>`,
 * URI-decoded when possible, and normalised to forward slashes. It is then
 * looked up as a file:
 * - links starting with `/` are tried relative to `jobDir`, then `htmlDir`;
 * - all other links are resolved against `htmlDir`, then `jobDir`.
 *
 * Only regular files are accepted. A link that resolves to a directory
 * (for example `./` or `/`) or to nothing at all yields `null` instead of a
 * failed copy.
 *
 * @param src Raw image destination taken from the Markdown/HTML.
 * @param jobDir Working directory of the current job.
 * @param htmlDir Directory the source document's relative links are based on.
 * @param destImagesDir Absolute directory the image is copied into.
 * @param imagesSubDir Path of that directory relative to the notebook root, used to build the new link.
 * @param now Timestamp used as the base name of the copied file.
 * @returns `{ newLink }` with the root-relative link of the copy, or `null`
 *   when `src` is empty or does not point to an existing local file.
 */
export const copyLocalImage = async (
  src: string,
  jobDir: string,
  htmlDir: string,
  destImagesDir: string,
  imagesSubDir: string,
  now: Date
): Promise<{ newLink: string } | null> => {
  const s0 = src.trim().replace(/^<|>$/g, '')
  if (!s0) return null
  let decoded = s0
  try {
    decoded = decodeURI(s0)
  } catch {
    // Malformed percent-escapes: fall back to the link text as written.
  }
  const s1 = decoded.replace(/\\/g, '/')
  const s2 = s1.startsWith('./') ? s1.slice(2) : s1
  const candidates = s2.startsWith('/')
    ? [path.join(jobDir, s2.slice(1)), path.join(htmlDir, s2.slice(1))]
    : [path.resolve(htmlDir, s2), path.resolve(jobDir, s2)]

  // A bare existence check would also accept directories, which copyFile then
  // rejects; stat() lets us require a regular file without blocking the event loop.
  const isRegularFile = async (candidate: string): Promise<boolean> => {
    try {
      return (await fs.stat(candidate)).isFile()
    } catch {
      // Missing or inaccessible path: treat as "not a usable file".
      return false
    }
  }

  let foundFile: string | null = null
  for (const c of candidates) {
    if (await isRegularFile(c)) {
      foundFile = c
      break
    }
  }
  if (!foundFile) return null

  // Files without an extension are stored as .jpg.
  const ext = path.extname(foundFile) || '.jpg'
  const baseName = formatTimestamp(now)
  const newFilename = await getUniqueFilename(destImagesDir, baseName, ext)
  const newPath = path.join(destImagesDir, newFilename)
  await fs.copyFile(foundFile, newPath)
  return { newLink: `/${imagesSubDir}/${newFilename}` }
}
/**
 * Best-effort removal of a job's temporary artefacts.
 *
 * Deletes `jobDir` recursively, then unlinks each path in `additionalPaths`
 * one after another. Failures are ignored so that cleanup never causes the
 * caller to fail.
 *
 * @param jobDir Working directory of the job to delete.
 * @param additionalPaths Extra individual files to delete afterwards.
 */
export const cleanupJob = async (jobDir: string, additionalPaths: string[] = []): Promise<void> => {
  try {
    await fs.rm(jobDir, { recursive: true, force: true })
  } catch {
    // Ignored on purpose: cleanup is best-effort.
  }

  for (const extraPath of additionalPaths) {
    try {
      await fs.unlink(extraPath)
    } catch {
      // Ignored on purpose: the file may already be gone.
    }
  }
}
/**
 * Builds the path of a bundled tool script: `<PROJECT_ROOT>/tools/<toolName>/<scriptName>`.
 *
 * @param toolName Name of the tool's folder under `tools`.
 * @param scriptName File name of the script inside that folder.
 * @returns The joined path; no check is made that the file exists.
 */
export const getScriptPath = (toolName: string, scriptName: string): string =>
  path.join(PROJECT_ROOT, 'tools', toolName, scriptName)
/**
 * Reports whether something exists at the given script path.
 *
 * Despite the name, nothing is created or thrown: this is a plain synchronous
 * existence check.
 *
 * @param scriptPath Path to test.
 * @returns `true` when the path exists, `false` otherwise.
 */
export const ensureScriptExists = (scriptPath: string): boolean => existsSync(scriptPath)