Initial commit
This commit is contained in:
184
api/modules/document-parser/documentParser.ts
Normal file
184
api/modules/document-parser/documentParser.ts
Normal file
@@ -0,0 +1,184 @@
|
||||
import path from 'path'
|
||||
import { spawn } from 'child_process'
|
||||
import fs from 'fs/promises'
|
||||
import { existsSync, mkdirSync } from 'fs'
|
||||
import { PROJECT_ROOT, NOTEBOOK_ROOT, TEMP_ROOT } from '../../config/paths.js'
|
||||
import { getUniqueFilename } from '../../utils/file.js'
|
||||
import { formatTimestamp, pad2 } from '../../../shared/utils/date.js'
|
||||
import { logger } from '../../utils/logger.js'
|
||||
|
||||
// Module-load side effect: make sure the shared temp root exists before any job
// directory is created beneath it.
if (!existsSync(TEMP_ROOT)) mkdirSync(TEMP_ROOT, { recursive: true })
|
||||
|
||||
/**
 * Paths and timestamp shared by every step of a single document-parsing job.
 * Instances are produced by `createJobContext`.
 */
export interface JobContext {
  /** Scratch directory for the job, located under `TEMP_ROOT`. */
  jobDir: string
  /** Moment the job was created; the directory names below are derived from it. */
  now: Date
  /** Date-based sub-path of the form `images/<year>/<month>/<day>`, always `/`-separated. */
  imagesSubDir: string
  /** `imagesSubDir` joined onto `NOTEBOOK_ROOT`: where copied images are stored. */
  destImagesDir: string
}
|
||||
|
||||
/**
 * Prepare the working directories for a new parsing job.
 *
 * Creates `<TEMP_ROOT>/<prefix>_<timestamp>` as the job's scratch directory and
 * `<NOTEBOOK_ROOT>/images/<year>/<month>/<day>` as the image destination, both
 * derived from the same `Date` captured at call time.
 *
 * @param prefix Leading part of the scratch directory name.
 * @returns The created paths together with the timestamp they were derived from.
 */
export const createJobContext = async (prefix: string): Promise<JobContext> => {
  const now = new Date()

  // Scratch directory first, so a failure here never leaves an orphaned images folder.
  const jobDirName = `${prefix}_${formatTimestamp(now)}`
  const jobDir = path.join(TEMP_ROOT, jobDirName)
  await fs.mkdir(jobDir, { recursive: true })

  // Images are bucketed by calendar day (local time, month is 1-based).
  const yyyy = now.getFullYear()
  const mm = pad2(now.getMonth() + 1)
  const dd = pad2(now.getDate())
  const imagesSubDir = `images/${yyyy}/${mm}/${dd}`

  const destImagesDir = path.join(NOTEBOOK_ROOT, imagesSubDir)
  await fs.mkdir(destImagesDir, { recursive: true })

  return { jobDir, now, imagesSubDir, destImagesDir }
}
|
||||
|
||||
/** Options accepted by `spawnPythonScript`. */
export interface SpawnPythonOptions {
  /** Path of the Python script to run; passed to the interpreter before `args`. */
  scriptPath: string
  /** Command-line arguments appended after the script path. */
  args: string[]
  /** Working directory of the child process. */
  cwd: string
  /** When provided, written to the child's stdin, after which stdin is closed. */
  inputContent?: string
}
|
||||
|
||||
/**
 * Run a Python script as a child process and collect its standard output.
 *
 * The interpreter is started in UTF-8 mode (`-X utf8`, `PYTHONIOENCODING`,
 * `PYTHONUTF8`) so text round-trips identically on every platform.
 *
 * @param options Script path, arguments, working directory and optional stdin payload.
 * @returns Everything the script wrote to stdout, decoded as UTF-8.
 * @throws Rejects with the spawn error when the process cannot be started, or with
 *         an `Error` containing the exit code and captured stderr when the script
 *         exits with a non-zero (or signal-terminated, i.e. `null`) code.
 */
export const spawnPythonScript = async (options: SpawnPythonOptions): Promise<string> => {
  const { scriptPath, args, cwd, inputContent } = options

  return new Promise<string>((resolve, reject) => {
    const pythonProcess = spawn('python', ['-X', 'utf8', scriptPath, ...args], {
      cwd,
      env: { ...process.env, PYTHONIOENCODING: 'utf-8', PYTHONUTF8: '1' },
    })

    // A failed spawn emits 'error' and may then emit 'close'; settle exactly once so
    // the failure is not logged and rejected a second time with an empty stderr.
    let settled = false
    let stdout = ''
    let stderr = ''

    // Let the stream decode UTF-8 statefully. Calling toString() on each raw chunk
    // corrupts any multi-byte character that happens to straddle a chunk boundary.
    pythonProcess.stdout.setEncoding('utf8')
    pythonProcess.stderr.setEncoding('utf8')

    pythonProcess.stdout.on('data', (chunk: string) => {
      stdout += chunk
    })

    pythonProcess.stderr.on('data', (chunk: string) => {
      stderr += chunk
    })

    pythonProcess.on('close', (code) => {
      if (settled) return
      settled = true
      if (code !== 0) {
        logger.error('Python script error:', stderr)
        reject(new Error(`Process exited with code ${code}. Error: ${stderr}`))
      } else {
        resolve(stdout)
      }
    })

    pythonProcess.on('error', (err) => {
      if (settled) return
      settled = true
      reject(err)
    })

    // If the child exits before reading its input, the write fails with EPIPE on the
    // stdin stream. Without a listener that would be an uncaught exception; the
    // outcome is already reported through 'close' / 'error' above.
    pythonProcess.stdin.on('error', () => {})

    if (inputContent !== undefined) {
      pythonProcess.stdin.write(inputContent)
      pythonProcess.stdin.end()
    }
  })
}
|
||||
|
||||
/** One substitution to perform on a Markdown string; consumed by `applyReplacements`. */
export interface ImageReplacement {
  /** Offset of the first character to replace (inclusive). */
  start: number
  /** Offset just past the last character to replace (exclusive). */
  end: number
  /** Text expected at `[start, end)`; informational only, `applyReplacements` does not read it. */
  original: string
  /** Text inserted in place of the `[start, end)` range. */
  replacement: string
}
|
||||
|
||||
/**
 * Locate the destination of every Markdown image (`![alt](destination)`) in `md`.
 *
 * The destination is everything between the opening `(` and its matching `)`;
 * nested parentheses are balanced, so `![a](f(1).png)` yields `f(1).png`. Any
 * title text inside the parentheses is part of the returned `url`.
 *
 * A candidate whose parentheses never balance is skipped on its own; scanning
 * resumes right after its `(` so that well-formed images later in the text are
 * still reported.
 *
 * @param md Markdown source text.
 * @returns Matches in document order. `start`/`end` are offsets into `md` such
 *          that `md.slice(start, end) === url`.
 */
export const findImageDestinations = (md: string): Array<{ url: string; start: number; end: number }> => {
  const results: Array<{ url: string; start: number; end: number }> = []
  let i = 0
  while (i < md.length) {
    const bang = md.indexOf('![', i)
    if (bang === -1) break
    const closeBracket = md.indexOf(']', bang + 2)
    // No `]` anywhere after this point: no later image can be complete either.
    if (closeBracket === -1) break
    if (md[closeBracket + 1] !== '(') {
      // `![...]` without a destination (e.g. a reference-style image): move on.
      i = closeBracket + 1
      continue
    }

    const urlStart = closeBracket + 2
    let depth = 1
    let j = urlStart
    for (; j < md.length; j++) {
      const ch = md[j]
      if (ch === '(') depth++
      else if (ch === ')') {
        depth--
        if (depth === 0) break
      }
    }
    if (depth !== 0) {
      // Unbalanced destination: drop only this candidate instead of aborting the
      // whole scan, otherwise every subsequent image would be silently missed.
      i = urlStart
      continue
    }
    results.push({ url: md.slice(urlStart, j), start: urlStart, end: j })
    i = j + 1
  }
  return results
}
|
||||
|
||||
/**
 * Apply a set of range replacements to `md` and return the resulting text.
 *
 * Replacements are applied from the highest `start` to the lowest, so the offsets
 * of the ranges still to be processed are never shifted by an earlier splice.
 * The input array is left untouched.
 *
 * @param md Original Markdown text that the offsets refer to.
 * @param replacements Ranges (`start` inclusive, `end` exclusive) and their new text.
 * @returns `md` with every range substituted.
 */
export const applyReplacements = (md: string, replacements: ImageReplacement[]): string => {
  const rightToLeft = replacements.slice().sort((lhs, rhs) => rhs.start - lhs.start)
  return rightToLeft.reduce(
    (text, { start, end, replacement }) => text.slice(0, start) + replacement + text.slice(end),
    md
  )
}
|
||||
|
||||
/** True when `target` lies strictly inside the directory `root` (lexical check; symlinks are not resolved). */
const isPathInsideDir = (root: string, target: string): boolean => {
  const rel = path.relative(path.resolve(root), path.resolve(target))
  // '' means target is the root itself; an absolute result means a different drive/volume.
  return rel !== '' && rel !== '..' && !rel.startsWith(`..${path.sep}`) && !path.isAbsolute(rel)
}

/** True when `candidate` exists and is a regular file; any stat failure counts as "no". */
const isExistingFile = async (candidate: string): Promise<boolean> => {
  try {
    const stats = await fs.stat(candidate)
    return stats.isFile()
  } catch {
    return false
  }
}

/**
 * Copy an image referenced by a converted document into the notebook's images
 * directory and return the link that should replace the original reference.
 *
 * The reference is trimmed, stripped of surrounding `<`/`>`, URI-decoded and
 * normalised to forward slashes. References starting with `/` are looked up
 * relative to `jobDir` and then `htmlDir`; all others relative to `htmlDir`
 * and then `jobDir`.
 *
 * Because the reference originates from document content, only regular files
 * located inside `jobDir` or `htmlDir` are accepted. References that escape
 * those directories (for example `../../secret` or an absolute path elsewhere)
 * are treated as not found.
 *
 * @param src Raw image destination as it appears in the Markdown.
 * @param jobDir Scratch directory of the current job.
 * @param htmlDir Directory of the converted document that relative links refer to.
 * @param destImagesDir Directory the image is copied into.
 * @param imagesSubDir `/`-separated sub-path used to build the returned link.
 * @param now Job timestamp; used as the base name of the copied file.
 * @returns `{ newLink }` of the form `/<imagesSubDir>/<file>`, or `null` when the
 *          reference is empty or no acceptable local file was found.
 * @throws Propagates failures from `getUniqueFilename` and `fs.copyFile`.
 */
export const copyLocalImage = async (
  src: string,
  jobDir: string,
  htmlDir: string,
  destImagesDir: string,
  imagesSubDir: string,
  now: Date
): Promise<{ newLink: string } | null> => {
  const s0 = src.trim().replace(/^<|>$/g, '')
  if (!s0) return null

  let decoded = s0
  try {
    decoded = decodeURI(s0)
  } catch {
    // Malformed percent-escapes: fall back to the raw text rather than failing the job.
  }

  const s1 = decoded.replace(/\\/g, '/')
  const s2 = s1.startsWith('./') ? s1.slice(2) : s1
  const candidates = s2.startsWith('/')
    ? [path.join(jobDir, s2.slice(1)), path.join(htmlDir, s2.slice(1))]
    : [path.resolve(htmlDir, s2), path.resolve(jobDir, s2)]

  let foundFile: string | null = null
  for (const candidate of candidates) {
    // Reject anything that resolves outside the job's own directories.
    if (!isPathInsideDir(jobDir, candidate) && !isPathInsideDir(htmlDir, candidate)) continue
    // Directories and other non-files would make copyFile throw; skip them.
    if (await isExistingFile(candidate)) {
      foundFile = candidate
      break
    }
  }

  if (!foundFile) return null

  const ext = path.extname(foundFile) || '.jpg'
  const baseName = formatTimestamp(now)
  const newFilename = await getUniqueFilename(destImagesDir, baseName, ext)
  const newPath = path.join(destImagesDir, newFilename)
  await fs.copyFile(foundFile, newPath)

  return { newLink: `/${imagesSubDir}/${newFilename}` }
}
|
||||
|
||||
/**
 * Best-effort removal of a job's temporary artefacts.
 *
 * Deletes `jobDir` recursively, then unlinks each entry of `additionalPaths` one
 * after another. Every failure is deliberately ignored so that cleanup can never
 * mask the outcome of the job itself.
 *
 * @param jobDir Scratch directory of the job.
 * @param additionalPaths Extra files to delete; defaults to none.
 */
export const cleanupJob = async (jobDir: string, additionalPaths: string[] = []): Promise<void> => {
  try {
    await fs.rm(jobDir, { recursive: true, force: true })
  } catch {
    // Ignored on purpose: cleanup is best effort.
  }

  for (const extraPath of additionalPaths) {
    try {
      await fs.unlink(extraPath)
    } catch {
      // Ignored on purpose: the file may already be gone.
    }
  }
}
|
||||
|
||||
/**
 * Build the path of a bundled tool script: `<PROJECT_ROOT>/tools/<toolName>/<scriptName>`.
 *
 * @param toolName Folder of the tool under `tools/`.
 * @param scriptName File name of the script inside that folder.
 * @returns The joined path; its existence is not checked here.
 */
export const getScriptPath = (toolName: string, scriptName: string): string =>
  path.join(PROJECT_ROOT, 'tools', toolName, scriptName)
|
||||
|
||||
/**
 * Report whether something exists at `scriptPath`.
 *
 * This is a synchronous existence check only: nothing is created, and a directory
 * at that path also yields `true`.
 *
 * @param scriptPath Path to probe.
 * @returns `true` when the path exists, otherwise `false`.
 */
export const ensureScriptExists = (scriptPath: string): boolean => existsSync(scriptPath)
|
||||
Reference in New Issue
Block a user