Initial commit
This commit is contained in:
217
tools/tongyi/MD.py
Normal file
217
tools/tongyi/MD.py
Normal file
@@ -0,0 +1,217 @@
|
||||
#!/usr/bin/env python
|
||||
#coding=utf-8
|
||||
|
||||
import datetime
import json
import os
import traceback

from PptExtraction import main as ppt_main
from Transcription import main as transcription_main
|
||||
|
||||
|
||||
def get_ppt_result():
    """Download the PPT key-frame images and index them by frame position.

    Uses the helpers exported by ``PptExtraction``: resolves the extraction
    URL, fetches the key-frame list, then downloads every frame that carries
    a ``FileUrl`` into the ``ppt_output`` directory.

    Returns:
        dict: Maps the 1-based frame position to a dict with the keys
        ``"Start"``, ``"End"``, ``"Type"`` (always ``"image"``) and
        ``"Content"`` (a Markdown image reference to the downloaded file).
        Frames without a URL, or whose download yields a falsy result, are
        skipped. An empty dict is returned on any failure.
    """
    # Hard-coded, Windows-specific output location used by this tool.
    save_dir = r'd:\Xuanchi\高斯泼溅\XCNote\tools\tongyi\ppt_output'

    try:
        # Imported lazily so a missing/broken helper module is reported
        # through the same failure path as any other error.
        from PptExtraction import read_a_txt, download_ppt_extraction, download_image

        # 1. Resolve the PptExtraction URL (the helper reads it from a.txt).
        ppt_extraction_url = read_a_txt()
        if not ppt_extraction_url:
            print("无法提取 PptExtraction 链接")
            return {}

        # 2. Download and parse the key-frame list.
        key_frame_list = download_ppt_extraction(ppt_extraction_url)
        if not key_frame_list:
            print("无法获取 PPT 内容")
            return {}

        # 3. One timestamp (YYYYmmdd_HHMMSS) shared by all images of this run.
        timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')
        print(f"生成时间戳: {timestamp}")

        # 4. Make sure the output directory exists; exist_ok guards against
        #    another process creating it between the check and the call.
        if not os.path.exists(save_dir):
            os.makedirs(save_dir, exist_ok=True)
            print(f"创建保存目录: {save_dir}")

        # 5. Download each frame image and record it.
        result_dict = {}
        for index, frame in enumerate(key_frame_list, start=1):
            image_url = frame.get('FileUrl')
            if not image_url:
                continue

            image_filename = download_image(image_url, save_dir, timestamp, index)
            if not image_filename:
                continue

            result_dict[index] = {
                "Start": frame.get('Start'),
                "End": frame.get('End'),
                "Type": "image",
                # Previously an empty string, which dropped the downloaded
                # file from the merged document.
                # NOTE(review): assumes download_image returns the bare file
                # name it saved under save_dir, and that the Markdown file is
                # written to the parent of ppt_output - confirm.
                "Content": f"![PPT {index}](ppt_output/{image_filename})",
            }

        print(f"成功获取 PPT 结果,共 {len(result_dict)} 张图片")
        return result_dict
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller go on
        # with an empty result.
        print(f"获取 PPT 结果失败: {e}")
        traceback.print_exc()
        return {}
|
||||
|
||||
|
||||
def get_transcription_result():
    """Download the transcription and normalise it to the merge format.

    Uses the helpers exported by ``Transcription``: resolves the
    transcription URL, downloads the paragraphs and lets
    ``process_transcription`` turn them into a dict of entries.

    Returns:
        dict: The dict produced by ``process_transcription``, with every
        entry given ``"Type": "text"`` and its ``"Text"`` value moved to
        ``"Content"``. An empty dict is returned on any failure.
    """
    try:
        # Imported lazily so a missing/broken helper module is reported
        # through the same failure path as any other error.
        from Transcription import read_a_txt, download_transcription, process_transcription

        # 1. Resolve the Transcription URL (the helper reads it from a.txt).
        transcription_url = read_a_txt()
        if not transcription_url:
            print("无法提取 Transcription 链接")
            return {}

        # 2. Download and parse the transcription paragraphs.
        paragraphs = download_transcription(transcription_url)
        if not paragraphs:
            print("无法获取 Transcription 内容")
            return {}

        # 3. Convert the paragraphs into result entries; tolerate a helper
        #    that returns None instead of an empty dict.
        result_dict = process_transcription(paragraphs) or {}

        # Rename "Text" to "Content" and tag each entry as text. A missing
        # "Text" yields empty content instead of aborting the whole run.
        for entry in result_dict.values():
            entry["Type"] = "text"
            entry["Content"] = entry.pop("Text", "")

        print(f"成功获取 Transcription 结果,共 {len(result_dict)} 个句子")
        return result_dict
    except Exception as e:
        # Best-effort boundary: report the problem and let the caller go on
        # with an empty result.
        print(f"获取 Transcription 结果失败: {e}")
        traceback.print_exc()
        return {}
|
||||
|
||||
|
||||
def merge_results(ppt_result, transcription_result):
    """Merge slide and transcript entries into one chronologically sorted list.

    Args:
        ppt_result: Dict of slide entries; each value must provide
            ``"Start"``, ``"End"``, ``"Type"`` and ``"Content"``.
        transcription_result: Dict of transcript entries with the same keys.

    Returns:
        list: Dicts with the keys ``"id"`` (``ppt_<key>`` / ``trans_<key>``),
        ``"start"``, ``"end"``, ``"type"`` and ``"content"``, ordered by
        start time. For equal start times images come before text. Entries
        whose start is ``None`` are placed at the end instead of breaking the
        sort. An empty list is returned on failure.
    """
    try:
        items = []
        sources = (("ppt", ppt_result), ("trans", transcription_result))
        for prefix, entries in sources:
            # "or {}" lets a caller pass None for a source that produced nothing.
            for key, value in (entries or {}).items():
                items.append({
                    "id": f"{prefix}_{key}",
                    "start": value["Start"],
                    "end": value["End"],
                    "type": value["Type"],
                    "content": value["Content"],
                })

        def sort_key(item):
            start = item["start"]
            # None cannot be compared with numbers; flag it so such items
            # sort last, and substitute 0 to keep the tuple comparable.
            return (
                start is None,
                0 if start is None else start,
                item["type"] != "image",  # False < True: images first on ties
            )

        items.sort(key=sort_key)

        print(f"成功拼合结果,共 {len(items)} 个项目")
        return items
    except Exception as e:
        # Best-effort boundary: callers treat an empty list as "nothing to write".
        print(f"拼合结果失败: {e}")
        traceback.print_exc()
        return []
|
||||
|
||||
|
||||
def generate_md(items, output_dir=None):
    """Write the merged items to a timestamped Markdown file.

    The file is named ``<YYYYmmdd_HHMMSS>_merged.md`` and contains a title,
    the generation time and then the ``"content"`` of every item, each
    followed by a blank line.

    Args:
        items: Iterable of dicts as returned by ``merge_results``; only the
            ``"content"`` value of each item is written.
        output_dir: Directory to write the file into. When omitted, the
            tool's original hard-coded directory is used.

    Returns:
        The generated file name (not the full path), or ``None`` if the
        document could not be written.
    """
    if output_dir is None:
        # Original hard-coded, Windows-specific output location.
        output_dir = r'd:\Xuanchi\高斯泼溅\XCNote\tools\tongyi'

    try:
        # Take the time once so the file name and the header always agree.
        now = datetime.datetime.now()
        file_stamp = now.strftime('%Y%m%d_%H%M%S')
        header_stamp = now.strftime('%Y-%m-%d %H:%M:%S')

        md_filename = f"{file_stamp}_merged.md"
        md_path = os.path.join(output_dir, md_filename)

        # Images and text are emitted the same way: the content already
        # holds the Markdown to write, so no per-type branching is needed.
        parts = ["# 拼合内容\n\n", f"生成时间: {header_stamp}\n\n"]
        parts.extend(f"{item['content']}\n\n" for item in items)

        with open(md_path, 'w', encoding='utf-8') as f:
            f.write("".join(parts))

        print(f"成功生成 md 文档: {md_filename}")
        return md_filename
    except Exception as e:
        # Best-effort boundary: the caller checks for None.
        print(f"生成 md 文档失败: {e}")
        traceback.print_exc()
        return None
|
||||
|
||||
|
||||
def main():
    """Run the whole pipeline: fetch slides, fetch transcript, merge, write Markdown.

    Progress and the final outcome are reported on stdout. The function
    returns ``None`` in every case; it stops early when merging yields no
    items.
    """
    print("===== 开始生成拼合 MD 文档 =====")

    # 1. Slide images.
    print("\n===== 获取 PPT 结果 =====")
    ppt_result = get_ppt_result()

    # 2. Transcript sentences.
    print("\n===== 获取 Transcription 结果 =====")
    transcription_result = get_transcription_result()

    # 3. Merge both sources in chronological order.
    print("\n===== 拼合结果 =====")
    merged_items = merge_results(ppt_result, transcription_result)

    if not merged_items:
        print("无法生成拼合结果,程序退出")
        return

    # 4. Write the Markdown document.
    print("\n===== 生成 MD 文档 =====")
    md_filename = generate_md(merged_items)

    if md_filename:
        print("\n===== 处理完成 =====")
        print(f"生成的 MD 文档: {md_filename}")
        # Raw string: in a normal literal the "\t" in "\tools" and "\tongyi"
        # is a TAB escape, which garbled the printed path.
        output_dir = r'd:\Xuanchi\高斯泼溅\XCNote\tools\tongyi'
        print(f"保存位置: {os.path.join(output_dir, md_filename)}")
    else:
        print("生成 MD 文档失败,程序退出")
|
||||
|
||||
|
||||
# Run the pipeline only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user