Files
XCDesktop/tools/tongyi/Transcription.py
2026-03-08 01:34:54 +08:00

136 lines
4.3 KiB
Python

#!/usr/bin/env python
#coding=utf-8
import os
import json
import requests
import datetime
def read_a_txt(file_path=r'd:\Xuanchi\高斯泼溅\XCNote\tools\tongyi\a.txt'):
    """Read a saved API-response text file and extract the Transcription URL.

    The file is expected to contain a JSON document, optionally prefixed with
    the label "响应数据: ", shaped like:
    ``{"Data": {"Result": {"Transcription": "<url>"}}}``.

    Args:
        file_path: Path to the response file. Defaults to the original
            hard-coded location so existing callers keep working.

    Returns:
        The Transcription URL string, or ``None`` if the file cannot be
        read, is not valid JSON, or lacks the expected keys.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # Strip the leading response label before parsing as JSON.
            content = f.read().replace('响应数据: ', '')
        data = json.loads(content)
        transcription_url = data.get('Data', {}).get('Result', {}).get('Transcription')
        if transcription_url:
            print(f"成功提取 Transcription 链接: {transcription_url}")
            return transcription_url
        print("无法找到 Transcription 链接")
        return None
    except Exception as e:
        # Best-effort reader: report and return None rather than crash.
        print(f"读取 a.txt 文件失败: {e}")
        import traceback
        traceback.print_exc()
        return None
def download_transcription(url):
    """Fetch the transcription JSON document and return its paragraph list.

    Args:
        url: HTTP(S) link pointing at the transcription result document.

    Returns:
        The list found under ``Transcription -> Paragraphs`` in the fetched
        JSON, or an empty list on any network/parse failure.
    """
    try:
        print(f"开始下载 Transcription 内容: {url}")
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()
        payload = resp.json()
        paragraph_list = payload.get('Transcription', {}).get('Paragraphs', [])
        print(f"成功下载并解析 Transcription 内容,共 {len(paragraph_list)} 个段落")
        return paragraph_list
    except Exception as e:
        # Network / JSON errors are reported but never propagated.
        print(f"下载 Transcription 内容失败: {e}")
        import traceback
        traceback.print_exc()
        return []
def process_transcription(paragraphs):
    """Stitch per-word entries into sentences keyed by a running counter.

    Within each paragraph, words sharing a ``SentenceId`` are concatenated
    in order; a sentence keeps the ``Start`` of its first word and the
    ``End`` of its last. Sentences are emitted in ascending ``SentenceId``
    order and numbered globally starting at 1.

    Args:
        paragraphs: List of paragraph dicts, each with a ``Words`` list of
            dicts carrying ``SentenceId``, ``Text``, ``Start`` and ``End``.

    Returns:
        Dict mapping sentence number -> ``{"Start", "End", "Text"}``, or an
        empty dict on any processing failure.
    """
    try:
        result_dict = {}
        counter = 1
        for para in paragraphs:
            # Collect words per SentenceId, tracking start/end timestamps.
            grouped = {}
            for word in para.get('Words', []):
                sid = word.get('SentenceId')
                entry = grouped.get(sid)
                if entry is None:
                    grouped[sid] = {
                        'words': [word.get('Text')],
                        'start': word.get('Start'),
                        'end': word.get('End'),
                    }
                else:
                    entry['words'].append(word.get('Text'))
                    entry['end'] = word.get('End')
            # Emit sentences in SentenceId order; skip empty joins.
            for sid in sorted(grouped):
                info = grouped[sid]
                text = ''.join(info['words'])
                if not text:
                    continue
                result_dict[counter] = {
                    "Start": info['start'],
                    "End": info['end'],
                    "Text": text,
                }
                counter += 1
        print(f"成功处理 Transcription 内容,共 {len(result_dict)} 个句子")
        return result_dict
    except Exception as e:
        print(f"处理 Transcription 内容失败: {e}")
        import traceback
        traceback.print_exc()
        return {}
def main():
    """Entry point: read the saved response, download the transcription,
    assemble sentences, and print the result as pretty JSON.

    Exits early (after printing a message) if any pipeline stage yields
    nothing usable.
    """
    print("===== 开始处理 Transcription 结果 =====")
    # Stage 1: pull the transcription URL out of the saved response file.
    url = read_a_txt()
    if not url:
        print("无法提取 Transcription 链接,程序退出")
        return
    # Stage 2: fetch the transcription document.
    paras = download_transcription(url)
    if not paras:
        print("无法获取 Transcription 内容,程序退出")
        return
    # Stage 3: stitch words into numbered sentences.
    sentences = process_transcription(paras)
    if not sentences:
        print("处理 Transcription 内容失败,程序退出")
        return
    # Stage 4: dump the final mapping.
    print(f"\n===== 处理完成 =====")
    print(f"整理结果: ")
    print(json.dumps(sentences, indent=4, ensure_ascii=False))
if __name__ == "__main__":
    main()