#!/usr/bin/env python
# coding=utf-8
"""Extract a Transcription URL from a saved response dump, download the
transcript JSON, and stitch its words into numbered sentences."""

import os
import json
import requests
import datetime
def read_a_txt(file_path=r'd:\Xuanchi\高斯泼溅\XCNote\tools\tongyi\a.txt'):
    """Read and parse the response dump, extracting the Transcription link.

    The dump is the raw service response prefixed with the label
    "响应数据: "; that label is stripped before JSON parsing.

    Args:
        file_path: Path to the response dump. Defaults to the original
            hard-coded location so existing callers are unaffected.

    Returns:
        The Transcription URL string found at Data -> Result -> Transcription,
        or None if the file cannot be read/parsed or the key is absent.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # Strip the leading "响应数据: " label so the rest is valid JSON.
            content = f.read().replace('响应数据: ', '')
        data = json.loads(content)

        # Missing intermediate keys fall through to None via .get chaining.
        transcription_url = data.get('Data', {}).get('Result', {}).get('Transcription')
        if transcription_url:
            print(f"成功提取 Transcription 链接: {transcription_url}")
            return transcription_url
        print("无法找到 Transcription 链接")
        return None
    except Exception as e:
        # Best-effort tool: report and return None rather than crash.
        print(f"读取 a.txt 文件失败: {e}")
        import traceback
        traceback.print_exc()
        return None
def download_transcription(url):
    """Fetch the transcription JSON from *url* and return its paragraphs.

    Args:
        url: The Transcription result URL extracted from the response dump.

    Returns:
        The list found under Transcription -> Paragraphs in the response
        body, or an empty list on any download/parse failure.
    """
    try:
        print(f"开始下载 Transcription 内容: {url}")
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()

        payload = resp.json()
        paragraphs = payload.get('Transcription', {}).get('Paragraphs', [])

        print(f"成功下载并解析 Transcription 内容,共 {len(paragraphs)} 个段落")
        return paragraphs
    except Exception as e:
        # Report the failure but keep the pipeline alive with an empty result.
        print(f"下载 Transcription 内容失败: {e}")
        import traceback
        traceback.print_exc()
        return []
def process_transcription(paragraphs):
    """Merge word entries into sentences grouped by SentenceId.

    Within each paragraph, words sharing a SentenceId are concatenated in
    input order; a sentence's Start comes from its first word and End from
    its last. Sentences are emitted in ascending SentenceId order per
    paragraph, numbered by a global 1-based counter.

    Args:
        paragraphs: List of paragraph dicts, each with a 'Words' list whose
            items carry 'SentenceId', 'Text', 'Start' and 'End'.

    Returns:
        Dict mapping the sentence counter to {"Start", "End", "Text"}
        entries; {} on any failure.
    """
    try:
        ordered = {}
        counter = 1

        for para in paragraphs:
            # Group this paragraph's words by their SentenceId.
            grouped = {}
            for token in para.get('Words', []):
                sid = token.get('SentenceId')
                entry = grouped.setdefault(
                    sid,
                    {'words': [], 'start': token.get('Start'), 'end': token.get('End')},
                )
                entry['words'].append(token.get('Text'))
                # The last word seen for a sentence defines its end time.
                entry['end'] = token.get('End')

            # Emit the paragraph's sentences in SentenceId order.
            for sid in sorted(grouped):
                info = grouped[sid]
                text = ''.join(info['words'])
                if not text:
                    # Skip sentences that concatenate to nothing.
                    continue
                ordered[counter] = {
                    "Start": info['start'],
                    "End": info['end'],
                    "Text": text,
                }
                counter += 1

        print(f"成功处理 Transcription 内容,共 {len(ordered)} 个句子")
        return ordered
    except Exception as e:
        print(f"处理 Transcription 内容失败: {e}")
        import traceback
        traceback.print_exc()
        return {}
def main():
    """Drive the pipeline: extract the link, download, process, and print."""
    print("===== 开始处理 Transcription 结果 =====")

    # Step 1: pull the Transcription URL out of a.txt; bail if absent.
    link = read_a_txt()
    if not link:
        print("无法提取 Transcription 链接,程序退出")
        return

    # Step 2: fetch the paragraph data from the service.
    paras = download_transcription(link)
    if not paras:
        print("无法获取 Transcription 内容,程序退出")
        return

    # Step 3: stitch the words into numbered sentences.
    sentences = process_transcription(paras)
    if not sentences:
        print("处理 Transcription 内容失败,程序退出")
        return

    # Step 4: dump the final dictionary as pretty-printed JSON.
    print(f"\n===== 处理完成 =====")
    print(f"整理结果: ")
    print(json.dumps(sentences, indent=4, ensure_ascii=False))
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()