#!/usr/bin/env python
# coding=utf-8
"""Extract a Transcription URL from a saved response dump, download the
transcript JSON, and stitch its words into numbered sentences."""

import os
import json
import requests
import datetime
def read_a_txt(file_path=r'd:\Xuanchi\高斯泼溅\XCNote\tools\tongyi\a.txt'):
    """Read and parse the response dump, extracting the Transcription link.

    The dump is the raw service response prefixed with the label
    "响应数据: "; that label is stripped before JSON parsing.

    Args:
        file_path: Path to the response dump. Defaults to the original
            hard-coded location so existing callers are unaffected.

    Returns:
        The Transcription URL string found at Data -> Result -> Transcription,
        or None if the file cannot be read/parsed or the key is absent.
    """
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            # Strip the leading "响应数据: " label so the rest is valid JSON.
            content = f.read().replace('响应数据: ', '')
        data = json.loads(content)

        # Missing intermediate keys fall through to None via .get chaining.
        transcription_url = data.get('Data', {}).get('Result', {}).get('Transcription')
        if transcription_url:
            print(f"成功提取 Transcription 链接: {transcription_url}")
            return transcription_url
        print("无法找到 Transcription 链接")
        return None
    except Exception as e:
        # Best-effort tool: report and return None rather than crash.
        print(f"读取 a.txt 文件失败: {e}")
        import traceback
        traceback.print_exc()
        return None
def download_transcription(url):
    """Fetch the transcription JSON from *url* and return its paragraphs.

    Args:
        url: The Transcription result URL extracted from the response dump.

    Returns:
        The list found under Transcription -> Paragraphs in the response
        body, or an empty list on any download/parse failure.
    """
    try:
        print(f"开始下载 Transcription 内容: {url}")
        resp = requests.get(url, timeout=30)
        resp.raise_for_status()

        payload = resp.json()
        paragraphs = payload.get('Transcription', {}).get('Paragraphs', [])

        print(f"成功下载并解析 Transcription 内容,共 {len(paragraphs)} 个段落")
        return paragraphs
    except Exception as e:
        # Report the failure but keep the pipeline alive with an empty result.
        print(f"下载 Transcription 内容失败: {e}")
        import traceback
        traceback.print_exc()
        return []
def process_transcription(paragraphs):
    """Merge word entries into sentences grouped by SentenceId.

    Within each paragraph, words sharing a SentenceId are concatenated in
    input order; a sentence's Start comes from its first word and End from
    its last. Sentences are emitted in ascending SentenceId order per
    paragraph, numbered by a global 1-based counter.

    Args:
        paragraphs: List of paragraph dicts, each with a 'Words' list whose
            items carry 'SentenceId', 'Text', 'Start' and 'End'.

    Returns:
        Dict mapping the sentence counter to {"Start", "End", "Text"}
        entries; {} on any failure.
    """
    try:
        ordered = {}
        counter = 1

        for para in paragraphs:
            # Group this paragraph's words by their SentenceId.
            grouped = {}
            for token in para.get('Words', []):
                sid = token.get('SentenceId')
                entry = grouped.setdefault(
                    sid,
                    {'words': [], 'start': token.get('Start'), 'end': token.get('End')},
                )
                entry['words'].append(token.get('Text'))
                # The last word seen for a sentence defines its end time.
                entry['end'] = token.get('End')

            # Emit the paragraph's sentences in SentenceId order.
            for sid in sorted(grouped):
                info = grouped[sid]
                text = ''.join(info['words'])
                if not text:
                    # Skip sentences that concatenate to nothing.
                    continue
                ordered[counter] = {
                    "Start": info['start'],
                    "End": info['end'],
                    "Text": text,
                }
                counter += 1

        print(f"成功处理 Transcription 内容,共 {len(ordered)} 个句子")
        return ordered
    except Exception as e:
        print(f"处理 Transcription 内容失败: {e}")
        import traceback
        traceback.print_exc()
        return {}
def main():
    """Drive the pipeline: extract the link, download, process, and print."""
    print("===== 开始处理 Transcription 结果 =====")

    # Step 1: pull the Transcription URL out of a.txt; bail if absent.
    link = read_a_txt()
    if not link:
        print("无法提取 Transcription 链接,程序退出")
        return

    # Step 2: fetch the paragraph data from the service.
    paras = download_transcription(link)
    if not paras:
        print("无法获取 Transcription 内容,程序退出")
        return

    # Step 3: stitch the words into numbered sentences.
    sentences = process_transcription(paras)
    if not sentences:
        print("处理 Transcription 内容失败,程序退出")
        return

    # Step 4: dump the final dictionary as pretty-printed JSON.
    print(f"\n===== 处理完成 =====")
    print(f"整理结果: ")
    print(json.dumps(sentences, indent=4, ensure_ascii=False))
# Script entry point: run the pipeline only when executed directly.
if __name__ == "__main__":
    main()