Files
XCDesktop/tools/blog/parse_blog.py

160 lines
4.8 KiB
Python
Raw Normal View History

2026-03-08 01:34:54 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
知乎博客HTML解析为Markdown工具
功能将下载的知乎HTML文件解析为标准Markdown格式
使用python parse_blog.py <html_file_path>
"""
import os
import sys
from bs4 import BeautifulSoup
from markdownify import markdownify as md
def _extract_content(soup):
    """Locate the article body in *soup* and convert it to Markdown.

    Falls back to joining the text of all <p> tags when no known
    Zhihu container class is present.
    """
    # Common Zhihu article containers, most specific first. Only the first
    # match is used, so look them up lazily instead of running every find().
    # NOTE(review): the multi-class string matches the exact class attribute
    # value — confirm against real saved pages that this is not too strict.
    candidates = [
        ('div', 'Post-RichTextContainer'),
        ('div', 'RichText ztext Post-RichText'),
        ('article', 'Post-content'),
        ('div', 'Post-content'),
    ]
    container = None
    for tag_name, css_class in candidates:
        container = soup.find(tag_name, class_=css_class)
        if container:
            break

    if container is None:
        # No recognised container: degrade to plain paragraph text.
        paragraphs = soup.find_all('p')
        return "\n".join([p.get_text(strip=True) for p in paragraphs])

    # Remove elements that produce no useful Markdown output.
    for element in container.find_all(['script', 'style', 'iframe', 'noscript']):
        element.decompose()

    # Rewrite Zhihu-internal links ("/p/...") to absolute URLs; all other
    # hrefs (and all <img> src values) are left untouched.
    for anchor in container.find_all('a'):
        href = anchor.get('href', '')
        if href.startswith('/'):
            anchor['href'] = f"https://www.zhihu.com{href}"

    return md(str(container),
              heading_style="ATX",
              code_language="",
              wrap_width=0)


def _clean_markdown(content):
    """Normalize blank lines in converted Markdown text."""
    # Drop whitespace-only lines but keep genuinely empty ones.
    content = '\n'.join(line for line in content.split('\n')
                        if line.strip() or line == '')
    # Collapse any run of 3+ consecutive newlines down to a blank line.
    while '\n\n\n' in content:
        content = content.replace('\n\n\n', '\n\n')
    return content


def parse_zhihu_blog(html_file):
    """Parse a Zhihu blog HTML file and convert it to Markdown.

    Args:
        html_file: Path to the saved HTML file (UTF-8).

    Returns:
        str: Converted Markdown ("# <title>\\n\\n<body>"), or "" on any
        failure (the error is printed, not raised, so the CLI can report it).
    """
    try:
        with open(html_file, 'r', encoding='utf-8') as f:
            html_content = f.read()
        soup = BeautifulSoup(html_content, 'lxml')

        # Title: prefer Zhihu's dedicated heading class, fall back to any <h1>.
        title = ""
        title_tag = soup.find('h1', class_='Post-Title') or soup.find('h1')
        if title_tag:
            title = title_tag.get_text(strip=True)

        content = _clean_markdown(_extract_content(soup))
        return f"# {title}\n\n{content}"
    except Exception as e:
        # Best-effort tool: report the error and signal failure with "".
        print(f"解析出错: {e}")
        return ""
def save_markdown(content, output_file):
    """Write Markdown text to a file.

    Args:
        content: Markdown text to persist.
        output_file: Destination path (written as UTF-8).
    """
    try:
        with open(output_file, 'w', encoding='utf-8') as fh:
            fh.write(content)
        print(f"Markdown文件已保存: {output_file}")
    except Exception as err:
        print(f"保存文件出错: {err}")
def main():
    """CLI entry point: convert one HTML file (argv[1]) to a sibling .md file."""
    # Exactly one argument (the HTML path) is required.
    if len(sys.argv) != 2:
        print("使用方法: python parse_blog.py <html_file_path>")
        sys.exit(1)

    html_file = sys.argv[1]
    if not os.path.exists(html_file):
        print(f"文件不存在: {html_file}")
        sys.exit(1)

    # Output path: same basename as the input, with a .md extension.
    output_file = f"{os.path.splitext(html_file)[0]}.md"

    print(f"正在解析: {html_file}")
    markdown_content = parse_zhihu_blog(html_file)
    if markdown_content:
        save_markdown(markdown_content, output_file)
    else:
        print("解析失败无法生成Markdown文件")
if __name__ == "__main__":
main()