Files
XCDesktop/tools/blog/parse_blog.py
2026-03-08 01:34:54 +08:00

160 lines
4.8 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
知乎博客HTML解析为Markdown工具
功能将下载的知乎HTML文件解析为标准Markdown格式
使用python parse_blog.py <html_file_path>
"""
import os
import sys
from bs4 import BeautifulSoup
from markdownify import markdownify as md
def parse_zhihu_blog(html_file):
    """Parse a downloaded Zhihu blog HTML file and convert it to Markdown.

    Args:
        html_file: Path to the HTML file to parse.

    Returns:
        str: Markdown text of the form ``"# <title>\\n\\n<content>"``,
        or an empty string if parsing fails.
    """
    try:
        # Read the saved page (Zhihu pages are UTF-8 encoded).
        with open(html_file, 'r', encoding='utf-8') as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'lxml')

        # Title: prefer Zhihu's dedicated class, fall back to the first <h1>.
        title = ""
        title_tag = soup.find('h1', class_='Post-Title') or soup.find('h1')
        if title_tag:
            title = title_tag.get_text(strip=True)

        # Known containers for the Zhihu article body, in order of preference;
        # `or` short-circuits on the first one actually present in the page.
        content_container = (
            soup.find('div', class_='Post-RichTextContainer')
            or soup.find('div', class_='RichText ztext Post-RichText')
            or soup.find('article', class_='Post-content')
            or soup.find('div', class_='Post-content')
        )

        content = ""
        if content_container:
            # Drop elements that carry no article content.
            for element in content_container.find_all(
                    ['script', 'style', 'iframe', 'noscript']):
                element.decompose()

            # Rewrite Zhihu-internal links to absolute URLs.
            # BUGFIX: protocol-relative URLs ('//host/path') also start with
            # '/', and the original code mangled them into
            # 'https://www.zhihu.com//host/path' — exclude them explicitly.
            for a in content_container.find_all('a'):
                href = a.get('href', '')
                if href.startswith('/') and not href.startswith('//'):
                    a['href'] = f"https://www.zhihu.com{href}"

            # Convert the container's HTML to Markdown ('#'-style headings,
            # no forced line wrapping).
            content = md(str(content_container),
                         heading_style="ATX",
                         code_language="",
                         wrap_width=0)
        else:
            # No known container found: best-effort fallback that joins the
            # text of every <p> tag on the page.
            paragraphs = soup.find_all('p')
            content = "\n".join(p.get_text(strip=True) for p in paragraphs)

        # Post-processing: drop whitespace-only lines (truly empty lines are
        # kept so paragraph breaks survive)...
        content = '\n'.join(
            line for line in content.split('\n') if line.strip() or line == '')
        # ...then collapse runs of blank lines down to a single blank line.
        while '\n\n\n' in content:
            content = content.replace('\n\n\n', '\n\n')

        return f"# {title}\n\n{content}"
    except Exception as e:
        # Best-effort CLI tool: report the error and return "" so the caller
        # can detect failure without an exception crossing the boundary.
        print(f"解析出错: {e}")
        return ""
def save_markdown(content, output_file):
    """Write Markdown text to a file (UTF-8).

    Args:
        content: Markdown text to write.
        output_file: Destination file path.
    """
    try:
        with open(output_file, 'w', encoding='utf-8') as out_fh:
            out_fh.write(content)
        print(f"Markdown文件已保存: {output_file}")
    except Exception as err:
        # Report the failure instead of raising; this tool is best-effort.
        print(f"保存文件出错: {err}")
def main():
    """CLI entry point: convert the HTML file named on the command line.

    Exits with status 1 on bad usage or a missing input file; otherwise
    writes a sibling ``.md`` file next to the input.
    """
    # Guard clauses: exactly one argument, and it must name an existing file.
    if len(sys.argv) != 2:
        print("使用方法: python parse_blog.py <html_file_path>")
        sys.exit(1)

    html_file = sys.argv[1]
    if not os.path.exists(html_file):
        print(f"文件不存在: {html_file}")
        sys.exit(1)

    # Output path: same base name as the input, with a .md extension.
    output_file = os.path.splitext(html_file)[0] + ".md"

    print(f"正在解析: {html_file}")
    markdown_content = parse_zhihu_blog(html_file)
    if not markdown_content:
        print("解析失败无法生成Markdown文件")
        return
    save_markdown(markdown_content, output_file)
# Run only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()