Files
XCDesktop/tools/blog/parse_blog.py

160 lines
4.8 KiB
Python
Raw Normal View History

2026-03-08 01:34:54 +08:00
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
知乎博客HTML解析为Markdown工具
功能将下载的知乎HTML文件解析为标准Markdown格式
使用python parse_blog.py <html_file_path>
"""
import os
import sys
from bs4 import BeautifulSoup
from markdownify import markdownify as md
def _extract_content(soup):
    """Locate the article body in *soup* and convert it to Markdown.

    Falls back to joining the text of all <p> tags when no known
    Zhihu container class is present.
    """
    # Common Zhihu article containers, most specific first. Only the first
    # match is used, so look them up lazily instead of running every find().
    # NOTE(review): the multi-class string matches the exact class attribute
    # value — confirm against real saved pages that this is not too strict.
    candidates = [
        ('div', 'Post-RichTextContainer'),
        ('div', 'RichText ztext Post-RichText'),
        ('article', 'Post-content'),
        ('div', 'Post-content'),
    ]
    container = None
    for tag_name, css_class in candidates:
        container = soup.find(tag_name, class_=css_class)
        if container:
            break

    if container is None:
        # No recognised container: degrade to plain paragraph text.
        paragraphs = soup.find_all('p')
        return "\n".join([p.get_text(strip=True) for p in paragraphs])

    # Remove elements that produce no useful Markdown output.
    for element in container.find_all(['script', 'style', 'iframe', 'noscript']):
        element.decompose()

    # Rewrite Zhihu-internal links ("/p/...") to absolute URLs; all other
    # hrefs (and all <img> src values) are left untouched.
    for anchor in container.find_all('a'):
        href = anchor.get('href', '')
        if href.startswith('/'):
            anchor['href'] = f"https://www.zhihu.com{href}"

    return md(str(container),
              heading_style="ATX",
              code_language="",
              wrap_width=0)


def _clean_markdown(content):
    """Normalize blank lines in converted Markdown text."""
    # Drop whitespace-only lines but keep genuinely empty ones.
    content = '\n'.join(line for line in content.split('\n')
                        if line.strip() or line == '')
    # Collapse any run of 3+ consecutive newlines down to a blank line.
    while '\n\n\n' in content:
        content = content.replace('\n\n\n', '\n\n')
    return content


def parse_zhihu_blog(html_file):
    """Parse a Zhihu blog HTML file and convert it to Markdown.

    Args:
        html_file: Path to the saved HTML file (UTF-8).

    Returns:
        str: Converted Markdown ("# <title>\\n\\n<body>"), or "" on any
        failure (the error is printed, not raised, so the CLI can report it).
    """
    try:
        with open(html_file, 'r', encoding='utf-8') as f:
            html_content = f.read()
        soup = BeautifulSoup(html_content, 'lxml')

        # Title: prefer Zhihu's dedicated heading class, fall back to any <h1>.
        title = ""
        title_tag = soup.find('h1', class_='Post-Title') or soup.find('h1')
        if title_tag:
            title = title_tag.get_text(strip=True)

        content = _clean_markdown(_extract_content(soup))
        return f"# {title}\n\n{content}"
    except Exception as e:
        # Best-effort tool: report the error and signal failure with "".
        print(f"解析出错: {e}")
        return ""
def save_markdown(content, output_file):
    """Write Markdown text to a file.

    Args:
        content: Markdown text to persist.
        output_file: Destination path (written as UTF-8).
    """
    try:
        with open(output_file, 'w', encoding='utf-8') as fh:
            fh.write(content)
        print(f"Markdown文件已保存: {output_file}")
    except Exception as err:
        print(f"保存文件出错: {err}")
def main():
    """CLI entry point: convert one HTML file (argv[1]) to a sibling .md file."""
    # Exactly one argument (the HTML path) is required.
    if len(sys.argv) != 2:
        print("使用方法: python parse_blog.py <html_file_path>")
        sys.exit(1)

    html_file = sys.argv[1]
    if not os.path.exists(html_file):
        print(f"文件不存在: {html_file}")
        sys.exit(1)

    # Output path: same basename as the input, with a .md extension.
    output_file = f"{os.path.splitext(html_file)[0]}.md"

    print(f"正在解析: {html_file}")
    markdown_content = parse_zhihu_blog(html_file)
    if markdown_content:
        save_markdown(markdown_content, output_file)
    else:
        print("解析失败无法生成Markdown文件")
if __name__ == "__main__":
main()