#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
知乎博客HTML解析为Markdown工具

功能:将下载的知乎HTML文件解析为标准Markdown格式
使用:python parse_blog.py <html_file_path>
"""

import os
import sys

from bs4 import BeautifulSoup
from markdownify import markdownify as md


def parse_zhihu_blog(html_file):
    """Parse a downloaded Zhihu blog HTML file and convert it to Markdown.

    Args:
        html_file: Path to the HTML file to parse.

    Returns:
        str: Markdown of the form ``# <title>\\n\\n<body>``, or an empty
        string if reading/parsing fails (the error is printed).
    """
    try:
        # Read the raw HTML.
        with open(html_file, 'r', encoding='utf-8') as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'lxml')

        # Title: prefer Zhihu's dedicated heading class, fall back to any <h1>.
        title = ""
        title_tag = soup.find('h1', class_='Post-Title') or soup.find('h1')
        if title_tag:
            title = title_tag.get_text(strip=True)

        # Common Zhihu article containers, most specific first.
        # Stop at the first match instead of eagerly running every find().
        selectors = [
            ('div', 'Post-RichTextContainer'),
            ('div', 'RichText ztext Post-RichText'),
            ('article', 'Post-content'),
            ('div', 'Post-content'),
        ]
        content_container = None
        for tag_name, class_name in selectors:
            content_container = soup.find(tag_name, class_=class_name)
            if content_container:
                break

        if content_container:
            # Remove elements that have no Markdown representation.
            for element in content_container.find_all(
                    ['script', 'style', 'iframe', 'noscript']):
                element.decompose()

            # NOTE: the original also looped over <img> tags but both
            # branches were `pass` (dead code); relative and absolute image
            # paths are kept as-is, which is what happened anyway.

            # Rewrite Zhihu-internal links ("/p/123...") to absolute URLs.
            for a in content_container.find_all('a'):
                href = a.attrs.get('href', '')
                if href.startswith('/'):
                    a['href'] = f"https://www.zhihu.com{href}"

            # Convert the container's HTML to Markdown.
            content = md(str(content_container),
                         heading_style="ATX",
                         code_language="",
                         wrap_width=0)
        else:
            # No known container: fall back to the text of all <p> tags.
            paragraphs = soup.find_all('p')
            content = "\n".join(p.get_text(strip=True) for p in paragraphs)

        # Post-process: drop whitespace-only lines (truly empty lines are
        # kept) and collapse runs of 3+ newlines down to exactly 2.
        content = '\n'.join(
            line for line in content.split('\n') if line.strip() or line == '')
        while '\n\n\n' in content:
            content = content.replace('\n\n\n', '\n\n')

        # Combine the title and body.
        return f"# {title}\n\n{content}"

    except Exception as e:
        # Best-effort CLI tool: report the error and return an empty result
        # so the caller can detect failure via falsiness.
        print(f"解析出错: {e}")
        return ""
def save_markdown(content, output_file):
    """Write Markdown text to *output_file* using UTF-8.

    Args:
        content: The Markdown text to persist.
        output_file: Destination file path.

    Prints a success message on completion; on failure the exception is
    caught and reported instead of propagating.
    """
    try:
        with open(output_file, 'w', encoding='utf-8') as fh:
            fh.write(content)
        print(f"Markdown文件已保存: {output_file}")
    except Exception as err:
        print(f"保存文件出错: {err}")
def main():
    """CLI entry point: validate argv, convert the HTML file, write the .md."""
    args = sys.argv
    if len(args) != 2:
        print("使用方法: python parse_blog.py <html_file_path>")
        sys.exit(1)

    html_file = args[1]
    if not os.path.exists(html_file):
        print(f"文件不存在: {html_file}")
        sys.exit(1)

    # Output path: same directory and base name as the input, .md extension.
    output_file = f"{os.path.splitext(html_file)[0]}.md"

    # Parse the HTML and convert it to Markdown.
    print(f"正在解析: {html_file}")
    markdown_content = parse_zhihu_blog(html_file)

    if not markdown_content:
        print("解析失败,无法生成Markdown文件")
    else:
        save_markdown(markdown_content, output_file)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()