160 lines
4.8 KiB
Python
160 lines
4.8 KiB
Python
|
|
#!/usr/bin/env python3
|
|||
|
|
# -*- coding: utf-8 -*-
|
|||
|
|
"""
Zhihu blog HTML-to-Markdown converter.

Purpose: parse a downloaded Zhihu HTML file into standard Markdown.
Usage: python parse_blog.py <html_file_path>
"""
|
|||
|
|
|
|||
|
|
import os
|
|||
|
|
import sys
|
|||
|
|
from bs4 import BeautifulSoup
|
|||
|
|
from markdownify import markdownify as md
|
|||
|
|
|
|||
|
|
|
|||
|
|
def _absolutize_zhihu_url(url):
    """Make a site-relative or protocol-relative URL absolute.

    ``//host/path`` gets an ``https:`` scheme, ``/path`` is rooted at
    ``https://www.zhihu.com``, and any other value is returned unchanged.
    """
    # Check '//' first: it also starts with '/', and treating it as a
    # site-relative path would produce "https://www.zhihu.com//host/...".
    if url.startswith('//'):
        return f"https:{url}"
    if url.startswith('/'):
        return f"https://www.zhihu.com{url}"
    return url


def _collapse_blank_lines(text):
    """Blank out whitespace-only lines and squeeze blank-line runs to one."""
    # Whitespace-only lines are emptied rather than dropped, so the paragraph
    # break they represent survives.
    text = '\n'.join(line if line.strip() else '' for line in text.split('\n'))
    while '\n\n\n' in text:
        text = text.replace('\n\n\n', '\n\n')
    return text


def parse_zhihu_blog(html_file):
    """
    Parse a Zhihu blog HTML file and convert it to Markdown.

    The title is taken from ``h1.Post-Title`` (or the first ``h1``). The body
    is taken from the first matching known content container and converted
    with markdownify; if no container matches, the text of every non-empty
    ``<p>`` is used instead.

    Args:
        html_file: Path to the HTML file (read as UTF-8)

    Returns:
        str: The Markdown text, starting with a ``# title`` heading when a
        title was found. An empty string is returned when neither a title
        nor any content could be extracted, or when any error occurs (the
        error is printed to stdout).
    """
    try:
        with open(html_file, 'r', encoding='utf-8') as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, 'lxml')

        # Title: prefer the dedicated post-title heading, else any <h1>.
        title_tag = soup.find('h1', class_='Post-Title') or soup.find('h1')
        title = title_tag.get_text(strip=True) if title_tag else ""

        # Known article-body containers, tried in priority order; the search
        # stops at the first hit instead of running every lookup up front.
        container_selectors = (
            ('div', 'Post-RichTextContainer'),
            ('div', 'RichText ztext Post-RichText'),
            ('article', 'Post-content'),
            ('div', 'Post-content'),
        )
        content_container = None
        for tag_name, css_class in container_selectors:
            content_container = soup.find(tag_name, class_=css_class)
            if content_container:
                break

        if content_container:
            # Drop elements that carry no article text.
            for element in content_container.find_all(
                    ['script', 'style', 'iframe', 'noscript']):
                element.decompose()

            # Images: relative ('./...') and absolute ('http...') sources are
            # kept as-is; only protocol-relative sources get a scheme, since
            # they cannot be resolved from a local Markdown file.
            for img in content_container.find_all('img'):
                src = img.get('src', '')
                if src.startswith('//'):
                    img['src'] = f"https:{src}"

            # Links: make site-relative and protocol-relative hrefs absolute.
            for a in content_container.find_all('a'):
                href = a.get('href')
                if href:
                    a['href'] = _absolutize_zhihu_url(href)

            # Conversion options are passed through unchanged.
            content = md(str(content_container),
                         heading_style="ATX",
                         code_language="",
                         wrap_width=0)
        else:
            # No known container: fall back to the text of every <p>.
            # Paragraphs are separated by a blank line so that Markdown
            # renders them as distinct paragraphs.
            texts = (p.get_text(strip=True) for p in soup.find_all('p'))
            content = "\n\n".join(text for text in texts if text)

        content = _collapse_blank_lines(content)

        # Nothing extracted at all: report failure rather than returning a
        # bare "# " heading that the caller would save as a file.
        if not title and not content.strip():
            return ""

        if title:
            return f"# {title}\n\n{content}"
        return content

    except Exception as e:
        # Top-level boundary for the CLI: report the error and signal
        # failure with an empty result, which the caller checks for.
        print(f"解析出错: {e}")
        return ""
|
|||
|
|
|
|||
|
|
|
|||
|
|
def save_markdown(content, output_file):
    """
    Write Markdown text to a file and report the outcome on stdout.

    Any exception raised while writing is caught and printed; nothing is
    raised to the caller and nothing is returned.

    Args:
        content: Markdown text to write
        output_file: Destination path (opened for writing as UTF-8)
    """
    try:
        with open(output_file, mode='w', encoding='utf-8') as sink:
            sink.write(content)
        status = f"Markdown文件已保存: {output_file}"
        print(status)
    except Exception as err:
        print(f"保存文件出错: {err}")
|
|||
|
|
|
|||
|
|
|
|||
|
|
def main():
    """
    Command-line entry point: convert one HTML file to a sibling ``.md`` file.

    Expects exactly one command-line argument, the path to the HTML file.
    The output path is the input path with its extension replaced by
    ``.md``. Exits with status 1 on a usage error, when the input path does
    not exist, or when parsing produces no Markdown.
    """
    if len(sys.argv) != 2:
        print("使用方法: python parse_blog.py <html_file_path>")
        sys.exit(1)

    html_file = sys.argv[1]
    if not os.path.exists(html_file):
        print(f"文件不存在: {html_file}")
        sys.exit(1)

    # Output file sits next to the input, with the extension swapped to .md.
    output_file = f"{os.path.splitext(html_file)[0]}.md"

    print(f"正在解析: {html_file}")
    markdown_content = parse_zhihu_blog(html_file)

    if not markdown_content:
        print("解析失败,无法生成Markdown文件")
        # A failed conversion must be visible to shells and scripts, like
        # the other failure paths above.
        sys.exit(1)

    save_markdown(markdown_content, output_file)
|
|||
|
|
|
|||
|
|
|
|||
|
|
# Run the converter only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|