#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Convert a downloaded Zhihu article HTML file to Markdown.

Usage: python parse_blog.py <html_file>

The Markdown is written next to the input file, with the input's
extension replaced by ``.md``.
"""

import os
import re
import sys

from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Prefix applied to site-relative links ("/people/...") found in the article.
ZHIHU_BASE_URL = "https://www.zhihu.com"

# (tag name, class attribute) pairs tried in order to locate the article body.
CONTENT_SELECTORS = (
    ("div", "Post-RichTextContainer"),
    ("div", "RichText ztext Post-RichText"),
    ("article", "Post-content"),
    ("div", "Post-content"),
)

# Elements removed from the article body before conversion.
UNWANTED_TAGS = ["script", "style", "iframe", "noscript"]


def _extract_title(soup):
    """Return the text of the title <h1>, or "" when the page has no <h1>."""
    title_tag = soup.find("h1", class_="Post-Title")
    if title_tag is None:
        title_tag = soup.find("h1")
    return title_tag.get_text(strip=True) if title_tag is not None else ""


def _find_content_container(soup):
    """Return the first element matching CONTENT_SELECTORS, or None."""
    for tag_name, css_class in CONTENT_SELECTORS:
        container = soup.find(tag_name, class_=css_class)
        if container is not None:
            return container
    return None


def _absolutize_links(container):
    """Rewrite relative <a href> values inside *container* in place."""
    for anchor in container.find_all("a", href=True):
        href = anchor["href"]
        if href.startswith("//"):
            # Protocol-relative URL: it already names a host, so only a
            # scheme is missing. Must be checked before the "/" case.
            anchor["href"] = f"https:{href}"
        elif href.startswith("/"):
            anchor["href"] = f"{ZHIHU_BASE_URL}{href}"


def _clean_markdown(text):
    """Normalise blank lines in *text* and trim surrounding newlines."""
    # Whitespace-only lines become truly empty lines instead of being
    # dropped, so the paragraphs on either side stay separated.
    lines = [line if line.strip() else "" for line in text.split("\n")]
    # Collapse runs of blank lines down to a single blank line.
    collapsed = re.sub(r"\n{3,}", "\n\n", "\n".join(lines))
    return collapsed.strip("\n")


def parse_zhihu_blog(html_file: str) -> str:
    """Parse a Zhihu article HTML file and convert it to Markdown.

    The title is taken from the ``h1.Post-Title`` element (or the first
    ``<h1>``). The body is taken from the first container matching
    CONTENT_SELECTORS and converted with markdownify; when no container
    is found, the text of every ``<p>`` on the page is used instead.

    Args:
        html_file: Path to the HTML file (read as UTF-8).

    Returns:
        The Markdown document, ending with a newline. An empty string is
        returned when neither a title nor any content could be extracted,
        or when reading/parsing raised an exception (the error is printed).
    """
    try:
        with open(html_file, "r", encoding="utf-8") as f:
            html_content = f.read()

        soup = BeautifulSoup(html_content, "lxml")
        title = _extract_title(soup)

        container = _find_content_container(soup)
        if container is not None:
            for element in container.find_all(UNWANTED_TAGS):
                element.decompose()
            # Image ``src`` values are left exactly as they appear in the file.
            _absolutize_links(container)
            content = md(
                str(container),
                heading_style="ATX",
                code_language="",
                wrap_width=0,
            )
        else:
            # Fallback: plain paragraph text, separated by blank lines so
            # each <p> stays its own Markdown paragraph.
            texts = (p.get_text(strip=True) for p in soup.find_all("p"))
            content = "\n\n".join(text for text in texts if text)

        content = _clean_markdown(content)

        # Only emit the pieces that exist, so a missing title does not
        # produce a bare "# " heading.
        parts = []
        if title:
            parts.append(f"# {title}")
        if content:
            parts.append(content)
        if not parts:
            return ""
        return "\n\n".join(parts) + "\n"

    except Exception as e:
        # Top-level boundary for the CLI: report the error and signal
        # failure through the empty return value.
        print(f"解析出错: {e}")
        return ""


def save_markdown(content: str, output_file: str) -> bool:
    """Write *content* to *output_file* as UTF-8.

    Args:
        content: Markdown text to write.
        output_file: Destination path; an existing file is overwritten.

    Returns:
        True when the file was written, False when writing failed (the
        error is printed).
    """
    try:
        with open(output_file, "w", encoding="utf-8") as f:
            f.write(content)
    except Exception as e:
        print(f"保存文件出错: {e}")
        return False
    print(f"Markdown文件已保存: {output_file}")
    return True


def main() -> None:
    """Command-line entry point: convert the HTML file named in argv[1].

    Exits with status 1 on a usage error, a missing input file, or a
    parse/save failure.
    """
    if len(sys.argv) != 2:
        print("使用方法: python parse_blog.py <html_file>")
        sys.exit(1)

    html_file = sys.argv[1]
    if not os.path.exists(html_file):
        print(f"文件不存在: {html_file}")
        sys.exit(1)

    # The output sits next to the input, with the extension swapped for .md.
    base_name = os.path.splitext(html_file)[0]
    output_file = f"{base_name}.md"

    print(f"正在解析: {html_file}")
    markdown_content = parse_zhihu_blog(html_file)

    if not markdown_content:
        print("解析失败,无法生成Markdown文件")
        sys.exit(1)

    if not save_markdown(markdown_content, output_file):
        sys.exit(1)


if __name__ == "__main__":
    main()