Files
XCDesktop/tools/clipboard/save_from_clipboard.py
2026-03-08 01:34:54 +08:00

251 lines
9.7 KiB
Python

import os
import datetime
import re
import base64
import shutil
import requests
import random
from urllib.parse import unquote, urlparse
from io import BytesIO
# Third-party libraries
import pyperclip
from PIL import ImageGrab, Image
import win32clipboard
from bs4 import BeautifulSoup
from markdownify import markdownify as md
# Configuration
# PROJECT_ROOT is obtained by applying dirname() three times to this file's
# absolute path, i.e. the directory two levels above the one holding this file.
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Directory that receives the generated markdown notes.
MARKDOWN_DIR = os.path.join(PROJECT_ROOT, 'notebook', 'markdowns')
# Directory that receives images saved, copied or downloaded while capturing.
IMAGES_DIR = os.path.join(PROJECT_ROOT, 'notebook', 'images')
def sanitize_filename(name):
    """Sanitize a string so it can be used as a file name.

    Keeps only alphanumeric characters, spaces, hyphens and underscores,
    trims surrounding whitespace and limits the result to 100 characters.

    Args:
        name: Arbitrary text (for example a note title).

    Returns:
        The cleaned name, at most 100 characters long and never starting or
        ending with whitespace. May be an empty string when nothing in
        ``name`` is allowed.
    """
    allowed_punctuation = (' ', '-', '_')
    safe_name = "".join(
        c for c in name if c.isalnum() or c in allowed_punctuation
    ).strip()
    # Strip again after truncating: cutting at 100 characters can expose a
    # space at the end, and Windows file names must not end with a space.
    return safe_name[:100].strip()
def _decode_cf_html(raw_data):
    """Cut the HTML document out of a raw "HTML Format" clipboard payload.

    The payload starts with an ASCII header such as::

        Version:0.9
        StartHTML:00000097
        EndHTML:00000170
        StartFragment:00000133
        EndFragment:00000134

    StartHTML/EndHTML are *byte* offsets into the payload, so the slice is
    taken on the raw bytes before decoding. Slicing the decoded text instead
    would drift as soon as the content contains multi-byte UTF-8 characters.

    Args:
        raw_data: The clipboard payload as bytes.

    Returns:
        The decoded HTML between StartHTML and EndHTML, or the whole decoded
        payload when the header offsets cannot be found.
    """
    start_match = re.search(rb'StartHTML:(\d+)', raw_data)
    end_match = re.search(rb'EndHTML:(\d+)', raw_data)
    if start_match and end_match:
        payload = raw_data[int(start_match.group(1)):int(end_match.group(1))]
    else:
        # Fallback to the full payload if the header cannot be parsed.
        payload = raw_data
    try:
        return payload.decode('utf-8')
    except UnicodeDecodeError:
        return payload.decode('cp1252', errors='ignore')


def get_html_from_clipboard():
    """Extract the HTML format from the Windows clipboard.

    Returns:
        The HTML text found on the clipboard, or None when the clipboard
        holds no "HTML Format" data or when reading it fails (the error is
        printed, not raised).
    """
    opened = False
    try:
        win32clipboard.OpenClipboard()
        opened = True
        # Register/Get HTML Format ID
        html_format = win32clipboard.RegisterClipboardFormat("HTML Format")
        if not win32clipboard.IsClipboardFormatAvailable(html_format):
            return None
        raw_data = win32clipboard.GetClipboardData(html_format)
        return _decode_cf_html(raw_data)
    except Exception as e:
        print(f"Error reading clipboard HTML: {e}")
        return None
    finally:
        # Always release the clipboard once it was opened, on every path.
        if opened:
            try:
                win32clipboard.CloseClipboard()
            except Exception:
                # Best effort: a failed close must not mask the result above.
                pass
def _new_image_name(timestamp, ext):
    """Build a 'paste_img_<timestamp>_<4 random digits><ext>' file name."""
    return f"paste_img_{timestamp}_{random.randint(1000,9999)}{ext}"


def _save_base64_image(src, timestamp):
    """Decode a base64 ``data:image/...`` URI into IMAGES_DIR.

    Returns the new file name, or None when ``src`` is not a base64 image
    URI. Decoding or write errors propagate to the caller.
    """
    # data:image/png;base64,xxxx
    # DOTALL lets the payload span line breaks (wrapped base64); the optional
    # "+suffix" accepts subtypes such as svg+xml.
    match = re.match(r'data:image/(\w+)(?:\+\w+)?;base64,(.+)', src, re.DOTALL)
    if not match:
        return None
    ext = 'jpg' if match.group(1) == 'jpeg' else match.group(1)
    img_data = base64.b64decode(match.group(2))
    filename = _new_image_name(timestamp, f".{ext}")
    with open(os.path.join(IMAGES_DIR, filename), 'wb') as f:
        f.write(img_data)
    return filename


def _copy_local_image(src, timestamp):
    """Copy the file behind a ``file://`` URL into IMAGES_DIR.

    Returns the new file name, or None when the path does not exist.
    Copy errors propagate to the caller.
    """
    # Remove file:// prefix and decode URL encoded chars
    local_path = unquote(src[7:])
    # On Windows, it might be file:///C:/... -> /C:/... -> C:/...
    if local_path.startswith('/') and ':' in local_path:
        local_path = local_path[1:]
    if not os.path.exists(local_path):
        return None
    ext = os.path.splitext(local_path)[1] or '.png'
    filename = _new_image_name(timestamp, ext)
    shutil.copy2(local_path, os.path.join(IMAGES_DIR, filename))
    return filename


def _download_remote_image(src, timestamp):
    """Download an http(s) image into IMAGES_DIR.

    Returns the new file name, or None when the response is not a 200 with
    an image content type. Network or write errors propagate to the caller.
    """
    response = requests.get(src, timeout=5)
    content_type = response.headers.get('content-type', '').lower()
    if response.status_code != 200 or 'image' not in content_type:
        return None
    ext = '.jpg'  # default
    for marker, candidate in (('png', '.png'), ('gif', '.gif'),
                              ('webp', '.webp'), ('svg', '.svg')):
        if marker in content_type:
            ext = candidate
            break
    filename = _new_image_name(timestamp, ext)
    with open(os.path.join(IMAGES_DIR, filename), 'wb') as f:
        f.write(response.content)
    return filename


def process_html_images(html_content, timestamp):
    """Find images in HTML, save them locally, and update their ``src``.

    Handles three kinds of ``<img src>``: base64 data URIs, ``file://``
    paths and ``http``/``https`` URLs. An image's ``src`` is rewritten to the
    bare saved file name only after the file was actually written; on any
    failure the original ``src`` is kept and a message is printed.

    Args:
        html_content: HTML markup to scan.
        timestamp: Text embedded in the generated image file names.

    Returns:
        The HTML serialized back to a string, with updated ``src`` values.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            continue
        # Stays None unless a helper returns, i.e. unless the file was saved.
        new_filename = None
        # Case 1: Base64 Image
        if src.startswith('data:image'):
            try:
                new_filename = _save_base64_image(src, timestamp)
                if new_filename:
                    print(f" - Saved Base64 image: {new_filename}")
            except Exception as e:
                print(f" - Failed to process base64 image: {e}")
        # Case 2: Local File (file://)
        elif src.startswith('file://'):
            try:
                new_filename = _copy_local_image(src, timestamp)
                if new_filename:
                    print(f" - Copied local image: {new_filename}")
            except Exception as e:
                print(f" - Failed to copy local image: {e}")
        # Case 3: Remote URL (http/https) - downloaded so the note works offline
        elif src.startswith('http'):
            try:
                new_filename = _download_remote_image(src, timestamp)
                if new_filename:
                    print(f" - Downloaded remote image: {new_filename}")
            except Exception as e:
                print(f" - Failed to download remote image: {e}")
        # Update src in HTML if we saved a file
        if new_filename:
            # We use bare filename as requested
            img['src'] = new_filename
    return str(soup)
def extract_title_from_markdown(md_text):
    """Return a file-name-safe title taken from the first H1 header.

    Only the first ten lines of the stripped text are inspected. A line
    counts as an H1 header when, after stripping, it starts with ``'# '``.

    Returns:
        The sanitized header text, or None when no such line is found.
    """
    leading_lines = md_text.strip().split('\n')[:10]
    stripped_lines = (raw.strip() for raw in leading_lines)
    heading = next(
        (text for text in stripped_lines if text.startswith('# ')),
        None,
    )
    if heading is None:
        return None
    # Drop the leading "# " marker before sanitizing.
    return sanitize_filename(heading[2:])
def _pick_markdown_path(title, timestamp):
    """Return a path in MARKDOWN_DIR that does not exist yet.

    Names are tried in this order:
      * titled:   ``<title>.md``, ``<title>_<timestamp>.md``,
                  ``<title>_<timestamp>_1.md``, ``..._2.md``, ...
      * untitled: ``Note_<timestamp>.md``, ``Note_<timestamp>_1.md``,
                  ``Note_<timestamp>_2.md``, ...

    Every candidate is checked, so an existing note is never overwritten.
    """
    if title:
        first_choices = [f"{title}.md", f"{title}_{timestamp}.md"]
        stem = f"{title}_{timestamp}"
    else:
        first_choices = [f"Note_{timestamp}.md"]
        stem = f"Note_{timestamp}"
    for name in first_choices:
        candidate = os.path.join(MARKDOWN_DIR, name)
        if not os.path.exists(candidate):
            return candidate
    counter = 1
    while True:
        candidate = os.path.join(MARKDOWN_DIR, f"{stem}_{counter}.md")
        if not os.path.exists(candidate):
            return candidate
        counter += 1


def save_clipboard_content():
    """Save the current clipboard content as a markdown note.

    Sources are tried in priority order and the first one found wins:
      1. HTML / rich text: images are saved locally, then the HTML is
         converted to markdown.
      2. Bitmap image: saved as PNG plus a markdown file that references it.
      3. Plain text: written as is.

    Notes go to MARKDOWN_DIR and images to IMAGES_DIR; both directories are
    created when missing. Progress is reported with print(). Returns None.
    """
    # Ensure directories exist
    os.makedirs(MARKDOWN_DIR, exist_ok=True)
    os.makedirs(IMAGES_DIR, exist_ok=True)
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

    # Priority 1: Check for HTML (Rich Text)
    # This covers most "Copy from Note App" scenarios
    html_content = get_html_from_clipboard()
    if html_content:
        print("Detected HTML/Rich Text in clipboard...")
        # 1. Process Images in HTML (Save to disk)
        processed_html = process_html_images(html_content, timestamp)
        # 2. Convert to Markdown
        # heading_style='atx' ensures # Header style instead of underlines
        md_text = md(processed_html, heading_style='atx')
        # 3. Clean up extra newlines often introduced by conversion
        md_text = re.sub(r'\n{3,}', '\n\n', md_text).strip()
        # 4. Determine a collision-free file name and save
        title = extract_title_from_markdown(md_text)
        markdown_path = _pick_markdown_path(title, timestamp)
        with open(markdown_path, 'w', encoding='utf-8') as f:
            f.write(md_text)
        print(f"Saved Rich Text Note to: {markdown_path}")
        return

    # Priority 2: Check for Bitmap Image (Direct Screenshot Copy)
    try:
        image = ImageGrab.grabclipboard()
        # A list result is not an image object, so it is skipped here.
        if image and not isinstance(image, list):
            print("Detected BITMAP image in clipboard...")
            image_filename = f"clip_image_{timestamp}.png"
            image_path = os.path.join(IMAGES_DIR, image_filename)
            image.save(image_path, 'PNG')
            markdown_filename = f"Image_{timestamp}.md"
            markdown_path = os.path.join(MARKDOWN_DIR, markdown_filename)
            # Use bare filename as requested
            content = f"# Clipboard Image {timestamp}\n\n![{image_filename}]({image_filename})"
            with open(markdown_path, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"Saved image to: {image_path}")
            print(f"Saved markdown to: {markdown_path}")
            return
    except Exception as e:
        # Best effort: report and fall through to the plain-text path.
        print(f"Error checking bitmap: {e}")

    # Priority 3: Fallback to Plain Text
    text = pyperclip.paste()
    if not text:
        print("Clipboard is empty.")
        return
    print("Detected PLAIN TEXT in clipboard...")
    title = extract_title_from_markdown(text)
    markdown_path = _pick_markdown_path(title, timestamp)
    with open(markdown_path, 'w', encoding='utf-8') as f:
        f.write(text)
    print(f"Saved Text Note to: {markdown_path}")
# Script entry point: capture the clipboard only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    save_clipboard_content()