# XCDesktop/tools/clipboard/save_from_clipboard.py

import os
import datetime
import re
import base64
import shutil
import requests
import random
from urllib.parse import unquote, urlparse
from io import BytesIO

# Third-party libraries
import pyperclip
from PIL import ImageGrab, Image
import win32clipboard
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Configuration
# PROJECT_ROOT is the directory two levels above the folder holding this file;
# notes and their images are stored under <PROJECT_ROOT>/notebook/.
PROJECT_ROOT = os.path.abspath(
    os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir, os.pardir)
)
MARKDOWN_DIR, IMAGES_DIR = (
    os.path.join(PROJECT_ROOT, 'notebook', leaf) for leaf in ('markdowns', 'images')
)

def sanitize_filename(name):
    """Reduce *name* to characters that are safe in a filename.

    Only alphanumeric characters, spaces, hyphens and underscores are kept.
    The result is stripped of surrounding whitespace and then cut to at most
    100 characters.
    """
    def _is_allowed(ch):
        return ch.isalnum() or ch in " -_"

    cleaned = "".join(filter(_is_allowed, name)).strip()
    # Truncation happens after stripping, so a cut may leave a trailing space.
    return cleaned[:100]

def _decode_clipboard_bytes(data):
    """Decode clipboard bytes as UTF-8, falling back to cp1252 (bad bytes dropped)."""
    try:
        return data.decode('utf-8')
    except UnicodeDecodeError:
        return data.decode('cp1252', errors='ignore')


def _extract_cf_html_payload(raw_data):
    """Return the HTML portion of a raw "HTML Format" clipboard payload.

    The payload starts with a plain-text header, for example:

        Version:0.9
        StartHTML:00000097
        EndHTML:00000170
        StartFragment:00000133
        EndFragment:00000134

    StartHTML/EndHTML are offsets into the raw *bytes*, so the slice is taken
    on the bytes before decoding. Slicing the decoded string instead would
    drift as soon as the payload contains multi-byte characters.

    If either offset is missing, the whole payload is decoded and returned.
    """
    start_html = re.search(rb'StartHTML:(\d+)', raw_data)
    end_html = re.search(rb'EndHTML:(\d+)', raw_data)

    if start_html and end_html:
        start_idx = int(start_html.group(1))
        end_idx = int(end_html.group(1))
        return _decode_clipboard_bytes(raw_data[start_idx:end_idx])

    # Fallback to the full payload if the header could not be parsed
    return _decode_clipboard_bytes(raw_data)


def get_html_from_clipboard():
    """Extract HTML format from Windows Clipboard.

    Returns:
        The HTML text found in the clipboard's "HTML Format" entry, or None
        when that format is not available or reading the clipboard fails
        (the error is printed, not raised).
    """
    try:
        raw_data = None
        win32clipboard.OpenClipboard()
        try:
            # Register/Get HTML Format ID
            html_format = win32clipboard.RegisterClipboardFormat("HTML Format")
            if win32clipboard.IsClipboardFormatAvailable(html_format):
                raw_data = win32clipboard.GetClipboardData(html_format)
        finally:
            # Always release the clipboard once it has been opened, whether
            # or not reading succeeded.
            win32clipboard.CloseClipboard()

        if raw_data is None:
            return None
        return _extract_cf_html_payload(raw_data)
    except Exception as e:
        # Best-effort boundary: callers fall back to other clipboard formats.
        print(f"Error reading clipboard HTML: {e}")
        return None

# data:image/png;base64,xxxx  (DOTALL so a line-wrapped payload is captured whole)
_DATA_URI_RE = re.compile(r'data:image/(\w+);base64,(.+)', re.DOTALL)

# Substrings looked for in a (lower-cased) Content-Type header, in order,
# and the file extension each one maps to. Anything else falls back to '.jpg'.
_IMAGE_TYPE_EXTENSIONS = (
    ('png', '.png'),
    ('gif', '.gif'),
    ('webp', '.webp'),
    ('svg', '.svg'),
    ('bmp', '.bmp'),
)


def _fresh_image_name(timestamp, ext):
    """Return (filename, full path) for an image name not yet present in IMAGES_DIR."""
    for _ in range(50):
        filename = f"paste_img_{timestamp}_{random.randint(1000, 9999)}{ext}"
        dest_path = os.path.join(IMAGES_DIR, filename)
        if not os.path.exists(dest_path):
            return filename, dest_path
    # Every 4-digit attempt collided; widen the random suffix.
    filename = f"paste_img_{timestamp}_{random.randint(10000, 99999999)}{ext}"
    return filename, os.path.join(IMAGES_DIR, filename)


def _save_data_uri_image(src, timestamp):
    """Decode a base64 data URI and write it to IMAGES_DIR; return the filename or None."""
    match = _DATA_URI_RE.match(src)
    if not match:
        return None

    ext = match.group(1)
    if ext == 'jpeg':
        ext = 'jpg'
    img_data = base64.b64decode(match.group(2))

    filename, dest_path = _fresh_image_name(timestamp, f".{ext}")
    with open(dest_path, 'wb') as f:
        f.write(img_data)
    print(f"  - Saved Base64 image: {filename}")
    return filename


def _copy_local_image(src, timestamp):
    """Copy the file behind a file:// URL into IMAGES_DIR; return the filename or None."""
    # Remove file:// prefix and decode URL encoded chars
    local_path = unquote(src[7:])
    # On Windows, it might be file:///C:/... -> /C:/... -> C:/...
    if local_path.startswith('/') and ':' in local_path:
        local_path = local_path[1:]

    if not os.path.exists(local_path):
        return None

    ext = os.path.splitext(local_path)[1] or '.png'
    filename, dest_path = _fresh_image_name(timestamp, ext)
    shutil.copy2(local_path, dest_path)
    print(f"  - Copied local image: {filename}")
    return filename


def _download_remote_image(src, timestamp):
    """Download an http(s) image into IMAGES_DIR; return the filename or None."""
    response = requests.get(src, timeout=5)
    content_type = response.headers.get('content-type', '').lower()
    if response.status_code != 200 or 'image' not in content_type:
        return None

    ext = '.jpg'  # default
    for token, candidate in _IMAGE_TYPE_EXTENSIONS:
        if token in content_type:
            ext = candidate
            break

    filename, dest_path = _fresh_image_name(timestamp, ext)
    with open(dest_path, 'wb') as f:
        f.write(response.content)
    print(f"  - Downloaded remote image: {filename}")
    return filename


def process_html_images(html_content, timestamp):
    """Find images in HTML, save them locally, and update src.

    Each <img> whose src is a base64 data URI, a file:// URL or an http(s)
    URL is saved into IMAGES_DIR, and its src is replaced by the bare saved
    filename. If an image cannot be saved, a message is printed and its src
    is left untouched, so the output never points at a file that was not
    written.

    Args:
        html_content: HTML markup to scan.
        timestamp: String embedded in every generated image filename.

    Returns:
        The HTML, serialised from the parsed document, with updated src values.
    """
    soup = BeautifulSoup(html_content, 'html.parser')

    # (src prefix, handler, message printed when the handler raises)
    # Checked in order; the first matching prefix wins.
    handlers = (
        ('data:image', _save_data_uri_image, "  - Failed to process base64 image"),
        ('file://', _copy_local_image, "  - Failed to copy local image"),
        ('http', _download_remote_image, "  - Failed to download remote image"),
    )

    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            continue

        for prefix, handler, failure in handlers:
            if not src.startswith(prefix):
                continue
            try:
                new_filename = handler(src, timestamp)
            except Exception as e:
                # Best effort per image: report and keep the original src.
                print(f"{failure}: {e}")
                new_filename = None
            if new_filename:
                # We use bare filename as requested
                img['src'] = new_filename
            break

    return str(soup)

def extract_title_from_markdown(md_text):
    """Return a filename-safe title taken from the first H1 heading, if any.

    Only the first 10 lines of the stripped text are inspected. A line counts
    as an H1 when, after stripping, it starts with '# '. Returns None when no
    such line is found.
    """
    leading_lines = (raw.strip() for raw in md_text.strip().split('\n')[:10])
    heading = next((ln for ln in leading_lines if ln.startswith('# ')), None)
    if heading is None:
        return None
    # Drop the '# ' marker before sanitizing.
    return sanitize_filename(heading[2:])

def _pick_note_path(title, timestamp):
    """Return a path in MARKDOWN_DIR that does not exist yet for a new note.

    Candidates are tried in order:
      with a title:    "<title>.md", "<title>_<timestamp>.md",
                       "<title>_<timestamp>_1.md", "<title>_<timestamp>_2.md", ...
      without a title: "Note_<timestamp>.md", "Note_<timestamp>_1.md",
                       "Note_<timestamp>_2.md", ...
    """
    stem = f"{title}_{timestamp}" if title else f"Note_{timestamp}"
    first_choices = [f"{title}.md", f"{stem}.md"] if title else [f"{stem}.md"]

    for filename in first_choices:
        candidate = os.path.join(MARKDOWN_DIR, filename)
        if not os.path.exists(candidate):
            return candidate

    # Keep counting up so an existing note is never overwritten.
    counter = 1
    while True:
        candidate = os.path.join(MARKDOWN_DIR, f"{stem}_{counter}.md")
        if not os.path.exists(candidate):
            return candidate
        counter += 1


def _write_note(text, title, timestamp):
    """Write *text* to a fresh note file and return the path that was used."""
    markdown_path = _pick_note_path(title, timestamp)
    with open(markdown_path, 'w', encoding='utf-8') as f:
        f.write(text)
    return markdown_path


def save_clipboard_content():
    """Save the current clipboard contents as a Markdown note.

    Formats are tried in priority order and the first one found is saved:
      1. HTML / rich text - images are saved locally, then the HTML is
         converted to Markdown.
      2. Bitmap image - saved as PNG plus a small Markdown note embedding it.
      3. Plain text - saved as-is.

    Progress and results are printed; nothing is returned.
    """
    # Ensure directories exist
    os.makedirs(MARKDOWN_DIR, exist_ok=True)
    os.makedirs(IMAGES_DIR, exist_ok=True)

    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

    # Priority 1: Check for HTML (Rich Text)
    # This covers most "Copy from Note App" scenarios
    html_content = get_html_from_clipboard()

    if html_content:
        print("Detected HTML/Rich Text in clipboard...")

        # 1. Process Images in HTML (Save to disk)
        processed_html = process_html_images(html_content, timestamp)

        # 2. Convert to Markdown
        # heading_style='atx' ensures # Header style instead of underlines
        md_text = md(processed_html, heading_style='atx')

        # 3. Clean up extra newlines often introduced by conversion
        md_text = re.sub(r'\n{3,}', '\n\n', md_text).strip()

        # 4. Determine filename (avoiding collisions) and save
        title = extract_title_from_markdown(md_text)
        markdown_path = _write_note(md_text, title, timestamp)

        print(f"Saved Rich Text Note to: {markdown_path}")
        return

    # Priority 2: Check for Bitmap Image (Direct Screenshot Copy)
    try:
        image = ImageGrab.grabclipboard()
        # A list result is not a bitmap, so it is skipped here.
        if image and not isinstance(image, list):
            print("Detected BITMAP image in clipboard...")
            image_filename = f"clip_image_{timestamp}.png"
            image_path = os.path.join(IMAGES_DIR, image_filename)
            image.save(image_path, 'PNG')

            markdown_filename = f"Image_{timestamp}.md"
            markdown_path = os.path.join(MARKDOWN_DIR, markdown_filename)

            # Use bare filename as requested
            content = f"# Clipboard Image {timestamp}\n\n![{image_filename}]({image_filename})"

            with open(markdown_path, 'w', encoding='utf-8') as f:
                f.write(content)

            print(f"Saved image to: {image_path}")
            print(f"Saved markdown to: {markdown_path}")
            return
    except Exception as e:
        # Best effort: report and fall through to the plain-text fallback.
        print(f"Error checking bitmap: {e}")

    # Priority 3: Fallback to Plain Text
    text = pyperclip.paste()
    if not text:
        print("Clipboard is empty.")
        return

    print("Detected PLAIN TEXT in clipboard...")
    title = extract_title_from_markdown(text)
    markdown_path = _write_note(text, title, timestamp)
    print(f"Saved Text Note to: {markdown_path}")

# Script entry point: save the current clipboard contents when run directly.
if __name__ == "__main__":
    save_clipboard_content()