import os
import datetime
import re
import base64
import shutil
import requests
import random
from urllib.parse import unquote, urlparse
from io import BytesIO

# Third-party libraries
import pyperclip
from PIL import ImageGrab, Image
import win32clipboard
from bs4 import BeautifulSoup
from markdownify import markdownify as md

# Configuration
PROJECT_ROOT = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
MARKDOWN_DIR = os.path.join(PROJECT_ROOT, 'notebook', 'markdowns')
IMAGES_DIR = os.path.join(PROJECT_ROOT, 'notebook', 'images')


def sanitize_filename(name):
    """Sanitize a string so it can be used as a filename.

    Keeps only alphanumerics, spaces, hyphens and underscores, strips
    surrounding whitespace, and truncates to 100 characters.
    """
    safe_name = "".join(c for c in name if c.isalnum() or c in (' ', '-', '_')).strip()
    return safe_name[:100]


def get_html_from_clipboard():
    """Extract the "HTML Format" (CF_HTML) payload from the Windows clipboard.

    Returns the HTML as a string, or None when no HTML data is available
    or reading fails.
    """
    try:
        win32clipboard.OpenClipboard()
    except Exception as e:
        print(f"Error reading clipboard HTML: {e}")
        return None
    try:
        html_format = win32clipboard.RegisterClipboardFormat("HTML Format")
        if not win32clipboard.IsClipboardFormatAvailable(html_format):
            return None
        raw_data = win32clipboard.GetClipboardData(html_format)
    except Exception as e:
        print(f"Error reading clipboard HTML: {e}")
        return None
    finally:
        # Always release the clipboard, even on error paths.
        try:
            win32clipboard.CloseClipboard()
        except Exception:
            pass

    # The CF_HTML payload begins with a text header, e.g.:
    #   Version:0.9
    #   StartHTML:00000097
    #   EndHTML:00000170
    #   StartFragment:00000133
    #   EndFragment:00000134
    # StartHTML/EndHTML are BYTE offsets into the raw payload, so the
    # slicing must happen on bytes *before* decoding: slicing the decoded
    # str (as the original code did) shifts the offsets whenever the
    # content contains multi-byte UTF-8 characters.
    if isinstance(raw_data, str):
        raw_data = raw_data.encode('utf-8')
    start_html = re.search(rb'StartHTML:(\d+)', raw_data)
    end_html = re.search(rb'EndHTML:(\d+)', raw_data)
    if start_html and end_html:
        payload = raw_data[int(start_html.group(1)):int(end_html.group(1))]
    else:
        payload = raw_data  # Fallback to the full payload if parsing fails
    try:
        return payload.decode('utf-8')
    except UnicodeDecodeError:
        return payload.decode('cp1252', errors='ignore')


def _save_base64_image(src, timestamp):
    """Decode a data:image/...;base64 URI into IMAGES_DIR.

    Returns the new bare filename, or None if the URI doesn't match the
    expected shape. May raise on malformed base64 / I/O errors — the
    caller handles that.
    """
    # data:image/png;base64,xxxx
    match = re.match(r'data:image/(\w+);base64,(.+)', src)
    if not match:
        return None
    ext = match.group(1)
    if ext == 'jpeg':
        ext = 'jpg'
    img_data = base64.b64decode(match.group(2))
    new_filename = f"paste_img_{timestamp}_{random.randint(1000,9999)}.{ext}"
    with open(os.path.join(IMAGES_DIR, new_filename), 'wb') as f:
        f.write(img_data)
    print(f" - Saved Base64 image: {new_filename}")
    return new_filename


def _copy_local_image(src, timestamp):
    """Copy a file:// image into IMAGES_DIR.

    Returns the new bare filename, or None if the path doesn't exist.
    May raise on copy errors — the caller handles that.
    """
    # Strip the file:// prefix and decode URL-encoded characters
    local_path = unquote(src[7:])
    # On Windows: file:///C:/... -> /C:/... -> C:/...
    if local_path.startswith('/') and ':' in local_path:
        local_path = local_path[1:]
    if not os.path.exists(local_path):
        return None
    ext = os.path.splitext(local_path)[1] or '.png'
    new_filename = f"paste_img_{timestamp}_{random.randint(1000,9999)}{ext}"
    shutil.copy2(local_path, os.path.join(IMAGES_DIR, new_filename))
    print(f" - Copied local image: {new_filename}")
    return new_filename


def _download_remote_image(src, timestamp):
    """Download an http(s) image into IMAGES_DIR so notes work offline.

    Returns the new bare filename, or None when the response isn't a
    successful image. May raise on network/I/O errors — the caller
    handles that.
    """
    response = requests.get(src, timeout=5)
    content_type = response.headers.get('content-type', '')
    if response.status_code != 200 or 'image' not in content_type:
        return None
    if 'png' in content_type:
        ext = '.png'
    elif 'gif' in content_type:
        ext = '.gif'
    else:
        ext = '.jpg'  # default
    new_filename = f"paste_img_{timestamp}_{random.randint(1000,9999)}{ext}"
    with open(os.path.join(IMAGES_DIR, new_filename), 'wb') as f:
        f.write(response.content)
    print(f" - Downloaded remote image: {new_filename}")
    return new_filename


def process_html_images(html_content, timestamp):
    """Find images in HTML, save them locally, and update each src.

    Handles three source kinds: data: URIs (base64), file:// paths, and
    http(s) URLs. Saved images land in IMAGES_DIR and each <img> src is
    rewritten to the bare local filename. Returns the modified HTML.
    """
    soup = BeautifulSoup(html_content, 'html.parser')
    for img in soup.find_all('img'):
        src = img.get('src')
        if not src:
            continue
        new_filename = None
        # Case 1: Base64 Image
        if src.startswith('data:image'):
            try:
                new_filename = _save_base64_image(src, timestamp)
            except Exception as e:
                print(f" - Failed to process base64 image: {e}")
        # Case 2: Local File (file://)
        elif src.startswith('file://'):
            try:
                new_filename = _copy_local_image(src, timestamp)
            except Exception as e:
                print(f" - Failed to copy local image: {e}")
        # Case 3: Remote URL — download to make the note truly local/offline
        elif src.startswith('http'):
            try:
                new_filename = _download_remote_image(src, timestamp)
            except Exception as e:
                print(f" - Failed to download remote image: {e}")
        # Update src in HTML if we saved a file (bare filename as requested)
        if new_filename:
            img['src'] = new_filename
    return str(soup)


def extract_title_from_markdown(md_text):
    """Return a sanitized title from the first '# ' H1 header found in the
    first 10 lines, or None if there isn't one."""
    for line in md_text.strip().split('\n')[:10]:
        if line.strip().startswith('# '):
            return sanitize_filename(line.strip()[2:])
    return None


def _unique_markdown_path(title, timestamp):
    """Build a markdown file path from title (falling back to timestamp),
    appending a suffix once to avoid overwriting an existing note."""
    markdown_filename = f"{title}.md" if title else f"Note_{timestamp}.md"
    markdown_path = os.path.join(MARKDOWN_DIR, markdown_filename)
    if os.path.exists(markdown_path):
        markdown_filename = f"{title}_{timestamp}.md" if title else f"Note_{timestamp}_1.md"
        markdown_path = os.path.join(MARKDOWN_DIR, markdown_filename)
    return markdown_path


def save_clipboard_content():
    """Save the current clipboard contents as a markdown note.

    Priority: rich text (HTML) > bitmap image > plain text. Images are
    written to IMAGES_DIR; notes to MARKDOWN_DIR.
    """
    # Ensure directories exist
    os.makedirs(MARKDOWN_DIR, exist_ok=True)
    os.makedirs(IMAGES_DIR, exist_ok=True)
    timestamp = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

    # Priority 1: HTML (rich text) — covers most "copy from note app" cases
    html_content = get_html_from_clipboard()
    if html_content:
        print("Detected HTML/Rich Text in clipboard...")
        # 1. Process images in the HTML (save to disk, rewrite src)
        processed_html = process_html_images(html_content, timestamp)
        # 2. Convert to Markdown; heading_style='atx' gives '# Header'
        #    instead of underlined headers
        md_text = md(processed_html, heading_style='atx')
        # 3. Collapse the runs of 3+ newlines conversion often introduces
        md_text = re.sub(r'\n{3,}', '\n\n', md_text).strip()
        # 4/5. Determine filename and save
        title = extract_title_from_markdown(md_text)
        markdown_path = _unique_markdown_path(title, timestamp)
        with open(markdown_path, 'w', encoding='utf-8') as f:
            f.write(md_text)
        print(f"Saved Rich Text Note to: {markdown_path}")
        return

    # Priority 2: bitmap image (direct screenshot copy)
    try:
        image = ImageGrab.grabclipboard()
        # grabclipboard() returns a list of paths for copied files; only
        # handle actual in-memory image data here
        if image and not isinstance(image, list):
            print("Detected BITMAP image in clipboard...")
            image_filename = f"clip_image_{timestamp}.png"
            image_path = os.path.join(IMAGES_DIR, image_filename)
            image.save(image_path, 'PNG')
            markdown_path = os.path.join(MARKDOWN_DIR, f"Image_{timestamp}.md")
            # Use bare filename as requested
            content = f"# Clipboard Image {timestamp}\n\n![{image_filename}]({image_filename})"
            with open(markdown_path, 'w', encoding='utf-8') as f:
                f.write(content)
            print(f"Saved image to: {image_path}")
            print(f"Saved markdown to: {markdown_path}")
            return
    except Exception as e:
        print(f"Error checking bitmap: {e}")

    # Priority 3: fallback to plain text
    text = pyperclip.paste()
    if text:
        print("Detected PLAIN TEXT in clipboard...")
        title = extract_title_from_markdown(text)
        markdown_path = _unique_markdown_path(title, timestamp)
        with open(markdown_path, 'w', encoding='utf-8') as f:
            f.write(text)
        print(f"Saved Text Note to: {markdown_path}")
    else:
        print("Clipboard is empty.")


if __name__ == "__main__":
    save_clipboard_content()