#!/usr/bin/env python3
"""导入 OpenClaw memory_conversations 到本地向量数据库"""

import json
import hashlib
from pathlib import Path
from datetime import datetime

# 复用 local_memory.py 的函数
from local_memory import get_client, get_collection, get_embedding, generate_id

def chunk_text(text: str, max_len: int = 400) -> list:
    """Split *text* into consecutive pieces of at most *max_len* characters.

    The split is purely positional: no attempt is made to break on word,
    sentence or line boundaries.

    Args:
        text: The string to split.
        max_len: Maximum number of characters per piece. Must be positive.

    Returns:
        A list of substrings whose concatenation equals *text*. Text that
        already fits (including the empty string) is returned as a
        single-element list.

    Raises:
        ValueError: If *max_len* is not positive. Without this guard a
            non-positive size would never consume the text and the loop
            would never terminate.
    """
    if max_len <= 0:
        raise ValueError(f"max_len must be a positive integer, got {max_len!r}")
    if len(text) <= max_len:
        return [text]
    # Slice by offset instead of repeatedly re-slicing the remainder, which
    # would copy the tail of the string on every iteration.
    return [text[start:start + max_len] for start in range(0, len(text), max_len)]

# Directory (under the current user's home) whose *.jsonl files are read by
# import_conversations().
CONVERSATIONS_DIR = Path.home() / ".hermes/openclaw_sessions_backup/memory_conversations"

def _strip_to_prefix(content: str) -> str:
    """Drop the leading ``to=...`` line from *content*, if there is one.

    Content that starts with ``to=`` but contains no newline is returned
    unchanged; otherwise everything after the first line is returned with
    surrounding whitespace stripped.
    """
    if not content.startswith('to='):
        return content
    _first_line, newline, rest = content.partition('\n')
    return rest.strip() if newline else content


def _make_chunk_id(file_name: str, line_num: int, index: int, chunk: str) -> str:
    """Derive a deterministic 12-hex-character ID for one chunk.

    The ID depends on the source file name, the 1-based line number, the
    chunk index within that line and the first 50 characters of the chunk,
    so re-running the import yields the same IDs for unchanged input.
    """
    key = f"{file_name}:{line_num}:{index}:{chunk[:50]}"
    return hashlib.md5(key.encode()).hexdigest()[:12]


def import_conversations() -> None:
    """Import every ``*.jsonl`` file in ``CONVERSATIONS_DIR`` into the collection.

    Each non-blank line is parsed as one JSON record. Records whose
    ``content`` is missing or shorter than 20 characters are ignored. The
    remaining content has an optional ``to=`` header line removed, is split
    into chunks of at most 400 characters, and every chunk of at least 20
    characters is embedded and added to the collection together with its
    source file, ``role`` and ``recordedAt`` values.

    Chunks whose ID is already present in the collection are counted as
    skipped, so the import can be re-run. A failure while handling one line
    is counted (the first few are printed) and processing continues with
    the next line. A summary is printed at the end.

    If ``CONVERSATIONS_DIR`` does not exist, a message is printed and nothing
    else is done.
    """
    # Report a missing source directory explicitly instead of silently
    # finishing with "imported 0".
    if not CONVERSATIONS_DIR.is_dir():
        print(f"目录不存在: {CONVERSATIONS_DIR}")
        return

    # NOTE(review): get_client / get_collection / get_embedding come from
    # local_memory; the collection is used here via .get(ids=...), .add(...)
    # and .count() -- presumably a ChromaDB-style collection, confirm there.
    client = get_client()
    collection = get_collection(client)

    imported = 0
    skipped = 0
    errors = 0

    for jsonl_file in sorted(CONVERSATIONS_DIR.glob("*.jsonl")):
        print(f"\n处理 {jsonl_file.name}...")

        with open(jsonl_file, 'r', encoding='utf-8') as f:
            for line_num, line in enumerate(f, 1):
                line = line.strip()
                # Blank lines are not records; skip them rather than
                # letting json.loads fail and inflate the error count.
                if not line:
                    continue

                try:
                    data = json.loads(line)

                    # Extract the message text.
                    content = data.get('content', '')
                    if not content or len(content) < 20:
                        continue

                    # Remove the "to=" header line, if any.
                    content = _strip_to_prefix(content)

                    role = data.get('role', 'unknown')
                    timestamp = data.get('recordedAt', '')

                    # Split long content into chunks.
                    for i, chunk in enumerate(chunk_text(content, max_len=400)):
                        if len(chunk) < 20:
                            continue

                        chunk_id = _make_chunk_id(jsonl_file.name, line_num, i, chunk)

                        # Skip chunks that are already stored.
                        existing = collection.get(ids=[chunk_id])
                        if existing and existing['ids']:
                            skipped += 1
                            continue

                        # A falsy embedding is treated as a failure for
                        # this chunk only.
                        embedding = get_embedding(chunk)
                        if not embedding:
                            errors += 1
                            continue

                        collection.add(
                            ids=[chunk_id],
                            embeddings=[embedding],
                            documents=[chunk],
                            metadatas=[{
                                "source": "openclaw_memory_conversations",
                                "file": jsonl_file.name,
                                "role": role,
                                "timestamp": timestamp,
                                "created_at": datetime.now().isoformat()
                            }]
                        )
                        imported += 1

                        # Progress report every 50 imported chunks.
                        if imported % 50 == 0:
                            print(f"  已导入 {imported} 条...")

                except Exception as e:
                    # Per-line boundary: one bad record must not abort the
                    # whole import. Only the first few errors are printed
                    # to keep the output readable.
                    errors += 1
                    if errors < 10:
                        print(f"  错误 {jsonl_file.name}:{line_num}: {e}")

    print(f"\n完成！导入 {imported} 条，跳过 {skipped} 条，错误 {errors} 条")
    print(f"数据库总记忆数: {collection.count()}")

# Run the import only when this file is executed as a script, not on import.
if __name__ == "__main__":
    import_conversations()
