#!/usr/bin/env python3
"""
Clean up duplicate articles in the Weaviate database
"""

import weaviate
import os
from dotenv import load_dotenv

# Load environment variables from a hard-coded, user-specific .env file as an
# import-time side effect.
# NOTE(review): nothing in the visible code reads these variables directly —
# confirm whether the Weaviate connection depends on them.
load_dotenv('/home/chris/.env')

def _published_date_sort_key(article):
    """Return a sort key ordering articles by date, with missing dates last."""
    published = article['published_date']
    # The leading flag keeps None from being compared against real dates
    # (which raises TypeError) and makes dated articles win over undated ones.
    return (published is None, published)


def cleanup_duplicate_articles():
    """Remove duplicate articles, keeping the earliest-published one per slug.

    Connects to a local Weaviate instance, groups every object of the
    "Articles" collection by its ``slug`` property and, for each slug that
    occurs more than once, deletes all but the article with the earliest
    ``published_date``. Articles with a missing date sort last, so a dated
    article is preferred as the one to keep. Articles without a slug are
    never deleted.

    Deletion errors are reported and skipped so one failure does not abort
    the rest of the cleanup. The client connection is always closed, even
    when an error interrupts the run.

    Returns:
        int: Number of delete calls that completed without raising.
    """
    client = weaviate.connect_to_local()
    try:
        articles = client.collections.get("Articles")

        # Walk the whole collection with the cursor-based iterator; a single
        # fetch_objects(limit=1000) page would miss duplicates beyond the
        # first 1000 objects.
        all_articles = list(articles.iterator())

        print(f"📊 Found {len(all_articles)} total articles")

        # Group by slug
        slug_groups = {}
        missing_slug = 0
        for article_obj in all_articles:
            props = article_obj.properties
            slug = props.get('slug')

            if not slug:
                # Without a slug there is nothing to deduplicate on.
                missing_slug += 1
                continue

            slug_groups.setdefault(slug, []).append({
                'uuid': article_obj.uuid,
                'title': props.get('title'),
                'published_date': props.get('published_date')
            })

        if missing_slug:
            print(f"⚠️  Skipped {missing_slug} articles without a slug")

        # Find duplicates
        duplicates_removed = 0

        for slug, articles_list in slug_groups.items():
            if len(articles_list) <= 1:
                continue

            print(f"\n🔍 Found {len(articles_list)} duplicates for slug: {slug}")

            # Sort by published_date to keep the earliest one
            articles_list.sort(key=_published_date_sort_key)

            # Keep the first, delete the rest
            keep_article, *duplicates = articles_list

            print(f"✅ Keeping: {keep_article['title']} ({keep_article['published_date']})")

            for duplicate in duplicates:
                try:
                    articles.data.delete_by_id(duplicate['uuid'])
                except Exception as e:
                    # Best effort: report and carry on with the remaining
                    # duplicates.
                    print(f"❌ Error deleting {duplicate['uuid']}: {e}")
                else:
                    print(f"🗑️  Deleted: {duplicate['title']} ({duplicate['published_date']})")
                    duplicates_removed += 1
    finally:
        # Release the connection even if fetching or grouping failed.
        client.close()

    print("\n🎉 Cleanup complete!")
    print(f"📊 Total duplicates removed: {duplicates_removed}")
    print(f"📊 Unique articles remaining: {len(slug_groups)}")

    return duplicates_removed

# Run the cleanup only when this file is executed as a script, not on import.
if __name__ == "__main__":
    cleanup_duplicate_articles()