#!/usr/bin/env python3
"""
Remove duplicate articles from Weaviate
Keeps only the oldest article (by published_date) for each inspection_id
"""

import weaviate
from datetime import datetime

def remove_duplicates():
    """Remove duplicate articles from the "Articles" collection.

    Groups every article by its ``inspection_id``; for each group with more
    than one article, keeps the oldest one (earliest ``published_date``) and
    deletes the rest. Prints what was kept/deleted and the remaining count.

    Side effects:
        Connects to a local Weaviate instance, deletes objects, prints a
        summary, and always closes the client connection.
    """
    # Connect to the local Weaviate instance (default HTTP port).
    client = weaviate.connect_to_local(host="localhost", port=8080)

    try:
        collection = client.collections.get("Articles")

        print("Fetching all articles...")

        # Group articles by inspection_id. Use the collection iterator so
        # ALL objects are paged through — a fixed fetch_objects(limit=100)
        # call would silently miss duplicates beyond the first page.
        articles_by_inspection = {}
        for obj in collection.iterator(
            return_properties=["title", "inspection_id", "slug", "published_date"]
        ):
            inspection_id = obj.properties.get('inspection_id')
            if not inspection_id:
                # Articles without an inspection_id cannot be deduplicated.
                continue
            articles_by_inspection.setdefault(inspection_id, []).append({
                'uuid': obj.uuid,
                'title': obj.properties.get('title'),
                'published_date': obj.properties.get('published_date'),
                'slug': obj.properties.get('slug'),
            })

        # For each inspection with duplicates, keep the oldest article.
        total_deleted = 0
        for inspection_id, articles in articles_by_inspection.items():
            if len(articles) <= 1:
                continue
            print(f"\nInspection {inspection_id} has {len(articles)} articles:")

            # Sort oldest first. Articles with a missing published_date sort
            # last so they are deleted rather than kept; a bare sort on the
            # raw value would raise TypeError if any date is None.
            articles.sort(key=lambda a: (a['published_date'] is None, a['published_date']))

            keep = articles[0]
            print(f"  ✅ Keeping: {keep['title']} ({keep['published_date']})")

            for article in articles[1:]:
                print(f"  ❌ Deleting: {article['title']} ({article['published_date']})")
                collection.data.delete_by_id(article['uuid'])
                total_deleted += 1

        print(f"\n✅ Deleted {total_deleted} duplicate articles")

        # Report how many articles survive the cleanup.
        remaining = collection.aggregate.over_all(total=True)
        print(f"📊 Total articles remaining: {remaining.total_count}")

    finally:
        # Always release the connection, even if a query/delete fails.
        client.close()

if __name__ == "__main__":
    # Run the cleanup only when executed as a script, not when imported.
    remove_duplicates()