#!/usr/bin/env python3
"""
Automatically remove duplicate articles from Weaviate, keeping the oldest version
"""

import weaviate
import os
from collections import defaultdict
from dotenv import load_dotenv

load_dotenv('/home/chris/.env')

# Connect to the local Weaviate instance and open the "Articles" collection
client = weaviate.connect_to_local()
articles = client.collections.get("Articles")

print("🔍 Checking for duplicate articles...")

# Get all articles.
# The collection iterator walks the whole collection with Weaviate's cursor
# API. Offset-based paging (fetch_objects with limit/offset) is capped by the
# server's maximum-results window, so it fails on large collections.
all_articles = []
batch_size = 100  # progress is reported once per `batch_size` fetched objects

for fetched in articles.iterator():
    all_articles.append(fetched)
    if len(all_articles) % batch_size == 0:
        print(f"  Fetched {len(all_articles)} articles...")

# Report the final, partially filled batch (if any)
if len(all_articles) % batch_size:
    print(f"  Fetched {len(all_articles)} articles...")

print(f"\n📊 Total articles: {len(all_articles)}")

# Group the fetched articles by slug; articles with a missing or empty
# slug cannot be matched against anything and are left out.
slug_map = defaultdict(list)
for entry in all_articles:
    entry_slug = entry.properties.get('slug', '')
    if not entry_slug:
        continue
    slug_map[entry_slug].append(entry)

# Count and remove duplicates
duplicates_found = 0      # number of surplus copies detected
removed_count = 0         # number of surplus copies actually deleted
duplicate_details = []    # one human-readable line per duplicated slug


def _published_sort_key(article):
    """Return a key that orders articles oldest-first by published_date.

    Articles whose published_date is missing or empty sort first, which
    matches the earlier behaviour of defaulting to an empty string. The
    value is wrapped in a tuple so a missing date is never compared directly
    with a real one; such a comparison raises TypeError when the stored
    dates are not plain strings (e.g. datetime objects).
    """
    published = article.properties.get('published_date')
    if not published:
        return (0,)
    return (1, published)


for slug, articles_list in slug_map.items():
    if len(articles_list) > 1:
        duplicates_found += len(articles_list) - 1
        duplicate_details.append(f"  '{slug}': {len(articles_list)} copies")

        # Sort by published_date to keep oldest
        sorted_articles = sorted(articles_list, key=_published_sort_key)

        # Keep first, delete rest. Deletion is best-effort per object: a
        # failure is reported and the remaining duplicates are still tried.
        for article in sorted_articles[1:]:
            try:
                articles.data.delete_by_id(article.uuid)
                removed_count += 1
                print(f"  ✓ Removed duplicate: {slug}")
            except Exception as e:
                print(f"  ✗ Error removing {slug}: {e}")

# Summary: list at most the first `max_shown` duplicated slugs, then
# note how many further entries were left out of the listing.
max_shown = 10
hidden = len(duplicate_details) - max_shown

print(f"\n📋 Duplicate Summary:")
for line in duplicate_details[:max_shown]:
    print(line)
if hidden > 0:
    print(f"  ... and {hidden} more")

# Totals for this run
for line in (
    f"\n✅ Results:",
    f"  - Found {duplicates_found} duplicate articles",
    f"  - Removed {removed_count} duplicates",
    f"  - Kept the oldest version of each article",
):
    print(line)

# Verify final count. This is best-effort: a failure is reported instead of
# being silently discarded, and the connection is closed either way.
try:
    remaining_articles = articles.aggregate.over_all().total_count
except Exception as e:
    print(f"  - Could not determine remaining article count: {e}")
else:
    print(f"  - Articles remaining: {remaining_articles}")
finally:
    client.close()

print("\n✅ Duplicate removal complete!")