#!/usr/bin/env python3
"""
Check and remove duplicate articles from Weaviate
"""

import weaviate
import os
from collections import defaultdict
from dotenv import load_dotenv

def group_by_slug(objects):
    """Group objects by their ``slug`` property.

    Objects whose slug is missing or empty are skipped, so they are never
    reported or removed as duplicates.

    Args:
        objects: Iterable of objects exposing a ``properties`` mapping.

    Returns:
        A dict mapping each non-empty slug to the list of objects carrying
        it, in the order they were encountered.
    """
    groups = defaultdict(list)
    for obj in objects:
        slug = obj.properties.get('slug', '')
        if slug:
            groups[slug].append(obj)
    return groups


def published_sort_key(article):
    """Return a sort key that orders articles by ``published_date``.

    A missing, ``None`` or empty date sorts before every real date (the
    same position the previous ``''`` default gave it when dates were
    strings). The leading flag keeps undated and dated values from ever
    being compared with each other, which would raise ``TypeError`` when
    the dates are not strings.
    """
    value = article.properties.get('published_date')
    undated = value is None or value == ''
    return (not undated, 0 if undated else value)


def split_keep_and_remove(copies):
    """Split the copies of one slug into the one to keep and the rest.

    Args:
        copies: Non-empty list of articles sharing a slug.

    Returns:
        ``(keep, extras)`` where ``keep`` is the first article in
        ``published_sort_key`` order and ``extras`` are the remaining ones.
        The sort is stable, so ties keep their fetch order.
    """
    ordered = sorted(copies, key=published_sort_key)
    return ordered[0], ordered[1:]


def fetch_all_articles(collection, progress_every=100):
    """Read every object from ``collection`` into a list.

    Uses the collection's ``iterator()`` rather than ``limit``/``offset``
    paging: offset paging is rejected by the server once ``offset + limit``
    exceeds its maximum-results setting, so large collections could not be
    read completely.

    Args:
        collection: Object exposing ``iterator()`` (a Weaviate collection).
        progress_every: Print a progress line each time this many objects
            have been fetched.

    Returns:
        List of all fetched objects.
    """
    fetched = []
    for obj in collection.iterator():
        fetched.append(obj)
        if len(fetched) % progress_every == 0:
            print(f"  Fetched {len(fetched)} articles...")
    return fetched


def main():
    """Report duplicate articles (same slug) and optionally delete them."""
    load_dotenv('/home/chris/.env')

    # Connect to Weaviate
    client = weaviate.connect_to_local()
    try:
        articles = client.collections.get("Articles")

        print("🔍 Checking for duplicate articles...")

        all_articles = fetch_all_articles(articles)
        print(f"\n📊 Total articles: {len(all_articles)}")

        # Only slugs that occur more than once are of interest from here on.
        duplicate_groups = {
            slug: copies
            for slug, copies in group_by_slug(all_articles).items()
            if len(copies) > 1
        }

        duplicates_found = 0
        for slug, copies in duplicate_groups.items():
            duplicates_found += len(copies) - 1
            print(f"  Duplicate slug '{slug}': {len(copies)} copies")

        print(f"\n✅ Found {duplicates_found} duplicate articles")

        if duplicates_found > 0:
            remove = input("\nRemove duplicates (keep oldest)? (y/n): ")
            if remove.strip().lower() == 'y':
                removed_count = 0
                for slug, copies in duplicate_groups.items():
                    _, extras = split_keep_and_remove(copies)
                    for article in extras:
                        # Best effort: one failed delete must not stop the
                        # clean-up of the remaining duplicates.
                        try:
                            articles.data.delete_by_id(article.uuid)
                            removed_count += 1
                            print(f"  Removed duplicate: {slug}")
                        except Exception as e:
                            print(f"  Error removing {slug}: {e}")

                print(f"\n🗑️  Removed {removed_count} duplicates")
    finally:
        # Release the connection even if fetching or deleting failed.
        client.close()

    print("\n✅ Duplicate check complete")


if __name__ == "__main__":
    main()