#!/usr/bin/env python3
"""
Process the 50 oldest inspection records from the newly downloaded Chicago food inspection data.
"""

import os
import sys
import pandas as pd
from pathlib import Path

# Add the scripts directory to path
sys.path.append('/var/www/twin-digital-media/public_html/_sites/cleankitchens/production/scripts')

from comprehensive_processor import CleanKitchensProcessor

def process_50_oldest():
    """Process the 50 oldest inspection records from new data"""
    
    # Use the new CSV file we just downloaded
    csv_file = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/production/data/chicago_food_inspections_latest.csv"
    
    if not Path(csv_file).exists():
        print(f"❌ File not found: {csv_file}")
        return
    
    print("📖 Reading CSV to find oldest records...")
    
    # Read the full CSV; low_memory=False disables chunked dtype inference so mixed-type columns parse consistently
    df = pd.read_csv(csv_file, low_memory=False)
    print(f"📊 Total records in dataset: {len(df):,}")
    
    # Convert inspection date to datetime for sorting
    df['inspection_date'] = pd.to_datetime(df['inspection_date'], errors='coerce')
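    # errors='coerce' turns unparseable dates into NaT; those rows are dropped below before sorting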
    
    # Sort by inspection date and get oldest 50
    df_sorted = df.dropna(subset=['inspection_date']).sort_values('inspection_date')
    oldest_50 = df_sorted.head(50)
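    # Note: a full sort is the simplest approach here; an equivalent (and usually cheaper)
    # alternative, if memory ever becomes a concern, would be:
    #   oldest_50 = df.dropna(subset=['inspection_date']).nsmallest(50, 'inspection_date')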
    
    print(f"\n📊 Found oldest 50 records:")
    print(f"Date range: {oldest_50['inspection_date'].min()} to {oldest_50['inspection_date'].max()}")
    
    # Show summary of results
    results_summary = oldest_50['results'].value_counts()
    print(f"\nResults breakdown:")
    for result, count in results_summary.items():
        print(f"  {result}: {count}")
    
    # Save oldest 50 to temp file
    temp_file = '/tmp/oldest_50_inspections.csv'
    oldest_50.to_csv(temp_file, index=False)
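    # to_csv serialises inspection_date back to an ISO-format string; the processor is
    # assumed to re-parse dates itself when it ingests this file.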
    print(f"\n💾 Saved oldest 50 to: {temp_file}")
    
    # Process with our system
    print(f"\n🔄 Starting processor...")
    processor = CleanKitchensProcessor()
    
    try:
        print(f"Processing 50 oldest records...")
        processor.process_bulk_upload(temp_file, batch_size=50)
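        # batch_size=50 is assumed to let the processor handle the whole temp file in one batch;
        # see CleanKitchensProcessor.process_bulk_upload for the actual batching semantics.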
        
        print(f"\n✅ Processing complete!")
        print(f"Articles generated: {processor.articles_generated}")
        print(f"Estimated cost: ${processor.total_cost:.4f}")
        
        # Show what was created
        print(f"\n📋 Generated content:")
        
        # Query articles from Weaviate to see what was created
        articles = processor.articles.query.fetch_objects(limit=60)  # Get more than 50 to see all
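        # fetch_objects caps the listing at `limit`; if the collection ever holds more than 60
        # articles, iterating the collection (e.g. processor.articles.iterator(), assuming a
        # Weaviate v4 collection handle) would be needed to see everything.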
        
        individual_count = 0
        group_count = 0
        
        for obj in articles.objects:
            props = obj.properties
            article_type = props.get('article_type', 'individual')
            title = props.get('title', 'Unknown')
            published = str(props.get('published_date', 'Unknown'))[:10]  # just the date part; str() also handles datetime values
            
            if article_type == 'group_pass':
                restaurant_count = props.get('restaurant_count', 0)
                print(f"  📰 GROUP PASS: {title} ({restaurant_count} restaurants) - {published}")
                group_count += 1
            else:
                establishment = props.get('establishment_name', 'Unknown')
                print(f"  📰 INDIVIDUAL: {establishment} - {title[:60]}... - {published}")
                individual_count += 1
        
        print(f"\nSummary:")
        print(f"  Individual articles: {individual_count}")
        print(f"  Group articles: {group_count}")
        print(f"  Total articles in database: {len(articles.objects)}")
        
    except Exception as e:
        print(f"❌ Error during processing: {e}")
        import traceback
        traceback.print_exc()
    
    finally:
        # Clean up
        print(f"\n🧹 Cleaning up temp file...")
        Path(temp_file).unlink(missing_ok=True)  # missing_ok (Python 3.8+) avoids an error if the file was never created

if __name__ == "__main__":
    process_50_oldest()