#!/usr/bin/env python3
"""
Process 2023-2025 Chicago inspection data, working backwards from most recent
Auto-run version for background processing
"""

import os
import sys
import pandas as pd
import logging
from pathlib import Path
from datetime import datetime

# Set up logging
log_file = '/var/www/twin-digital-media/public_html/_sites/cleankitchens/production/logs/processing.log'
os.makedirs(os.path.dirname(log_file), exist_ok=True)

logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_file),
        logging.StreamHandler(sys.stdout)
    ]
)

# Add the scripts directory to path
sys.path.append('/var/www/twin-digital-media/public_html/_sites/cleankitchens/production/scripts')
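# comprehensive_processor.py is expected to live in the scripts directory added above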

from comprehensive_processor import CleanKitchensProcessor

def process_recent_years():
    """Process 2023-2025 inspection data, starting from most recent"""
    
    logging.info("🚀 Starting CleanKitchens 2023-2025 processing")
    
    # Use the new CSV file we downloaded
    csv_file = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/production/data/chicago_food_inspections_latest.csv"
    
    if not Path(csv_file).exists():
        logging.error(f"❌ File not found: {csv_file}")
        return
    
    logging.info("📖 Reading CSV and filtering for 2023-2025...")
    
    # Read the full CSV in one pass; low_memory=False avoids mixed-dtype warnings on the large export
    df = pd.read_csv(csv_file, low_memory=False)
    logging.info(f"📊 Total records in dataset: {len(df):,}")
    
    # Convert inspection date to datetime; errors='coerce' turns unparseable dates into NaT
    df['inspection_date'] = pd.to_datetime(df['inspection_date'], errors='coerce')
    
    # Drop rows with unparseable dates, then keep only 2023-2025 inspections
    df_filtered = df.dropna(subset=['inspection_date'])
    df_recent = df_filtered[df_filtered['inspection_date'].dt.year.between(2023, 2025)]
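    # Stop early if the export contains no rows in the target window
    if df_recent.empty:
        logging.error("❌ No 2023-2025 records found in the dataset")
        return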
    
    # Sort by inspection date (most recent first - descending)
    df_recent_sorted = df_recent.sort_values('inspection_date', ascending=False)
    
    logging.info(f"📊 Found {len(df_recent_sorted):,} records from 2023-2025")
    logging.info(f"Date range: {df_recent_sorted['inspection_date'].min()} to {df_recent_sorted['inspection_date'].max()}")
    
    # Show breakdown by year
    year_counts = df_recent_sorted['inspection_date'].dt.year.value_counts().sort_index()
    logging.info("Records by year:")
    for year, count in year_counts.items():
        logging.info(f"  {year}: {count:,}")
    
    # Show results breakdown
    results_summary = df_recent_sorted['results'].value_counts()
    logging.info("Results breakdown:")
    for result, count in results_summary.items():
        logging.info(f"  {result}: {count:,}")
    
    # Estimate costs: failed/conditional/closed/not-ready inspections get individual
    # articles, while clean passes are grouped roughly five per article
    total_records = len(df_recent_sorted)
    estimated_individual_articles = len(df_recent_sorted[df_recent_sorted['results'].isin(['Fail', 'Pass w/ Conditions', 'Out of Business', 'Not Ready'])])
    estimated_group_articles = len(df_recent_sorted[df_recent_sorted['results'] == 'Pass']) // 5
    
    total_estimated_articles = estimated_individual_articles + estimated_group_articles
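    # Rough estimate assuming ~$0.001 of generation cost per article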
    estimated_cost = total_estimated_articles * 0.001
    
    logging.info("💰 Cost Estimation:")
    logging.info(f"  Individual articles (failures): ~{estimated_individual_articles:,}")
    logging.info(f"  Group articles (passes): ~{estimated_group_articles:,}")
    logging.info(f"  Total estimated articles: ~{total_estimated_articles:,}")
    logging.info(f"  Estimated cost: ${estimated_cost:.2f}")
    
    # Save filtered data to temp file
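    # (assumes /tmp is writable and has room for the filtered CSV)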
    temp_file = '/tmp/recent_years_inspections.csv'
    df_recent_sorted.to_csv(temp_file, index=False)
    logging.info(f"💾 Saved filtered data to: {temp_file}")
    
    # Process with our system
    logging.info("🔄 Starting processor for 2023-2025 data...")
    processor = CleanKitchensProcessor()
    
    try:
        logging.info(f"Processing {len(df_recent_sorted):,} records (working backwards from most recent)...")
        
        # Process in batches of 100 records (kept modest to avoid timeouts)
        batch_size = 100
        processor.process_bulk_upload(temp_file, batch_size=batch_size)
        
        logging.info("✅ Processing complete!")
        logging.info(f"Articles generated: {processor.articles_generated}")
        logging.info(f"Total cost: ${processor.total_cost:.2f}")
        
        # Show what was created
        logging.info("📋 Content Summary:")
        
        # Fetch a sample of articles from Weaviate (up to 100) to summarize what was created
        articles = processor.articles.query.fetch_objects(limit=100)
        
        individual_count = 0
        group_count = 0
        
        # Count by type
        for obj in articles.objects:
            props = obj.properties
            article_type = props.get('article_type', 'individual')
            
            if article_type == 'group_pass':
                group_count += 1
            else:
                individual_count += 1
        
        logging.info(f"  Individual articles: {individual_count}")
        logging.info(f"  Group articles: {group_count}")
        logging.info(f"  Total articles in database: {len(articles.objects)}")
        
        # Show recent articles created
        logging.info("📰 Recent articles created:")
        for i, obj in enumerate(articles.objects[:10]):
            props = obj.properties
            title = props.get('title', 'Unknown')
            published = props.get('published_date', 'Unknown')
            if hasattr(published, 'strftime'):
                published = published.strftime('%Y-%m-%d')
            elif isinstance(published, str):
                published = published[:10]
            
            logging.info(f"  {i+1}. {title[:80]}... - {published}")
        
        if len(articles.objects) > 10:
            logging.info(f"  ... and {len(articles.objects) - 10} more articles")
        
    except Exception as e:
        # Log the full traceback, then re-raise so the script exits nonzero
        logging.error(f"❌ Error during processing: {e}", exc_info=True)
        raise
    
    finally:
        # Clean up
        logging.info("🧹 Cleaning up temp file...")
        Path(temp_file).unlink(missing_ok=True)
    
    logging.info("🎉 CleanKitchens processing completed successfully!")

if __name__ == "__main__":
    process_recent_years()