#!/usr/bin/env python3
"""
Update database schema to include only fields Claude can actually populate
Makes chicago_temp and Articles tables have identical structure
"""

import weaviate
import weaviate.classes.config as wvc

def get_claude_accessible_fields():
    """
    Define all fields that Claude can actually populate
    Based on inspection data + Claude's general knowledge
    """
    return [
        # ========== CORE FIELDS FROM CSV ==========
        ('inspection_id', wvc.DataType.TEXT, 'Unique inspection identifier'),
        ('facility_name', wvc.DataType.TEXT, 'Restaurant/facility name'),
        ('aka_name', wvc.DataType.TEXT, 'Alternative business name'),
        ('license_number', wvc.DataType.TEXT, 'Business license'),
        ('facility_type', wvc.DataType.TEXT, 'Type of establishment'),
        ('risk_level', wvc.DataType.TEXT, 'Risk category from inspection'),
        ('address', wvc.DataType.TEXT, 'Street address'),
        ('city', wvc.DataType.TEXT, 'City name'),
        ('state', wvc.DataType.TEXT, 'State code'),
        ('zip_code', wvc.DataType.TEXT, 'Postal code'),
        ('inspection_date', wvc.DataType.TEXT, 'Date of inspection'),
        ('inspection_type', wvc.DataType.TEXT, 'Type of inspection'),
        ('results', wvc.DataType.TEXT, 'Pass/Fail/etc'),
        ('violations', wvc.DataType.TEXT, 'Full violation text'),
        ('latitude', wvc.DataType.TEXT, 'Geographic coordinate'),
        ('longitude', wvc.DataType.TEXT, 'Geographic coordinate'),
        
        # ========== CLAUDE-GENERATED CONTENT ==========
        ('title', wvc.DataType.TEXT, 'Article headline'),
        ('slug', wvc.DataType.TEXT, 'URL slug'),
        ('content', wvc.DataType.TEXT, 'Full article HTML'),
        ('excerpt', wvc.DataType.TEXT, 'Article summary'),
        ('meta_description', wvc.DataType.TEXT, 'SEO description'),
        ('tags', wvc.DataType.TEXT_ARRAY, 'Article tags'),
        ('image_url', wvc.DataType.TEXT, 'Selected image'),
        ('image_alt', wvc.DataType.TEXT, 'Image alt text'),
        
        # ========== CLAUDE-INFERABLE METADATA ==========
        # From restaurant name/type
        ('cuisine_type', wvc.DataType.TEXT, 'Italian/Mexican/etc - inferred from name'),
        ('service_style', wvc.DataType.TEXT, 'fast-food/casual/fine-dining - inferred'),
        ('price_range', wvc.DataType.TEXT, '$/$$/$$$/$$$$- known chains only'),
        ('is_chain', wvc.DataType.BOOL, 'Recognizable chain'),
        ('chain_name', wvc.DataType.TEXT, 'Name of chain if applicable'),
        
        # From violation text analysis
        ('violation_category', wvc.DataType.TEXT, 'Primary violation type'),
        ('violation_severity', wvc.DataType.NUMBER, 'Severity score 1-10'),
        ('critical_violations', wvc.DataType.TEXT_ARRAY, 'List of critical issues'),
        ('violation_count', wvc.DataType.NUMBER, 'Total number of violations'),
        ('is_repeat_offender', wvc.DataType.BOOL, 'If violations mention repeat'),
        ('is_closure', wvc.DataType.BOOL, 'If establishment was closed'),
        
        # From Claude's geographic knowledge
        ('neighborhood', wvc.DataType.TEXT, 'Known neighborhood name'),
        ('neighborhood_type', wvc.DataType.TEXT, 'commercial/residential/tourist/college'),
        ('nearby_landmarks', wvc.DataType.TEXT_ARRAY, 'Known landmarks within area'),
        ('nearby_transit', wvc.DataType.TEXT_ARRAY, 'Known transit stops/lines'),
        ('nearby_universities', wvc.DataType.TEXT_ARRAY, 'Known colleges/universities'),
        ('district_characteristics', wvc.DataType.TEXT, 'General area description'),
        
        # From Claude's food safety knowledge
        ('risk_factors', wvc.DataType.TEXT_ARRAY, 'Known risks for this food type'),
        ('typical_violations', wvc.DataType.TEXT_ARRAY, 'Common issues for cuisine type'),
        ('remediation_required', wvc.DataType.TEXT, 'Standard fixes needed'),
        ('food_safety_concerns', wvc.DataType.TEXT_ARRAY, 'Specific safety issues'),
        
        # From Claude's historical knowledge (pre-2024)
        ('chain_history', wvc.DataType.TEXT, 'Known chain-level incidents'),
        ('historical_context', wvc.DataType.TEXT, 'Any known history'),
        
        # ========== PROCESSING METADATA ==========
        ('raw_data', wvc.DataType.TEXT, 'Original key:value pairs'),
        ('status', wvc.DataType.TEXT, 'Processing status'),
        ('source_city', wvc.DataType.TEXT, 'Data source city'),
        ('is_combined', wvc.DataType.BOOL, 'Multiple violations combined'),
        ('combined_from_rows', wvc.DataType.NUMBER, 'Number of rows combined'),
        ('claude_response', wvc.DataType.TEXT, 'Full Claude API response'),
        ('error_log', wvc.DataType.TEXT, 'Any errors during processing'),
        
        # ========== TIMESTAMPS ==========
        ('created_at', wvc.DataType.TEXT, 'Record creation time'),
        ('processed_at', wvc.DataType.TEXT, 'Processing completion time'),
        ('published_date', wvc.DataType.TEXT, 'Article publication date'),
        
        # ========== SEO/SCHEMA ==========
        ('schema_json', wvc.DataType.TEXT, 'Structured data schema'),
        ('faq_json', wvc.DataType.TEXT, 'FAQ schema if present'),
        ('article_type', wvc.DataType.TEXT, 'Type of article'),
    ]

def create_chicago_temp_updated(client):
    """Create updated chicago_temp table with Claude-accessible fields"""
    print("Creating updated ChicagoTemp table...")
    
    try:
        # Delete existing if present
        try:
            client.collections.delete("ChicagoTemp")
            print("  Deleted existing ChicagoTemp collection")
        except:
            pass
        
        # Create with new schema
        properties = []
        for field_name, data_type, description in get_claude_accessible_fields():
            properties.append(
                wvc.Property(
                    name=field_name,
                    data_type=data_type,
                    description=description
                )
            )
        
        client.collections.create(
            name="ChicagoTemp",
            description="Staging table with Claude-accessible fields only",
            properties=properties
        )
        
        print("✅ ChicagoTemp table created with Claude-accessible fields")
        return True
        
    except Exception as e:
        print(f"❌ Error creating ChicagoTemp: {e}")
        return False

def create_articles_table(client):
    """Create Articles table with identical structure to ChicagoTemp"""
    print("Creating Articles table with matching schema...")
    
    try:
        # Check if exists
        try:
            client.collections.delete("Articles")
            print("  Deleted existing Articles collection")
        except:
            pass
        
        # Create with same schema as ChicagoTemp
        properties = []
        for field_name, data_type, description in get_claude_accessible_fields():
            properties.append(
                wvc.Property(
                    name=field_name,
                    data_type=data_type,
                    description=description
                )
            )
        
        client.collections.create(
            name="Articles",
            description="Main articles table with Claude-accessible fields",
            properties=properties
        )
        
        print("✅ Articles table created with matching schema")
        return True
        
    except Exception as e:
        print(f"❌ Error creating Articles: {e}")
        return False

def list_fields_to_remove():
    """List fields that Claude CANNOT access (to be removed)"""
    cannot_access = [
        # Demographic data (needs Census API)
        'population_density',
        'median_income',
        'demographic_profile',
        
        # Real-time data (needs external APIs)
        'foot_traffic',
        'venue_density',
        'competitor_count',
        'current_wait_time',
        
        # Business details (needs business API)
        'establishment_age',
        'seating_capacity',
        'employee_count',
        'annual_revenue',
        'ownership_type',
        
        # Current news/social (needs news/social APIs)
        'recent_news_mentions',
        'social_media_sentiment',
        'yelp_rating',
        'google_reviews',
        'recent_complaints',
        
        # Precise measurements (needs mapping API)
        'exact_distance_to_transit',
        'exact_distance_to_schools',
        'walking_score',
        'parking_availability',
    ]
    
    print("\n❌ Fields to REMOVE (Claude cannot access):")
    for field in cannot_access:
        print(f"   - {field}")
    
    return cannot_access

def list_fields_to_add():
    """List new fields that Claude CAN provide"""
    can_provide = [
        # Location context from knowledge
        ('neighborhood', 'Known neighborhood name'),
        ('nearby_landmarks', 'Famous landmarks Claude knows'),
        ('nearby_transit', 'Major transit lines/stations'),
        ('nearby_universities', 'Colleges/universities'),
        
        # Inferred from name/type
        ('cuisine_type', 'Inferred from restaurant name'),
        ('service_style', 'fast-food/casual/fine-dining'),
        ('price_range', 'For known chains'),
        
        # From violation analysis
        ('violation_category', 'Primary type of violation'),
        ('violation_severity', 'Severity score 1-10'),
        ('critical_violations', 'List of critical issues'),
        
        # From knowledge base
        ('risk_factors', 'Known risks for food type'),
        ('typical_violations', 'Common issues for cuisine'),
        ('chain_history', 'Known chain incidents pre-2024'),
    ]
    
    print("\n✅ Fields to ADD (Claude can provide):")
    for field, description in can_provide:
        print(f"   + {field}: {description}")
    
    return can_provide

def main():
    print("="*60)
    print("DATABASE SCHEMA UPDATE")
    print("Updating to include only Claude-accessible fields")
    print("="*60)
    
    # Connect to Weaviate
    print("\nConnecting to Weaviate...")
    client = weaviate.connect_to_local(host="localhost", port=8080)
    
    try:
        # Show what changes we're making
        print("\n📋 SCHEMA CHANGES:")
        list_fields_to_remove()
        list_fields_to_add()
        
        print("\n" + "="*60)
        print("Creating updated tables...")
        print("="*60)
        
        # Create updated tables
        if create_chicago_temp_updated(client):
            print("✅ ChicagoTemp updated successfully")
        
        if create_articles_table(client):
            print("✅ Articles table updated successfully")
        
        print("\n" + "="*60)
        print("✅ SCHEMA UPDATE COMPLETE")
        print("Both tables now have identical structure with only Claude-accessible fields")
        print("="*60)
        
        # List final schema
        print("\n📋 FINAL SCHEMA (both tables):")
        print("Categories:")
        print("  • Core CSV fields (16 fields)")
        print("  • Claude-generated content (8 fields)")
        print("  • Claude-inferable metadata (20 fields)")
        print("  • Processing metadata (7 fields)")
        print("  • Timestamps (3 fields)")
        print("  • SEO/Schema (3 fields)")
        print(f"  • TOTAL: {len(get_claude_accessible_fields())} fields")
        
    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()
    
    finally:
        client.close()

if __name__ == "__main__":
    main()