#!/usr/bin/env python3
"""
Extract Chicago CSV data into ChicagoTemp table
Version 3: Facts-only extraction - Claude will handle all inference
Stage 1: Bulk insert all rows from CSV with direct field mappings only
Stage 2: Consolidation will happen in separate script
Stage 3: Claude will infer all metadata during processing
"""

import weaviate
import csv
import os
from datetime import datetime
from tqdm import tqdm
import re

def _build_record(row):
    """Build one facts-only ChicagoTemp record dict from a CSV row.

    Only direct field mappings plus two factual derivations (violation
    count, closure flag) are populated; every inference field is left
    empty for Claude to fill in during later processing.

    Args:
        row: dict from csv.DictReader for a single CSV line.

    Returns:
        dict of properties ready for collection.data.insert().

    Raises:
        ValueError: if Latitude/Longitude contain non-numeric text
            (caller treats this as a per-row processing error).
    """
    # Preserve the whole raw row as "key:value, ..." pairs for reference.
    # ':' inside values is swapped for ';' so the pair delimiter stays
    # unambiguous when the string is parsed later.
    raw_data_pairs = []
    for key, value in row.items():
        if value and value.strip():
            clean_key = key.replace(' ', '_').replace('#', 'Number')
            clean_value = value.replace(':', ';')
            raw_data_pairs.append(f"{clean_key}:{clean_value}")
    raw_data = ", ".join(raw_data_pairs)

    # Violations are '|' delimited in the source CSV; counting them is
    # factual, not inference.  `or ''` also guards against None values,
    # which DictReader can produce for malformed rows (get()'s default
    # only covers *missing* keys, not None values).
    violations_text = row.get('Violations') or ''
    violation_count = len(violations_text.split('|')) if violations_text else 0

    # Closure status is read directly from the Results field text.
    results = row.get('Results') or ''
    is_closure = 'Business Not Located' in results or 'Out of Business' in results

    return {
        # ========== DIRECT CSV MAPPINGS (FACTS) ==========
        'inspection_id': row.get('Inspection ID', ''),
        'facility_name': row.get('DBA Name', ''),
        'aka_name': row.get('AKA Name', ''),
        'license_number': row.get('License #', ''),
        'facility_type': row.get('Facility Type', ''),
        'risk_level': row.get('Risk', ''),
        'address': row.get('Address', ''),
        'city': row.get('City', ''),
        'state': row.get('State', ''),
        'zip_code': row.get('Zip', ''),
        'inspection_date': row.get('Inspection Date', ''),
        'inspection_type': row.get('Inspection Type', ''),
        'results': results,
        'violations': violations_text,
        'latitude': float(row.get('Latitude', 0) or 0),
        'longitude': float(row.get('Longitude', 0) or 0),
        'violation_count': violation_count,  # Factual count
        'is_closure': is_closure,  # Factual from Results field

        # ========== FIELDS FOR CLAUDE TO POPULATE ==========
        # Establishment analysis (Claude will infer)
        'cuisine_type': '',  # Claude will analyze name/type
        'service_style': '',  # Claude will determine
        'price_range': '',  # Claude will estimate
        'is_chain': False,  # Claude will detect
        'chain_name': '',  # Claude will identify

        # Location context (Claude will provide)
        'neighborhood': '',  # Claude knows Chicago
        'neighborhood_type': '',  # Claude will categorize
        'nearby_landmarks': [],  # Claude will identify
        'nearby_transit': [],  # Claude knows CTA
        'nearby_universities': [],  # Claude knows schools
        'district_characteristics': '',  # Claude will describe

        # Violation analysis (Claude will analyze)
        'violation_category': '',  # Claude will categorize
        'violation_severity': '',  # Claude will score (text field)
        'critical_violations': 0,  # Claude will count (number field)
        'is_repeat_offender': False,  # Claude will determine from text

        # Risk assessment (Claude will evaluate)
        'risk_factors': [],  # Claude will identify
        'typical_violations': [],  # Claude knows patterns
        'remediation_required': '',  # Claude will recommend
        'food_safety_concerns': [],  # Claude will list

        # Historical context (Claude may know)
        'chain_history': '',  # Claude knows major incidents
        'historical_context': '',  # Claude may have context

        # ========== PROCESSING METADATA ==========
        'raw_data': raw_data,
        'status': 'pending',
        'source_city': 'Chicago',
        'is_combined': False,
        'combined_from_rows': [],  # Text array field
        'claude_response': '',
        'error_log': '',

        # ========== TIMESTAMPS ==========
        'created_at': datetime.now().isoformat(),
        'processed_at': '',
        'published_date': '',

        # ========== CONTENT FIELDS (EMPTY UNTIL CLAUDE) ==========
        'title': '',
        'slug': '',
        'content': '',
        'excerpt': '',
        'meta_description': '',
        'tags': [],
        'image_url': '',
        'image_alt': '',

        # ========== SEO/SCHEMA ==========
        'schema_json': '',
        'faq_json': '',
        'article_type': '',
    }


def _insert_batch(collection, batch, errors):
    """Insert a batch of records one object at a time.

    Args:
        collection: Weaviate collection to insert into.
        batch: list of property dicts from _build_record().
        errors: running error total so far (used to cap error output).

    Returns:
        (inserted_delta, new_error_total) tuple.
    """
    inserted = 0
    for data_obj in batch:
        try:
            collection.data.insert(properties=data_obj)
            inserted += 1
        except Exception as e:
            errors += 1
            if errors <= 5:  # Only show the first 5 errors overall
                print(f"Insert error: {e}")
    return inserted, errors


def extract_csv_to_temp(client):
    """Extract all CSV data into ChicagoTemp table - facts only, no inference.

    Args:
        client: connected Weaviate client.

    Returns:
        int: number of records successfully inserted.
    """
    csv_path = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/chicago/chicago_food_inspections.csv"

    # First pass over the file just to size the progress bar.
    print("Counting CSV rows...")
    with open(csv_path, 'r', encoding='utf-8', errors='ignore') as f:
        total_rows = sum(1 for line in f) - 1  # Subtract header

    print(f"Found {total_rows:,} records to process")

    # Get collection
    collection = client.collections.get("ChicagoTemp")

    # Process CSV
    print("Extracting CSV data (facts only - no inference)...")
    inserted = 0
    errors = 0
    batch = []
    batch_size = 100

    with open(csv_path, 'r', encoding='utf-8', errors='ignore') as csvfile:
        reader = csv.DictReader(csvfile)

        with tqdm(total=total_rows, desc="Inserting") as pbar:
            for row in reader:
                try:
                    batch.append(_build_record(row))
                except Exception as e:
                    # Row-level failure (e.g. bad lat/lon text): count it,
                    # advance the bar, and keep going.
                    errors += 1
                    if errors <= 5:
                        print(f"Processing error: {e}")
                    pbar.update(1)
                    continue

                if len(batch) >= batch_size:
                    delta, errors = _insert_batch(collection, batch, errors)
                    inserted += delta
                    pbar.update(len(batch))
                    batch = []

            # Insert remaining records
            if batch:
                delta, errors = _insert_batch(collection, batch, errors)
                inserted += delta
                pbar.update(len(batch))

    print(f"\n✅ Extraction complete:")
    print(f"   Inserted: {inserted:,} records")
    print(f"   Errors: {errors:,} records")

    return inserted

def verify_data(client):
    """Verify data in ChicagoTemp table.

    Prints the total record count, a few sample records, and a quick scan
    (over a 1000-object sample) for facility/date pairs that appear more
    than once — those get consolidated in the next pipeline step.

    Args:
        client: connected Weaviate client.

    Returns:
        Total object count reported by the collection aggregate.
    """
    print("\nVerifying data in ChicagoTemp...")

    collection = client.collections.get("ChicagoTemp")

    # Get count
    response = collection.aggregate.over_all()
    total_count = response.total_count

    print(f"Total records in ChicagoTemp: {total_count:,}")

    # Get sample records to show what we extracted
    print("\nSample records (facts only - no inference):")
    sample = collection.query.fetch_objects(limit=3)

    for i, obj in enumerate(sample.objects, 1):
        props = obj.properties
        print(f"\nRecord {i}:")
        print(f"  === FACTS FROM CSV ===")
        print(f"  Facility: {props.get('facility_name', 'N/A')}")
        print(f"  Type: {props.get('facility_type', 'N/A')}")
        print(f"  Address: {props.get('address', 'N/A')}")
        print(f"  Zip: {props.get('zip_code', 'N/A')}")
        print(f"  Date: {props.get('inspection_date', 'N/A')}")
        print(f"  Result: {props.get('results', 'N/A')}")
        print(f"  Violation Count: {props.get('violation_count', 0)}")
        print(f"  === FIELDS AWAITING CLAUDE ===")
        print(f"  Cuisine Type: {props.get('cuisine_type', '[Pending Claude analysis]')}")
        print(f"  Neighborhood: {props.get('neighborhood', '[Pending Claude analysis]')}")
        print(f"  Service Style: {props.get('service_style', '[Pending Claude analysis]')}")
        print(f"  Violation Category: {props.get('violation_category', '[Pending Claude analysis]')}")

    # Check for facilities with multiple violations on same date
    print("\nChecking for inspections with multiple records...")
    print("(These will be consolidated in the next step)")

    # Query a sample for analysis
    sample_for_analysis = collection.query.fetch_objects(limit=1000)

    # Key on a (facility, date) tuple rather than a '|'-joined string:
    # a facility name containing '|' would previously make the unpacking
    # split() raise ValueError.
    facility_date_counts = {}
    for obj in sample_for_analysis.objects:
        props = obj.properties
        key = (props.get('facility_name', ''), props.get('inspection_date', ''))
        facility_date_counts[key] = facility_date_counts.get(key, 0) + 1

    multiples = {k: v for k, v in facility_date_counts.items() if v > 1}

    if multiples:
        print(f"Found {len(multiples)} facilities with multiple records (sample of 1000)")
        # Show first 3 examples
        for (facility, date), count in list(multiples.items())[:3]:
            print(f"  - {facility} on {date}: {count} records")

    return total_count

def main():
    """Entry point: connect to Weaviate, run extraction, then verify results."""
    banner = "=" * 60
    print(banner)
    print("CHICAGO DATA EXTRACTION V3 - FACTS ONLY")
    print("Claude will handle all intelligent inference")
    print(banner)

    # Open a local Weaviate connection; always closed in the finally block.
    print("Connecting to Weaviate...")
    client = weaviate.connect_to_local(host="localhost", port=8080)

    try:
        # Stage 1: facts-only bulk load from the CSV.
        extract_csv_to_temp(client)

        # Stage 2: sanity-check what landed in ChicagoTemp.
        total = verify_data(client)

        # Closing summary for the operator.
        for line in (
            "\n" + banner,
            "✅ EXTRACTION COMPLETE",
            f"   Total records in ChicagoTemp: {total:,}",
            "\n   Extraction approach:",
            "   ✓ Direct CSV field mappings only",
            "   ✓ No hardcoded inference rules",
            "   ✓ Claude will analyze and enrich during processing",
            "\n   Benefits:",
            "   • No maintenance of hardcoded rules",
            "   • Claude's complete knowledge utilized",
            "   • Consistent, intelligent analysis",
            "   • Richer context and relationships",
            "\n   Ready for consolidation step",
            banner,
        ):
            print(line)

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()

    finally:
        client.close()

if __name__ == "__main__":
    main()