#!/usr/bin/env python3
"""
Extract Chicago CSV data into updated ChicagoTemp table
Version 2: Includes new Claude-accessible metadata fields
Stage 1: Bulk insert all rows from CSV
Stage 2: Consolidation will happen in separate script
"""

import weaviate
import csv
import os
from datetime import datetime
from tqdm import tqdm
import re

def infer_cuisine_type(facility_name):
    """Guess a cuisine category from keywords in the restaurant's name.

    Args:
        facility_name: DBA name of the facility.

    Returns:
        A cuisine label string, or 'general' when no keyword matches.
    """
    lowered = facility_name.lower()

    # Checked in order; the first category with a matching keyword wins,
    # so e.g. 'bbq' resolves to 'american' before 'korean' is considered.
    keyword_table = (
        ('pizza', ('pizza', 'pizzeria')),
        ('mexican', ('taco', 'burrito', 'mexican', 'taqueria', 'cantina')),
        ('chinese', ('china', 'chinese', 'wok', 'panda', 'szechuan', 'hunan', 'cantonese')),
        ('italian', ('italian', 'pasta', 'trattoria', 'osteria', 'ristorante')),
        ('japanese', ('sushi', 'ramen', 'japanese', 'hibachi', 'teriyaki', 'bento')),
        ('thai', ('thai', 'pad thai', 'bangkok')),
        ('indian', ('indian', 'curry', 'tandoor', 'masala', 'biryani')),
        ('american', ('burger', 'grill', 'bbq', 'barbecue', 'steakhouse', 'diner')),
        ('coffee', ('coffee', 'cafe', 'espresso', 'starbucks', 'dunkin')),
        ('vietnamese', ('pho', 'vietnamese', 'banh mi', 'saigon')),
        ('mediterranean', ('mediterranean', 'greek', 'gyro', 'kebab', 'falafel')),
        ('korean', ('korean', 'kimchi', 'bulgogi', 'bbq')),
        ('seafood', ('seafood', 'fish', 'crab', 'lobster', 'oyster')),
        ('bakery', ('bakery', 'bakehouse', 'patisserie', 'donut', 'bread')),
        ('bar', ('bar', 'pub', 'tavern', 'brewery', 'taproom')),
    )

    for label, keywords in keyword_table:
        if any(word in lowered for word in keywords):
            return label

    return 'general'

def infer_service_style(facility_name, facility_type):
    """Classify a facility's service style from its name and licensed type.

    Args:
        facility_name: DBA name of the facility.
        facility_type: Licensed facility type string; may be None/empty.

    Returns:
        One of 'fast_food', 'fine_dining', 'casual_dining', 'institutional',
        'food_truck', 'bakery', or 'bar'. Defaults to 'casual_dining'.
    """
    lowered_name = facility_name.lower()
    lowered_type = facility_type.lower() if facility_type else ''

    # Name-based checks come first: a known chain name overrides the type.
    fast_food_chains = ('mcdonald', 'subway', 'burger king', 'wendy', 'taco bell',
                        'kfc', 'popeyes', 'chipotle', 'five guys', 'jimmy john',
                        'potbelly')
    if any(chain in lowered_name for chain in fast_food_chains):
        return 'fast_food'

    if any(word in lowered_name for word in ('bistro', 'ristorante', 'fine dining', 'supper club')):
        return 'fine_dining'

    if any(chain in lowered_name for chain in ('applebee', 'olive garden', 'red lobster', 'outback', 'chili')):
        return 'casual_dining'

    # Fall back to the licensed facility type; first match wins.
    if any(word in lowered_type for word in ('school', 'daycare', 'hospital')):
        return 'institutional'
    if 'mobile' in lowered_type or 'truck' in lowered_type:
        return 'food_truck'
    if 'bakery' in lowered_type:
        return 'bakery'
    if 'bar' in lowered_type or 'tavern' in lowered_type:
        return 'bar'

    return 'casual_dining'  # Default

def detect_chain(facility_name):
    """Check whether a facility name matches a known restaurant chain.

    Args:
        facility_name: DBA name of the facility.

    Returns:
        Tuple of (is_chain, canonical_chain_name); (False, None) when no
        known chain keyword appears in the name.
    """
    lowered = facility_name.lower()

    # (lowercase keyword, canonical display name); first match wins.
    # Extend this list as needed.
    known_chains = (
        ('mcdonald', "McDonald's"),
        ('subway', 'Subway'),
        ('starbucks', 'Starbucks'),
        ('dunkin', 'Dunkin'),
        ('burger king', 'Burger King'),
        ('wendy', "Wendy's"),
        ('taco bell', 'Taco Bell'),
        ('kfc', 'KFC'),
        ('popeyes', 'Popeyes'),
        ('chipotle', 'Chipotle'),
        ('five guys', 'Five Guys'),
        ('jimmy john', "Jimmy John's"),
        ('potbelly', 'Potbelly'),
        ('panera', 'Panera Bread'),
        ('pizza hut', 'Pizza Hut'),
        ('domino', "Domino's"),
        ('papa john', "Papa John's"),
        ('olive garden', 'Olive Garden'),
        ('red lobster', 'Red Lobster'),
        ('applebee', "Applebee's"),
        ("chili's", "Chili's"),
        ('outback', 'Outback Steakhouse'),
        ('buffalo wild wings', 'Buffalo Wild Wings'),
        ('ihop', 'IHOP'),
        ('denny', "Denny's"),
        ('waffle house', 'Waffle House'),
    )

    for keyword, canonical in known_chains:
        if keyword in lowered:
            return True, canonical

    return False, None

def get_price_range(chain_name, service_style):
    """Estimate a price range ('$' .. '$$$$') for a facility.

    Args:
        chain_name: Canonical chain name, or None for independents.
        service_style: Style label from infer_service_style().

    Returns:
        A price-range string; known chains take precedence, otherwise the
        service style decides, defaulting to '$$'.
    """
    # Known-chain overrides.
    by_chain = {
        "McDonald's": '$',
        'Subway': '$',
        'Taco Bell': '$',
        'Chipotle': '$$',
        'Five Guys': '$$',
        'Panera Bread': '$$',
        'Olive Garden': '$$',
        'Red Lobster': '$$$',
        'Outback Steakhouse': '$$$',
    }

    if chain_name:
        chain_price = by_chain.get(chain_name)
        if chain_price is not None:
            return chain_price

    # Service-style fallback for independents and unlisted chains.
    by_style = {
        'fast_food': '$',
        'casual_dining': '$$',
        'fine_dining': '$$$$',
        'food_truck': '$',
        'bakery': '$$',
        'bar': '$$',
    }
    return by_style.get(service_style, '$$')

def get_chicago_neighborhood(address, zip_code):
    """Map an address/zip pair to a Chicago neighborhood name.

    Args:
        address: Street address string; may be None/empty.
        zip_code: Zip string (only the first 5 chars are used); may be None.

    Returns:
        Neighborhood name, falling back to 'Chicago' when nothing matches.
    """
    # Chicago zip to neighborhood mapping (partial list).
    zip_to_hood = {
        '60601': 'Loop', '60602': 'Loop', '60603': 'Loop', '60604': 'Loop',
        '60605': 'Loop', '60606': 'Loop',
        '60607': 'West Loop', '60608': 'Pilsen', '60609': 'Bridgeport',
        '60610': 'Near North', '60611': 'Streeterville', '60612': 'West Town',
        '60613': 'Lakeview', '60614': 'Lincoln Park', '60615': 'Kenwood',
        '60616': 'South Loop', '60617': 'South Chicago', '60618': 'North Center',
        '60619': 'Chatham', '60620': 'Auburn Gresham', '60621': 'Englewood',
        '60622': 'Wicker Park', '60623': 'Little Village',
        '60624': 'West Garfield Park', '60625': 'Lincoln Square',
        '60626': 'Rogers Park', '60628': 'Roseland', '60629': 'Gage Park',
        '60630': 'Jefferson Park', '60631': 'Norwood Park',
        '60632': 'Brighton Park', '60634': 'Portage Park',
        '60636': 'West Englewood', '60637': 'Hyde Park',
        '60638': 'Belmont Cragin', '60639': 'Hermosa', '60640': 'Uptown',
        '60641': 'Old Irving Park', '60642': 'Noble Square', '60643': 'Beverly',
        '60644': 'Austin', '60645': 'West Ridge', '60646': 'Edison Park',
        '60647': 'Logan Square', '60649': 'South Shore',
        '60651': 'Humboldt Park', '60652': 'West Lawn',
        '60653': 'Grand Boulevard', '60654': 'River North',
        '60655': 'Mount Greenwood', '60656': 'Norwood Park',
        '60657': 'Lakeview', '60659': 'West Rogers Park',
        '60660': 'Edgewater', '60661': 'West Loop',
    }

    if zip_code:
        hood = zip_to_hood.get(zip_code[:5])
        if hood:
            return hood

    # No zip match: fall back to street-name heuristics.
    addr = address.lower() if address else ''

    if 'michigan' in addr and 'north' in addr:
        return 'Magnificent Mile'
    if 'state st' in addr and any(digit in addr for digit in '012345'):
        return 'Loop'
    if 'halsted' in addr and 'north' in addr:
        return 'Lincoln Park'
    if 'clark st' in addr and 'north' in addr:
        return 'Wrigleyville'

    return 'Chicago'  # Default

def analyze_violations(violations_text):
    """Derive structured metadata from a free-text violations field.

    Args:
        violations_text: Raw "Violations" CSV column; may be empty or None.

    Returns:
        dict with keys:
            violation_category: coarse category string, or None when no text
            violation_severity: 0 (no text) or a 1-10 severity score
            critical_violations: up to 5 extracted critical-violation snippets
            is_closure: True when the text indicates the facility was closed
            food_safety_concerns: up to 5 mapped concern labels
    """
    if not violations_text:
        return {
            'violation_category': None,
            'violation_severity': 0,
            'critical_violations': [],
            'is_closure': False,
            'food_safety_concerns': []
        }

    violations_lower = violations_text.lower()

    # Primary category: first matching keyword group wins (order matters).
    category = 'general'
    if any(term in violations_lower for term in ['temperature', 'cold', 'hot', 'cooling', 'thaw']):
        category = 'temperature_control'
    elif any(term in violations_lower for term in ['rodent', 'mice', 'rat', 'roach', 'pest', 'insect']):
        category = 'pest_control'
    elif any(term in violations_lower for term in ['hand', 'wash', 'glove', 'hygiene', 'sick']):
        category = 'hygiene'
    elif any(term in violations_lower for term in ['contamination', 'cross', 'raw', 'separate']):
        category = 'contamination'
    elif any(term in violations_lower for term in ['clean', 'sanit', 'wash', 'surface']):
        category = 'sanitation'

    # Closure detection (also drives the severity score below).
    is_closure = any(term in violations_lower
                     for term in ['ordered closed', 'cease operation', 'shut down', 'closure'])

    # Severity on a 1-10 scale, starting from a base of 3.
    severity = 3
    if 'critical' in violations_lower or 'priority' in violations_lower:
        severity += 3
    if 'repeat' in violations_lower:
        severity += 2
    if 'corrected' in violations_lower:
        severity -= 1
    # Bug fix: closures indicated by phrases such as "ordered closed" or
    # "cease operation" previously did NOT force severity to 10 (only the
    # literal word "closure" did), leaving closed facilities at base severity.
    # Any closure indicator is now authoritative and pins severity at the max.
    if is_closure or 'cease' in violations_lower or 'shut' in violations_lower:
        severity = 10

    severity = min(10, max(1, severity))

    # Extract snippets following critical/priority markers, up to the next '.'.
    # The text is already lowercased, so case-sensitive matching suffices.
    critical = []
    critical_patterns = [
        r'priority violation[s]?[:\-\s]+([^\.]+)',
        r'critical violation[s]?[:\-\s]+([^\.]+)',
        r'priority foundation[:\-\s]+([^\.]+)'
    ]
    for pattern in critical_patterns:
        critical.extend(re.findall(pattern, violations_lower))

    # Map trigger keywords to human-readable food-safety concern labels.
    concerns = []
    concern_mapping = {
        'temperature': ['Temperature abuse', 'Time-temperature control'],
        'rodent': ['Rodent activity', 'Pest infestation'],
        'hand': ['Poor hand hygiene', 'Handwashing violations'],
        'contamination': ['Cross-contamination risk', 'Food contamination'],
        'expired': ['Expired food', 'Date marking violations'],
        'mold': ['Mold growth', 'Spoiled food'],
    }
    for key, concern_list in concern_mapping.items():
        if key in violations_lower:
            concerns.extend(concern_list)

    return {
        'violation_category': category,
        'violation_severity': severity,
        'critical_violations': critical[:5],  # Limit to 5
        'is_closure': is_closure,
        'food_safety_concerns': concerns[:5]  # Limit to 5
    }

def _insert_records(collection, records, counters):
    """Insert each record into the collection, updating counters in place.

    Args:
        collection: Weaviate collection handle.
        records: List of property dicts to insert.
        counters: Mutable dict with 'inserted' and 'errors' counts.
    """
    for record in records:
        try:
            collection.data.insert(properties=record)
            counters['inserted'] += 1
        except Exception as e:
            counters['errors'] += 1
            if counters['errors'] <= 5:  # Only show first 5 errors
                print(f"Insert error: {e}")


def _build_record(row):
    """Build a ChicagoTemp property dict from one CSV row, enriched with
    inferred metadata (cuisine, service style, chain, neighborhood, violations).

    Args:
        row: csv.DictReader row for one inspection.

    Returns:
        dict of properties matching the ChicagoTemp schema.
    """
    # Extract base fields
    facility_name = row.get('DBA Name', '')
    facility_type = row.get('Facility Type', '')
    address = row.get('Address', '')
    zip_code = row.get('Zip', '')
    violations_text = row.get('Violations', '')
    results = row.get('Results', '')

    # Infer metadata
    cuisine_type = infer_cuisine_type(facility_name)
    service_style = infer_service_style(facility_name, facility_type)
    is_chain, chain_name = detect_chain(facility_name)
    # Bug fix: the price range was previously computed only for chains,
    # discarding get_price_range()'s service-style fallback and leaving
    # every independent restaurant with an empty price range.
    price_range = get_price_range(chain_name, service_style)
    neighborhood = get_chicago_neighborhood(address, zip_code)

    # Analyze violations
    violation_analysis = analyze_violations(violations_text)

    # Count violations (Chicago's CSV separates them with "|")
    violation_count = len(violations_text.split('|')) if violations_text else 0

    # Flatten the raw row into "key:value" pairs; ':' in values becomes ';'
    # so the pair delimiter stays unambiguous.
    raw_data_pairs = []
    for key, value in row.items():
        if value and value.strip():
            clean_key = key.replace(' ', '_').replace('#', 'Number')
            clean_value = value.replace(':', ';')
            raw_data_pairs.append(f"{clean_key}:{clean_value}")
    raw_data = ", ".join(raw_data_pairs)

    return {
        # Core CSV fields
        'inspection_id': row.get('Inspection ID', ''),
        'facility_name': facility_name,
        'aka_name': row.get('AKA Name', ''),
        'license_number': row.get('License #', ''),
        'facility_type': facility_type,
        'risk_level': row.get('Risk', ''),
        'address': address,
        'city': row.get('City', ''),
        'state': row.get('State', ''),
        'zip_code': zip_code,
        'inspection_date': row.get('Inspection Date', ''),
        'inspection_type': row.get('Inspection Type', ''),
        'results': results,
        'violations': violations_text,
        'latitude': row.get('Latitude', ''),
        'longitude': row.get('Longitude', ''),

        # Claude-inferable metadata
        'cuisine_type': cuisine_type,
        'service_style': service_style,
        'price_range': price_range,
        'is_chain': is_chain,
        'chain_name': chain_name or '',

        # Violation analysis
        'violation_category': violation_analysis['violation_category'] or '',
        'violation_severity': violation_analysis['violation_severity'],
        'critical_violations': violation_analysis['critical_violations'],
        'violation_count': violation_count,
        'is_repeat_offender': False,  # Will be determined in consolidation
        'is_closure': violation_analysis['is_closure'],

        # Geographic knowledge
        'neighborhood': neighborhood,
        'neighborhood_type': '',  # Will be enriched by Claude
        'nearby_landmarks': [],  # Will be enriched by Claude
        'nearby_transit': [],  # Will be enriched by Claude
        'nearby_universities': [],  # Will be enriched by Claude
        'district_characteristics': '',  # Will be enriched by Claude

        # Food safety knowledge (will be enriched by Claude)
        'risk_factors': [],
        'typical_violations': [],
        'remediation_required': '',
        'food_safety_concerns': violation_analysis['food_safety_concerns'],

        # Historical knowledge (will be enriched by Claude)
        'chain_history': '',
        'historical_context': '',

        # Processing metadata
        'raw_data': raw_data,
        'status': 'pending',
        'source_city': 'chicago',
        'is_combined': False,
        'combined_from_rows': 0,
        'claude_response': '',
        'error_log': '',

        # Timestamps
        'created_at': datetime.now().isoformat(),
        'processed_at': '',
        'published_date': '',

        # Content fields (empty until Claude processing)
        'title': '',
        'slug': '',
        'content': '',
        'excerpt': '',
        'meta_description': '',
        'tags': [],
        'image_url': '',
        'image_alt': '',

        # SEO/Schema
        'schema_json': '',
        'faq_json': '',
        'article_type': '',
    }


def extract_csv_to_temp(client):
    """Extract all CSV rows into the ChicagoTemp collection (Stage 1 bulk load).

    Args:
        client: Connected Weaviate client.

    Returns:
        Number of records successfully inserted.
    """
    csv_path = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/chicago/chicago_food_inspections.csv"

    # Count total rows for the progress bar (minus the header line).
    print("Counting CSV rows...")
    with open(csv_path, 'r', encoding='utf-8', errors='ignore') as f:
        total_rows = sum(1 for line in f) - 1  # Subtract header

    print(f"Found {total_rows:,} records to process")

    # Get collection
    collection = client.collections.get("ChicagoTemp")

    # Process CSV
    print("Extracting CSV data with enhanced metadata...")
    counters = {'inserted': 0, 'errors': 0}

    with open(csv_path, 'r', encoding='utf-8', errors='ignore') as csvfile:
        reader = csv.DictReader(csvfile)

        batch = []
        batch_size = 100

        with tqdm(total=total_rows, desc="Inserting") as pbar:
            for row in reader:
                try:
                    batch.append(_build_record(row))

                    # Flush full batches; the progress bar advances per flush.
                    if len(batch) >= batch_size:
                        _insert_records(collection, batch, counters)
                        pbar.update(len(batch))
                        batch = []

                except Exception as e:
                    counters['errors'] += 1
                    if counters['errors'] <= 5:
                        print(f"Processing error: {e}")
                    pbar.update(1)
                    continue

            # Insert remaining records
            if batch:
                _insert_records(collection, batch, counters)
                pbar.update(len(batch))

    print(f"\n✅ Extraction complete:")
    print(f"   Inserted: {counters['inserted']:,} records")
    print(f"   Errors: {counters['errors']:,} records")

    return counters['inserted']

def verify_data(client):
    """Sanity-check the ChicagoTemp collection: counts, sample metadata,
    duplicate facility/date combinations, and metadata variety.

    Args:
        client: Connected Weaviate client.

    Returns:
        Total record count in ChicagoTemp.
    """
    print("\nVerifying data in ChicagoTemp...")

    collection = client.collections.get("ChicagoTemp")

    # Overall record count.
    total_count = collection.aggregate.over_all().total_count
    print(f"Total records in ChicagoTemp: {total_count:,}")

    # Spot-check a few records for the inferred metadata fields.
    print("\nSample records with enhanced metadata:")
    for idx, record in enumerate(collection.query.fetch_objects(limit=3).objects, 1):
        fields = record.properties
        print(f"\nRecord {idx}:")
        print(f"  Facility: {fields.get('facility_name', 'N/A')}")
        print(f"  Cuisine Type: {fields.get('cuisine_type', 'N/A')}")
        print(f"  Service Style: {fields.get('service_style', 'N/A')}")
        print(f"  Is Chain: {fields.get('is_chain', False)}")
        print(f"  Chain Name: {fields.get('chain_name', 'N/A')}")
        print(f"  Neighborhood: {fields.get('neighborhood', 'N/A')}")
        print(f"  Violation Category: {fields.get('violation_category', 'N/A')}")
        print(f"  Violation Severity: {fields.get('violation_severity', 0)}")
        print(f"  Is Closure: {fields.get('is_closure', False)}")
        print(f"  Date: {fields.get('inspection_date', 'N/A')}")
        print(f"  Result: {fields.get('results', 'N/A')}")

    print("\nChecking for inspections with multiple violations...")
    print("(These will be consolidated in the next step)")

    # Analyze a 1000-record sample for duplicates and metadata variety.
    analysis_page = collection.query.fetch_objects(limit=1000)

    per_facility_date = {}
    chain_names = set()
    cuisine_labels = set()
    hood_names = set()

    for record in analysis_page.objects:
        fields = record.properties

        # Tally facility+date combinations to spot rows needing consolidation.
        combo = f"{fields.get('facility_name', '')}|{fields.get('inspection_date', '')}"
        per_facility_date[combo] = per_facility_date.get(combo, 0) + 1

        # Collect metadata variety.
        if fields.get('is_chain'):
            chain_names.add(fields.get('chain_name'))
        if fields.get('cuisine_type'):
            cuisine_labels.add(fields.get('cuisine_type'))
        if fields.get('neighborhood'):
            hood_names.add(fields.get('neighborhood'))

    duplicates = {combo: n for combo, n in per_facility_date.items() if n > 1}

    if duplicates:
        print(f"Found {len(duplicates)} facilities with multiple records (sample of 1000)")
        # Show first 3 examples
        for combo, n in list(duplicates.items())[:3]:
            facility, date = combo.split('|')
            print(f"  - {facility} on {date}: {n} records")

    print(f"\nMetadata variety found in sample:")
    print(f"  Chains detected: {len(chain_names)} unique")
    if chain_names:
        print(f"    Examples: {', '.join(list(chain_names)[:5])}")
    print(f"  Cuisine types: {len(cuisine_labels)} unique")
    if cuisine_labels:
        print(f"    Examples: {', '.join(list(cuisine_labels)[:5])}")
    print(f"  Neighborhoods: {len(hood_names)} unique")
    if hood_names:
        print(f"    Examples: {', '.join(list(hood_names)[:5])}")

    return total_count

def main():
    """Run the full extraction pipeline: connect, bulk-load the CSV, verify."""
    print("="*60)
    print("CHICAGO DATA EXTRACTION V2 - ENHANCED METADATA")
    print("="*60)

    # Connect to the local Weaviate instance.
    print("Connecting to Weaviate...")
    client = weaviate.connect_to_local(host="localhost", port=8080)

    try:
        # Stage 1: bulk-insert all CSV rows with inferred metadata.
        extract_csv_to_temp(client)

        # Confirm what landed in the collection.
        total = verify_data(client)

        print("\n" + "="*60)
        print("✅ EXTRACTION COMPLETE")
        print(f"   Total records in ChicagoTemp: {total:,}")
        print("   Enhanced metadata added:")
        print("     - Cuisine type inference")
        print("     - Service style detection")
        print("     - Chain recognition")
        print("     - Neighborhood mapping")
        print("     - Violation analysis")
        print("   Ready for consolidation step")
        print("="*60)

    except Exception as e:
        print(f"❌ Error: {e}")
        import traceback
        traceback.print_exc()

    finally:
        # Always release the client connection, even on failure.
        client.close()

# Run the extraction pipeline only when executed as a script (not on import).
if __name__ == "__main__":
    main()