#!/usr/bin/env python3
"""
Chicago Food Inspections API Importer
Fetches data from Chicago Open Data API and imports to CleanKitchens database

This script replaces the missing chicago_api_update.py referenced in cron jobs
Designed to run every 6 hours to fetch new inspection data
"""

import json
import requests
import mysql.connector
from datetime import datetime, timedelta
import logging
import sys
import time
import re
import os

# Configure logging
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler('/var/log/cleankitchens_chicago_api.log'),
        logging.StreamHandler(sys.stdout)
    ]
)
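# NOTE: the FileHandler path assumes the user running this script (e.g. the
# cron user) can write to /var/log; otherwise logging setup raises at startup.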
logger = logging.getLogger(__name__)

# Database configuration for CleanKitchens
DB_CONFIG = {
    'user': 'cleankitchens',
    # Allow the password to come from the environment (variable name chosen
    # here); the inline fallback preserves current behavior.
    'password': os.environ.get('CLEANKITCHENS_DB_PASSWORD', 'CleanK1tch3ns2025'),
    'host': 'localhost',
    'database': 'cleankitchens',
    'raise_on_warnings': False,
    'charset': 'utf8mb4',
    'autocommit': False
}

# Chicago Open Data API configuration
CHICAGO_API_BASE = "https://data.cityofchicago.org/resource/4ijn-s7e5.json"
RATE_LIMIT_DELAY = 1  # seconds between API calls
MAX_RETRIES = 3
BATCH_SIZE = 1000
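
# Illustrative shape of a single API record (field names are the ones the
# handlers below consume; the values here are made up):
#   {
#     "inspection_id": "2600000",
#     "dba_name": "SAMPLE GRILL",
#     "license_": "1234567",
#     "facility_type": "Restaurant",
#     "address": "123 W MADISON ST",
#     "zip": "60602",
#     "inspection_date": "2025-12-04T00:00:00.000",
#     "inspection_type": "Canvass",
#     "results": "Pass",
#     "violations": "...",
#     "latitude": "41.88",
#     "longitude": "-87.63"
#   }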

class ChicagoAPIImporter:
    def __init__(self):
        self.conn = None
        self.cursor = None
        self.stats = {
            'api_calls': 0,
            'new_inspections': 0,
            'updated_facilities': 0,
            'errors': 0,
            'duplicate_inspections': 0,
            'start_time': datetime.now()
        }

    def connect_db(self):
        """Connect to CleanKitchens database"""
        try:
            self.conn = mysql.connector.connect(**DB_CONFIG)
            self.cursor = self.conn.cursor(buffered=True)
            logger.info("Connected to CleanKitchens database")
            return True
        except Exception as e:
            logger.error(f"Database connection failed: {e}")
            return False

    def disconnect_db(self):
        """Close database connection"""
        if self.cursor:
            self.cursor.close()
        if self.conn:
            self.conn.close()
        logger.info("Database connection closed")

    def get_last_import_date(self):
        """Get the date of the most recent Chicago inspection in database"""
        try:
            query = """
            SELECT MAX(inspection_date)
            FROM inspections
            WHERE source_city = 'chicago' AND source_inspection_id IS NOT NULL
            """
            self.cursor.execute(query)
            result = self.cursor.fetchone()

            if result and result[0]:
                # Re-fetch from the last recorded date (inclusive): skipping
                # ahead a day would silently drop same-day inspections added
                # after the previous run, and the duplicate check in
                # import_inspection() already prevents re-importing records.
                last_date = result[0]
                logger.info(f"Last Chicago inspection date in DB: {last_date}, fetching from: {last_date}")
                return last_date
            else:
                # No Chicago data found, start from 30 days ago
                default_date = datetime.now().date() - timedelta(days=30)
                logger.info(f"No Chicago data found, starting from: {default_date}")
                return default_date

        except Exception as e:
            logger.error(f"Error getting last import date: {e}")
            return datetime.now().date() - timedelta(days=7)  # Default to 7 days ago

    def fetch_api_data(self, since_date, limit=BATCH_SIZE, offset=0):
        """Fetch a page of records from the Chicago API, filtered by date"""
        # Format date for the SoQL $where clause
        date_str = since_date.strftime('%Y-%m-%d')

        params = {
            '$where': f"inspection_date >= '{date_str}'",
            '$limit': limit,
            '$offset': offset,
            # Tie-break on inspection_id so offset pagination stays stable
            # if records are added while we page through the result set
            '$order': 'inspection_date DESC, inspection_id DESC'
        }

        for attempt in range(1, MAX_RETRIES + 1):
            try:
                logger.info(f"API call: offset={offset}, limit={limit}, since={date_str} (attempt {attempt})")

                response = requests.get(CHICAGO_API_BASE, params=params, timeout=30)
                response.raise_for_status()

                self.stats['api_calls'] += 1
                time.sleep(RATE_LIMIT_DELAY)  # Rate limiting

                data = response.json()
                logger.info(f"API returned {len(data)} records")
                return data

            except Exception as e:
                logger.error(f"API fetch error (attempt {attempt}/{MAX_RETRIES}): {e}")
                self.stats['errors'] += 1
                if attempt < MAX_RETRIES:
                    time.sleep(RATE_LIMIT_DELAY * attempt)  # Back off before retrying

        return None

    def clean_string(self, value):
        """Clean string values for database insertion"""
        if not value:
            return None
        value = str(value).strip()
        if value.upper() in ['NULL', 'NONE', '']:
            return None
        return value[:255]  # Truncate to avoid field overflow

    def parse_date(self, date_str):
        """Parse an API timestamp into a date for MySQL"""
        if not date_str:
            return None
        try:
            # API returns ISO floating timestamps, e.g. "2025-12-04T00:00:00.000";
            # the first 10 characters are always the YYYY-MM-DD date portion.
            return datetime.fromisoformat(date_str[:10]).date()
        except (TypeError, ValueError) as e:
            logger.warning(f"Date parse error for '{date_str}': {e}")
            return None

    def generate_slug(self, text):
        """Generate URL-friendly slug from text"""
        if not text:
            return None
        # Remove special characters and convert to lowercase
        slug = re.sub(r'[^a-zA-Z0-9\s-]', '', text.lower())
        # Replace spaces and multiple hyphens with single hyphen
        slug = re.sub(r'[\s-]+', '-', slug)
        # Remove leading/trailing hyphens
        slug = slug.strip('-')
        return slug[:100]  # Limit length

    def find_or_create_facility(self, inspection_data):
        """Find existing facility or create new one"""
        try:
            license_num = self.clean_string(inspection_data.get('license_'))
            dba_name = self.clean_string(inspection_data.get('dba_name'))
            address = self.clean_string(inspection_data.get('address'))

            if not license_num:
                return None

            # Try to find existing facility by license number
            check_query = """
            SELECT id FROM facilities
            WHERE source_city = 'chicago' AND source_license_number = %s
            LIMIT 1
            """
            self.cursor.execute(check_query, (license_num,))
            result = self.cursor.fetchone()

            if result:
                return result[0]  # Return existing facility_id

            # Create new facility
            facility_data = {
                'name': dba_name or 'Unknown',
                'facility_type': self.clean_string(inspection_data.get('facility_type')) or 'restaurant',
                'address': address or '',
                'city': 'Chicago',
                'state': 'IL',
                'zip_code': self.clean_string(inspection_data.get('zip')),
                'latitude': self.parse_float(inspection_data.get('latitude')),
                'longitude': self.parse_float(inspection_data.get('longitude')),
                'source_city': 'chicago',
                'source_license_number': license_num
            }

            insert_query = """
            INSERT INTO facilities (
                name, facility_type, address, city, state, zip_code,
                latitude, longitude, source_city, source_license_number,
                created_at, updated_at
            ) VALUES (
                %(name)s, %(facility_type)s, %(address)s, %(city)s, %(state)s,
                %(zip_code)s, %(latitude)s, %(longitude)s, %(source_city)s,
                %(source_license_number)s, NOW(), NOW()
            )
            """

            self.cursor.execute(insert_query, facility_data)
            facility_id = self.cursor.lastrowid
            self.stats['updated_facilities'] += 1

            logger.info(f"Created new facility: {dba_name} (ID: {facility_id})")
            return facility_id

        except Exception as e:
            logger.error(f"Error creating facility: {e}")
            self.stats['errors'] += 1
            return None

    def parse_float(self, value):
        """Parse a string to float, returning None on bad input"""
        try:
            return float(value) if value else None
        except (TypeError, ValueError):
            return None

    def import_inspection(self, inspection_data):
        """Import single inspection record"""
        # Define inspection_id before the try block so the except handler
        # below can reference it even if extraction fails
        inspection_id = None
        try:
            inspection_id = self.clean_string(inspection_data.get('inspection_id'))
            if not inspection_id:
                return False

            # Check if inspection already exists
            check_query = """
            SELECT id FROM inspections
            WHERE source_city = 'chicago' AND source_inspection_id = %s
            """
            self.cursor.execute(check_query, (inspection_id,))
            if self.cursor.fetchone():
                self.stats['duplicate_inspections'] += 1
                return False  # Skip duplicate

            # Find or create facility
            facility_id = self.find_or_create_facility(inspection_data)
            if not facility_id:
                logger.warning(f"Could not create facility for inspection {inspection_id}")
                return False

            # Prepare inspection data
            inspection_date = self.parse_date(inspection_data.get('inspection_date'))
            if not inspection_date:
                logger.warning(f"Invalid inspection date for {inspection_id}")
                return False

            inspection_type = self.clean_string(inspection_data.get('inspection_type')) or 'Routine'
            result = self.clean_string(inspection_data.get('results')) or 'Unknown'
            violations = self.clean_string(inspection_data.get('violations'))

            # Map Chicago results to CleanKitchens format
            result_mapping = {
                'Pass': 'Pass',
                'Fail': 'Fail',
                'Pass w/ Conditions': 'Conditional',
                'Not Ready': 'Not Ready',
                'No Entry': 'No Entry',
                'Out of Business': 'Closed'
            }
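            # Any result value not in the mapping passes through unchanged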
            mapped_result = result_mapping.get(result, result)

            insert_data = {
                'facility_id': facility_id,
                'inspection_date': inspection_date,
                'inspection_type': inspection_type,
                'result': mapped_result,
                'comments': violations,
                'source_city': 'chicago',
                'source_inspection_id': inspection_id
            }

            insert_query = """
            INSERT INTO inspections (
                facility_id, inspection_date, inspection_type, result,
                comments, source_city, source_inspection_id,
                created_at, updated_at
            ) VALUES (
                %(facility_id)s, %(inspection_date)s, %(inspection_type)s,
                %(result)s, %(comments)s, %(source_city)s, %(source_inspection_id)s,
                NOW(), NOW()
            )
            """

            self.cursor.execute(insert_query, insert_data)
            self.stats['new_inspections'] += 1

            return True

        except Exception as e:
            logger.error(f"Error importing inspection {inspection_id}: {e}")
            self.stats['errors'] += 1
            return False

    def update_facility_stats(self):
        """Update facility statistics after import"""
        try:
            logger.info("Updating facility statistics...")

            update_query = """
            UPDATE facilities f
            SET
                total_inspections = (
                    SELECT COUNT(*) FROM inspections i
                    WHERE i.facility_id = f.id
                ),
                pass_count = (
                    SELECT COUNT(*) FROM inspections i
                    WHERE i.facility_id = f.id AND i.result = 'Pass'
                ),
                fail_count = (
                    SELECT COUNT(*) FROM inspections i
                    WHERE i.facility_id = f.id AND i.result = 'Fail'
                ),
                pass_rate = CASE
                    WHEN (SELECT COUNT(*) FROM inspections i WHERE i.facility_id = f.id) > 0
                    THEN (SELECT COUNT(*) FROM inspections i WHERE i.facility_id = f.id AND i.result = 'Pass') * 100.0 /
                         (SELECT COUNT(*) FROM inspections i WHERE i.facility_id = f.id)
                    ELSE 0
                END,
                updated_at = NOW()
            WHERE source_city = 'chicago'
            AND id IN (
                SELECT DISTINCT facility_id FROM inspections
                WHERE source_city = 'chicago' AND created_at >= CURDATE() - INTERVAL 1 DAY
            )
            """

            self.cursor.execute(update_query)
            updated_count = self.cursor.rowcount
            logger.info(f"Updated statistics for {updated_count} facilities")

        except Exception as e:
            logger.error(f"Error updating facility stats: {e}")
            self.stats['errors'] += 1

    def run_import(self):
        """Main import process"""
        logger.info("Starting Chicago API import...")

        if not self.connect_db():
            return False

        try:
            # Get starting date for import
            since_date = self.get_last_import_date()

            offset = 0
            total_processed = 0

            while True:
                # Fetch data from API
                data = self.fetch_api_data(since_date, BATCH_SIZE, offset)

                if data is None:
                    logger.error("Failed to fetch data from API")
                    break

                if len(data) == 0:
                    logger.info("No more data to process")
                    break

                # Process each inspection
                batch_imported = 0
                for inspection in data:
                    if self.import_inspection(inspection):
                        batch_imported += 1

                total_processed += len(data)

                # Commit batch
                self.conn.commit()
                logger.info(f"Batch complete: {batch_imported}/{len(data)} imported, offset={offset}")

                # A partial batch means we've reached the end of the result set
                if len(data) < BATCH_SIZE:
                    break

                offset += BATCH_SIZE

                # Safety limit to prevent runaway pagination
                if offset >= 10000:
                    logger.warning(f"Reached safety limit at offset {offset}; stopping pagination")
                    break

            # Update facility statistics
            self.update_facility_stats()
            self.conn.commit()

            # Log final statistics
            duration = datetime.now() - self.stats['start_time']
            logger.info("="*60)
            logger.info("CHICAGO API IMPORT COMPLETE")
            logger.info("="*60)
            logger.info(f"Duration: {duration}")
            logger.info(f"API calls made: {self.stats['api_calls']}")
            logger.info(f"Total records processed: {total_processed}")
            logger.info(f"New inspections imported: {self.stats['new_inspections']}")
            logger.info(f"Facilities updated: {self.stats['updated_facilities']}")
            logger.info(f"Duplicate inspections skipped: {self.stats['duplicate_inspections']}")
            logger.info(f"Errors encountered: {self.stats['errors']}")

            return True

        except Exception as e:
            logger.error(f"Import process failed: {e}")
            if self.conn:
                self.conn.rollback()
            return False

        finally:
            self.disconnect_db()

def main():
    """Main execution function"""
    importer = ChicagoAPIImporter()
    success = importer.run_import()

    if not success:
        logger.error("Import failed!")
        sys.exit(1)

    logger.info("Import completed successfully")
    sys.exit(0)

if __name__ == "__main__":
    main()