<?php
/**
 * Process SF Inspection Data into Database
 * Extracts all fields from PDFs and JSON, handles duplicates
 */

error_reporting(E_ALL);
ini_set('display_errors', 1);

// Database configuration
$db_host = 'localhost';
$db_name = 'cleankitchens';
$db_user = 'your_db_user';  // Update this
$db_pass = 'your_db_pass';  // Update this

// Connect to database. ERRMODE_EXCEPTION makes every later query failure
// surface as a PDOException instead of a false return value.
try {
    $pdo = new PDO("mysql:host=$db_host;dbname=$db_name;charset=utf8mb4", $db_user, $db_pass);
    $pdo->setAttribute(PDO::ATTR_ERRMODE, PDO::ERRMODE_EXCEPTION);
} catch (PDOException $e) {
    if (php_sapi_name() === 'cli') {
        // Batch runs: report on stderr and exit non-zero so the failure is
        // visible to whatever invoked the script.
        fwrite(STDERR, "Database connection failed: " . $e->getMessage() . "\n");
        exit(1);
    }

    // Web requests: the driver message can contain the host and user name,
    // so it goes to the error log and the client only gets a generic text.
    error_log("Database connection failed: " . $e->getMessage());
    if (!headers_sent()) {
        http_response_code(500);
    }
    die("Database connection failed");
}

// Create sf_temp table if not exists
$create_table_sql = "
CREATE TABLE IF NOT EXISTS sf_temp (
    id INT AUTO_INCREMENT PRIMARY KEY,
    inspection_id VARCHAR(50) UNIQUE NOT NULL,
    facility_name VARCHAR(255),
    address VARCHAR(500),
    inspection_date DATE,
    inspection_time TIME,
    inspector_name VARCHAR(255),
    inspector_email VARCHAR(255),
    inspector_phone VARCHAR(50),
    permit_expiration DATE,
    owner_name VARCHAR(255),
    certified_manager VARCHAR(255),
    manager_cert_expiration DATE,
    inspection_type VARCHAR(100),
    inspection_status VARCHAR(50),  -- PASS, CONDITIONAL PASS, CLOSURE
    violation_count INT DEFAULT 0,
    violations JSON,  -- Store violations as JSON
    corrective_actions TEXT,
    observations TEXT,
    score INT,
    pdf_text LONGTEXT,
    pdf_filename VARCHAR(255),
    pdf_url VARCHAR(500),
    page_text TEXT,
    collected_at TIMESTAMP,
    processed_at TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    data_hash VARCHAR(64),  -- MD5 hash for duplicate detection
    raw_data JSON,  -- Store complete raw data
    
    INDEX idx_inspection_date (inspection_date),
    INDEX idx_facility_name (facility_name),
    INDEX idx_data_hash (data_hash),
    INDEX idx_inspection_status (inspection_status)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci;
";

$pdo->exec($create_table_sql);

// Only print the status line for command-line runs. On a web request this
// text would be sent ahead of the JSON response and its Content-Type header.
if (php_sapi_name() === 'cli') {
    echo "Table sf_temp created/verified.\n";
}

/**
 * Extract structured data from PDF text
 *
 * Runs a set of label-based regular expressions over the text of an
 * inspection report and returns whichever fields were found.
 *
 * @param string $pdf_text Plain text extracted from the inspection PDF.
 * @return array Associative array. 'inspection_status', 'violations' and
 *               'violation_count' are always present; inspection_date,
 *               inspection_time, facility_name, address, inspector_name,
 *               inspector_email, inspector_phone, owner_name,
 *               certified_manager, inspection_type, corrective_actions,
 *               observations and score are set only when their pattern
 *               matches.
 */
function extractFromPdfText($pdf_text) {
    $data = [];
    
    // Inspection Date (MM/DD/YYYY in the report, stored as Y-m-d)
    if (preg_match('/Inspection Date\s*(\d{2}\/\d{2}\/\d{4})/i', $pdf_text, $matches)) {
        $timestamp = strtotime($matches[1]);
        // An unparsable date (e.g. month 13) must not be stored as 1970-01-01
        if ($timestamp !== false) {
            $data['inspection_date'] = date('Y-m-d', $timestamp);
        }
    }
    
    // Inspection Time (12-hour clock in the report, stored as H:i:s)
    if (preg_match('/Inspection Time\s*(\d{1,2}:\d{2}\s*[AP]M)/i', $pdf_text, $matches)) {
        $timestamp = strtotime($matches[1]);
        if ($timestamp !== false) {
            $data['inspection_time'] = date('H:i:s', $timestamp);
        }
    }
    
    // Facility Name
    if (preg_match('/Permit Name\s*([^\n]+)/i', $pdf_text, $matches)) {
        $data['facility_name'] = trim($matches[1]);
    }
    
    // Address
    if (preg_match('/Address\s*(\d+[^,\n]+(?:,\s*[^,\n]+)*,\s*SAN FRANCISCO[^,\n]*CA\s*\d{5})/i', $pdf_text, $matches)) {
        $data['address'] = trim($matches[1]);
    }
    
    // Inspector
    if (preg_match('/Inspector\s+([A-Za-z\s]+?)(?:Inspector Phone|Inspector Email)/i', $pdf_text, $matches)) {
        $data['inspector_name'] = trim($matches[1]);
    }
    if (preg_match('/Inspector Email\s*([\w\.\-]+@[\w\.\-]+)/i', $pdf_text, $matches)) {
        $data['inspector_email'] = trim($matches[1]);
    }
    if (preg_match('/Inspector Phone\s*\(?([\d\-\s\(\)]+)\)?/i', $pdf_text, $matches)) {
        $data['inspector_phone'] = trim($matches[1]);
    }
    
    // Owner
    if (preg_match('/Owner\s*([^\n]+?)(?:Owner|PIC Email|Certified)/i', $pdf_text, $matches)) {
        $data['owner_name'] = trim($matches[1]);
    }
    
    // Certified Manager
    if (preg_match('/Certified Food Manager\s*([^\n]+?)(?:Certification|Owner)/i', $pdf_text, $matches)) {
        $data['certified_manager'] = trim($matches[1]);
    }
    
    // Inspection Type
    if (preg_match('/Purpose of Inspection\s*([^\n]+)/i', $pdf_text, $matches)) {
        $data['inspection_type'] = trim($matches[1]);
    }
    
    // Status - Check for PASS/CONDITIONAL PASS/CLOSURE.
    // CONDITIONAL PASS is tested first because it also contains "PASS".
    // \s+ lets the two words be separated by a space or by a line break.
    $data['inspection_status'] = 'UNKNOWN';
    if (preg_match('/CONDITIONAL\s+PASS/i', $pdf_text)) {
        $data['inspection_status'] = 'CONDITIONAL PASS';
    } elseif (stripos($pdf_text, 'CLOSURE') !== false) {
        $data['inspection_status'] = 'CLOSURE';
    } elseif (stripos($pdf_text, 'PASS') !== false) {
        $data['inspection_status'] = 'PASS';
    }
    
    // Extract violations: "<code> - <UPPERCASE DESCRIPTION>".
    // The terminator is a lookahead so that the "NN -" prefix of the next
    // violation is not consumed and can start the next match.
    $violations = [];
    $pattern = '/(\d{1,2})\s*-\s*([A-Z][A-Z\s,:;&]+?)(?=Corrective Action:|Observation:|California Retail Food Code:|\d{1,2}\s*-|$)/s';
    if (preg_match_all($pattern, $pdf_text, $matches, PREG_SET_ORDER)) {
        foreach ($matches as $match) {
            $violations[] = [
                'code' => $match[1],
                // Collapse line breaks and runs of spaces inside the description
                'description' => trim(preg_replace('/\s+/', ' ', $match[2]))
            ];
        }
    }
    
    $data['violations'] = $violations;
    $data['violation_count'] = count($violations);
    
    // Extract corrective actions (all occurrences joined with " | ")
    if (preg_match_all('/Corrective Action:\s*([^:]+?)(?:Observation:|California Retail Food Code:|$)/s', $pdf_text, $matches)) {
        $data['corrective_actions'] = implode(' | ', array_map('trim', $matches[1]));
    }
    
    // Extract observations (all occurrences joined with " | ")
    if (preg_match_all('/Observation:\s*([^:]+?)(?:Corrective Action:|California Retail Food Code:|$)/s', $pdf_text, $matches)) {
        $data['observations'] = implode(' | ', array_map('trim', $matches[1]));
    }
    
    // Score (if present)
    if (preg_match('/Score[:\s]+(\d+)/i', $pdf_text, $matches)) {
        $data['score'] = intval($matches[1]);
    }
    
    return $data;
}

/**
 * Process inspection data and insert into database
 *
 * Upserts one inspection into sf_temp, keyed by inspection_id. An MD5 hash of
 * the inspection id, inspection date, facility name and PDF text length is
 * compared with the stored row so that unchanged records are skipped.
 * Progress messages are echoed as the record is handled.
 *
 * The duplicate lookup runs outside the try/catch, so a PDOException thrown
 * by that query propagates to the caller.
 *
 * @param PDO         $pdo             Connection used for the lookup and the upsert.
 * @param array       $inspection_data Inspection fields; must contain 'inspection_id'.
 * @param string|null $pdf_text        Text of the inspection PDF, if available.
 * @return bool True when a row was inserted or updated; false when the
 *              inspection ID is missing, the stored hash is identical, or
 *              the upsert throws a PDOException.
 */
function processInspection($pdo, $inspection_data, $pdf_text = null) {
    // Extract inspection ID
    $inspection_id = $inspection_data['inspection_id'] ?? null;
    if (!$inspection_id) {
        echo "Warning: No inspection ID found\n";
        return false;
    }
    
    // Create data hash for duplicate detection (based on key fields)
    $hash_data = [
        'inspection_id' => $inspection_id,
        'inspection_date' => $inspection_data['inspection_date'] ?? '',
        'facility_name' => $inspection_data['facility_name'] ?? '',
        'pdf_text_length' => strlen($pdf_text ?? '')
    ];
    $data_hash = md5(json_encode($hash_data));
    
    // Check for duplicate
    $check_sql = "SELECT id, data_hash FROM sf_temp WHERE inspection_id = :inspection_id";
    $stmt = $pdo->prepare($check_sql);
    $stmt->execute(['inspection_id' => $inspection_id]);
    $existing = $stmt->fetch(PDO::FETCH_ASSOC);
    
    if ($existing) {
        if ($existing['data_hash'] === $data_hash) {
            echo "Skipping duplicate: $inspection_id (identical data)\n";
            return false;
        } else {
            echo "Updating existing record: $inspection_id (data changed)\n";
            // Continue to update the record
        }
    } else {
        echo "Processing new inspection: $inspection_id\n";
    }
    
    // Extract structured data from PDF if available
    $extracted_data = [];
    if ($pdf_text) {
        $extracted_data = extractFromPdfText($pdf_text);
    }
    
    // Merge extracted data with provided data (extracted takes precedence)
    $final_data = array_merge($inspection_data, $extracted_data);
    
    // page_text is capped at 10,000 bytes. The cap is applied on a UTF-8
    // character boundary: a plain byte-wise substr() can stop in the middle
    // of a multi-byte character and leave an invalid sequence at the end of
    // the value, which a utf8mb4 column will not store correctly.
    $page_text_limit = 10000;
    $page_text = $final_data['inspection_page_text'] ?? '';
    if (is_string($page_text) && strlen($page_text) > $page_text_limit) {
        $cut = $page_text_limit;
        // Bytes of the form 10xxxxxx are continuation bytes. If the first
        // dropped byte is one, the cut falls inside a character, so move it
        // back to that character's lead byte (never more than 3 bytes away).
        while ($cut > $page_text_limit - 3 && (ord($page_text[$cut]) & 0xC0) === 0x80) {
            $cut--;
        }
        $page_text = substr($page_text, 0, $cut);
    } else {
        $page_text = substr($page_text, 0, $page_text_limit);
    }
    
    // Prepare insert/update SQL. inspection_id is UNIQUE, so an existing row
    // is overwritten column by column and processed_at is refreshed.
    $sql = "INSERT INTO sf_temp (
        inspection_id,
        facility_name,
        address,
        inspection_date,
        inspection_time,
        inspector_name,
        inspector_email,
        inspector_phone,
        permit_expiration,
        owner_name,
        certified_manager,
        manager_cert_expiration,
        inspection_type,
        inspection_status,
        violation_count,
        violations,
        corrective_actions,
        observations,
        score,
        pdf_text,
        pdf_filename,
        pdf_url,
        page_text,
        collected_at,
        data_hash,
        raw_data
    ) VALUES (
        :inspection_id,
        :facility_name,
        :address,
        :inspection_date,
        :inspection_time,
        :inspector_name,
        :inspector_email,
        :inspector_phone,
        :permit_expiration,
        :owner_name,
        :certified_manager,
        :manager_cert_expiration,
        :inspection_type,
        :inspection_status,
        :violation_count,
        :violations,
        :corrective_actions,
        :observations,
        :score,
        :pdf_text,
        :pdf_filename,
        :pdf_url,
        :page_text,
        :collected_at,
        :data_hash,
        :raw_data
    ) ON DUPLICATE KEY UPDATE
        facility_name = VALUES(facility_name),
        address = VALUES(address),
        inspection_date = VALUES(inspection_date),
        inspection_time = VALUES(inspection_time),
        inspector_name = VALUES(inspector_name),
        inspector_email = VALUES(inspector_email),
        inspector_phone = VALUES(inspector_phone),
        permit_expiration = VALUES(permit_expiration),
        owner_name = VALUES(owner_name),
        certified_manager = VALUES(certified_manager),
        manager_cert_expiration = VALUES(manager_cert_expiration),
        inspection_type = VALUES(inspection_type),
        inspection_status = VALUES(inspection_status),
        violation_count = VALUES(violation_count),
        violations = VALUES(violations),
        corrective_actions = VALUES(corrective_actions),
        observations = VALUES(observations),
        score = VALUES(score),
        pdf_text = VALUES(pdf_text),
        pdf_filename = VALUES(pdf_filename),
        pdf_url = VALUES(pdf_url),
        page_text = VALUES(page_text),
        collected_at = VALUES(collected_at),
        data_hash = VALUES(data_hash),
        raw_data = VALUES(raw_data),
        processed_at = NOW()";
    
    // Fields missing from both the payload and the PDF are stored as NULL.
    // raw_data keeps the payload exactly as received, before the merge.
    $params = [
        'inspection_id' => $inspection_id,
        'facility_name' => $final_data['facility_name'] ?? null,
        'address' => $final_data['address'] ?? null,
        'inspection_date' => $final_data['inspection_date'] ?? null,
        'inspection_time' => $final_data['inspection_time'] ?? null,
        'inspector_name' => $final_data['inspector_name'] ?? null,
        'inspector_email' => $final_data['inspector_email'] ?? null,
        'inspector_phone' => $final_data['inspector_phone'] ?? null,
        'permit_expiration' => $final_data['permit_expiration'] ?? null,
        'owner_name' => $final_data['owner_name'] ?? null,
        'certified_manager' => $final_data['certified_manager'] ?? null,
        'manager_cert_expiration' => $final_data['manager_cert_expiration'] ?? null,
        'inspection_type' => $final_data['inspection_type'] ?? null,
        'inspection_status' => $final_data['inspection_status'] ?? null,
        'violation_count' => $final_data['violation_count'] ?? 0,
        'violations' => json_encode($final_data['violations'] ?? []),
        'corrective_actions' => $final_data['corrective_actions'] ?? null,
        'observations' => $final_data['observations'] ?? null,
        'score' => $final_data['score'] ?? null,
        'pdf_text' => $pdf_text,
        'pdf_filename' => $final_data['pdf_filename'] ?? null,
        'pdf_url' => $final_data['pdf_url'] ?? null,
        'page_text' => $page_text,
        'collected_at' => $final_data['collected_at'] ?? date('Y-m-d H:i:s'),
        'data_hash' => $data_hash,
        'raw_data' => json_encode($inspection_data)
    ];
    
    try {
        $stmt = $pdo->prepare($sql);
        $stmt->execute($params);
        
        if ($existing) {
            echo "  Updated successfully\n";
        } else {
            echo "  Inserted successfully\n";
        }
        return true;
    } catch (PDOException $e) {
        echo "  Database error: " . $e->getMessage() . "\n";
        return false;
    }
}

// Process incoming data from API.
// REQUEST_METHOD is not set when the script runs from the command line, so
// it is read with a default instead of indexing $_SERVER directly.
if (($_SERVER['REQUEST_METHOD'] ?? '') === 'POST') {
    $input = file_get_contents('php://input');
    $data = json_decode($input, true);

    // The inspection may be nested under 'inspection_data' or be the body itself
    $inspection_data = is_array($data) ? ($data['inspection_data'] ?? $data) : null;

    if (!is_array($inspection_data) || !$inspection_data) {
        // Unparsable, empty or non-object bodies get an explicit error
        // instead of an empty response.
        $status = 400;
        $response = [
            'success' => false,
            'error' => 'Request body must be a non-empty JSON object describing an inspection'
        ];
    } else {
        $pdf_text = null;

        // If PDF is provided as base64
        if (!empty($data['pdf_base64'])) {
            // Text extraction from an uploaded PDF is not implemented here;
            // only the filename is recorded.
            $inspection_data['pdf_filename'] = $data['pdf_filename'] ?? null;
        }

        // processInspection() echoes progress lines. Capture them so they do
        // not end up in front of the JSON document, and return them in the
        // response instead.
        ob_start();
        try {
            $result = processInspection($pdo, $inspection_data, $pdf_text);
        } finally {
            $messages = trim((string) ob_get_clean());
        }

        $status = 200;
        $response = ['success' => $result];
        if ($messages !== '') {
            $response['messages'] = $messages;
        }
    }

    // Headers can only be set while no output has been sent yet
    if (!headers_sent()) {
        http_response_code($status);
        header('Content-Type: application/json');
    }
    echo json_encode($response);
}

// Command line processing for batch import.
// Expects <data_directory>/json/*.json with optional matching
// <data_directory>/pdfs/<inspection_id>.pdf files.
if (php_sapi_name() === 'cli') {
    $data_dir = $argv[1] ?? './inspection_data';
    
    if (!is_dir($data_dir)) {
        // Usage errors go to stderr with a non-zero exit status so that a
        // calling script can detect the failure.
        fwrite(STDERR, "Usage: php process_sf_data.php <data_directory>\n");
        exit(1);
    }
    
    echo "Processing data from: $data_dir\n";
    echo "=" . str_repeat("=", 50) . "\n";
    
    $json_dir = $data_dir . '/json';
    $pdf_dir = $data_dir . '/pdfs';
    
    $processed = 0;
    $skipped = 0;
    $errors = 0;
    
    // Python program that prints the text of the PDF named by its first
    // argument. The path is passed as an argument rather than pasted into
    // the Python source, so quotes or shell metacharacters in a file name
    // cannot break (or inject into) the command.
    $pdf_extract_script = implode("\n", [
        'import sys',
        'import PyPDF2',
        "with open(sys.argv[1], 'rb') as f:",
        '    reader = PyPDF2.PdfReader(f)',
        "    text = ''",
        '    for page in reader.pages:',
        '        text += page.extract_text()',
        '    print(text)',
    ]);
    $pdf_extract_cmd = '/home/chris/cleankitchens-env/bin/python3 -c ' . escapeshellarg($pdf_extract_script);
    
    // Process JSON files
    if (is_dir($json_dir)) {
        // glob() returns false on error; treat that as "no files"
        $json_files = glob($json_dir . '/*.json') ?: [];
        
        foreach ($json_files as $json_file) {
            $raw_json = file_get_contents($json_file);
            $inspection_data = $raw_json === false ? null : json_decode($raw_json, true);
            
            if (!is_array($inspection_data) || !$inspection_data) {
                echo "Error reading: $json_file\n";
                $errors++;
                continue;
            }
            
            // Check for corresponding PDF (falls back to the JSON file name
            // when the record carries no inspection_id)
            $inspection_id = $inspection_data['inspection_id'] ?? pathinfo($json_file, PATHINFO_FILENAME);
            $pdf_file = $pdf_dir . '/' . $inspection_id . '.pdf';
            $pdf_text = null;
            
            if (file_exists($pdf_file)) {
                // Extract text from PDF using Python script; stderr is discarded
                $cmd = $pdf_extract_cmd . ' ' . escapeshellarg($pdf_file) . ' 2>/dev/null';
                $output = shell_exec($cmd);
                
                if (is_string($output) && $output !== '') {
                    $pdf_text = $output;
                } else {
                    // shell_exec() gives null/false when the command fails
                    // or prints nothing; continue with the JSON data only.
                    echo "Warning: could not extract text from $pdf_file\n";
                }
            }
            
            // A false return covers both "unchanged duplicate" and
            // "database error"; both are counted as skipped here.
            if (processInspection($pdo, $inspection_data, $pdf_text)) {
                $processed++;
            } else {
                $skipped++;
            }
        }
    }
    
    echo "\n" . str_repeat("=", 50) . "\n";
    echo "Processing complete:\n";
    echo "  Processed: $processed\n";
    echo "  Skipped: $skipped\n";
    echo "  Errors: $errors\n";
    
    // Show summary of everything currently stored in sf_temp
    $stmt = $pdo->query("SELECT 
        COUNT(*) as total,
        COUNT(DISTINCT facility_name) as unique_facilities,
        MIN(inspection_date) as earliest_date,
        MAX(inspection_date) as latest_date,
        SUM(CASE WHEN inspection_status = 'PASS' THEN 1 ELSE 0 END) as pass_count,
        SUM(CASE WHEN inspection_status = 'CONDITIONAL PASS' THEN 1 ELSE 0 END) as conditional_count,
        SUM(CASE WHEN inspection_status = 'CLOSURE' THEN 1 ELSE 0 END) as closure_count
    FROM sf_temp");
    
    $summary = $stmt->fetch(PDO::FETCH_ASSOC);
    
    echo "\nDatabase Summary:\n";
    echo "  Total inspections: " . $summary['total'] . "\n";
    echo "  Unique facilities: " . $summary['unique_facilities'] . "\n";
    echo "  Date range: " . $summary['earliest_date'] . " to " . $summary['latest_date'] . "\n";
    echo "  Pass: " . $summary['pass_count'] . "\n";
    echo "  Conditional Pass: " . $summary['conditional_count'] . "\n";
    echo "  Closures: " . $summary['closure_count'] . "\n";
}
?>