#!/usr/bin/env python3
"""
San Francisco Inspection Collector for Windows
Standalone script - Just run with Python on Windows
"""

import os
import sys
import json
import time
import base64
from datetime import datetime

# Third-party dependencies.
# Every third-party import (requests included) lives inside the guard so that
# a missing package triggers the one-time pip install below instead of
# crashing the script before the installer gets a chance to run.
try:
    import requests
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC
except ImportError:
    print("Missing required packages. Installing...")
    import importlib
    import subprocess
    subprocess.check_call([sys.executable, "-m", "pip", "install", "selenium", "requests"])
    # Packages installed after interpreter start-up can be invisible to the
    # import system until its finder caches are refreshed.
    importlib.invalidate_caches()
    import requests
    from selenium import webdriver
    from selenium.webdriver.common.by import By
    from selenium.webdriver.chrome.options import Options
    from selenium.webdriver.support.ui import WebDriverWait
    from selenium.webdriver.support import expected_conditions as EC

# ============================================================================
# CONFIGURATION - CHANGE THESE SETTINGS!
# ============================================================================

# Your server URL (change this to your actual server).
# The collector sends a GET request here as a connectivity check and POSTs
# every collected inspection (and its PDF, when one was downloaded) to the
# same endpoint.
SERVER_URL = "https://cleankitchens.org/data/api_receiver.php"

# API key sent with every upload in the "X-API-Key" request header
# (must match the key in api_receiver.php on your server).
API_KEY = "your-secret-api-key-change-this"

# Maximum number of inspections to collect. Any falsy value (None or 0)
# disables the limit, so everything that was found gets collected.
COLLECT_LIMIT = 10

# Run the browser in the background: when True, Chrome is started with
# "--headless=new". False is recommended - the site blocks headless browsers.
HEADLESS = False

# ============================================================================

class SFCollector:
    """Collect San Francisco inspection records through a Chrome browser.

    The collector opens the public inspection listing, visits each linked
    inspection page, extracts a few fields from the page text, tries to
    download the printable PDF, stores everything under a timestamped local
    directory and optionally uploads the result to ``SERVER_URL``.

    Behaviour is driven by the module-level settings ``SERVER_URL``,
    ``API_KEY``, ``COLLECT_LIMIT`` and ``HEADLESS``.
    """

    # Patterns applied case-insensitively to the visible text of an
    # inspection page; group 1 of the first match becomes the field value.
    # The trailing \b on "grade" keeps words such as "Excellent" or "Fair"
    # from being reported as the single-letter grades "E" / "F".
    _FIELD_PATTERNS = {
        'address': r'(?:Address|Location)[:\s]+([^\n]+)',
        'date': r'(?:Date|Inspection Date)[:\s]+(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
        'score': r'(?:Score|Points)[:\s]+(\d+)',
        'grade': r'(?:Grade|Rating)[:\s]+([A-F])\b',
    }

    def __init__(self):
        """Initialise state; no browser or directory is created yet."""
        self.base_url = "https://inspections.myhealthdepartment.com/san-francisco"
        self.driver = None  # Selenium WebDriver, created by setup()
        self.data_dir = "sf_data_" + datetime.now().strftime("%Y%m%d_%H%M%S")
        self.collected = []  # one dict per successfully collected inspection

    @staticmethod
    def _extract_inspection_id(href):
        """Return the ``inspectionID`` query value found in *href*, or None.

        The ID is a run of hexadecimal digits and dashes. Both upper- and
        lower-case hex digits are accepted so that a lower-case ID is neither
        skipped nor truncated at its first lower-case character.
        """
        import re

        match = re.search(r'inspectionID=([A-Fa-f0-9\-]+)', href or "")
        return match.group(1) if match else None

    @staticmethod
    def _extract_fields(text):
        """Return the fields from ``_FIELD_PATTERNS`` that occur in *text*.

        Args:
            text: Visible text of an inspection page.

        Returns:
            dict: Field name -> stripped captured value; fields without a
            match are omitted.
        """
        import re

        fields = {}
        for field, pattern in SFCollector._FIELD_PATTERNS.items():
            match = re.search(pattern, text, re.IGNORECASE)
            if match:
                fields[field] = match.group(1).strip()
        return fields

    def setup(self):
        """Create the output directories and start Chrome.

        Returns:
            bool: True when the browser started; False otherwise (the reason
            and a short checklist are printed).
        """
        # Creating the sub-directories also creates self.data_dir itself.
        for sub in ("json", "pdfs"):
            os.makedirs(os.path.join(self.data_dir, sub), exist_ok=True)

        print(f"📁 Data will be saved to: {os.path.abspath(self.data_dir)}")

        try:
            options = Options()
            if HEADLESS:
                options.add_argument("--headless=new")
            options.add_argument("--window-size=1920,1080")
            options.add_experimental_option("excludeSwitches", ["enable-automation"])
            options.add_experimental_option('useAutomationExtension', False)

            self.driver = webdriver.Chrome(options=options)
            print("✅ Chrome browser started")
            return True
        except Exception as e:
            print(f"❌ Failed to start Chrome: {e}")
            print("\nMake sure you have:")
            print("1. Google Chrome installed")
            print("2. ChromeDriver downloaded (https://chromedriver.chromium.org/)")
            return False

    def test_server(self):
        """Check whether the upload server answers with HTTP 200.

        Only network-level failures are treated as "server unavailable";
        anything else (for example Ctrl+C) propagates to the caller.

        Returns:
            bool: True when ``SERVER_URL`` answered a GET with status 200.
        """
        print("🔍 Testing server connection...")
        try:
            r = requests.get(SERVER_URL, timeout=10)
        except requests.RequestException as e:
            print("⚠️  Cannot connect to server - data will be saved locally only")
            print(f"    Reason: {e}")
            return False

        if r.status_code == 200:
            print("✅ Server is ready")
            return True

        print("⚠️  Cannot connect to server - data will be saved locally only")
        print(f"    Reason: HTTP {r.status_code}")
        return False

    def get_inspections(self):
        """Load the listing page and return the inspections linked from it.

        Returns:
            list[dict]: One ``{'id', 'url', 'name'}`` dict per distinct
            inspection ID, in page order; empty when the page title
            contains "403".
        """
        print(f"🌐 Loading {self.base_url}")
        self.driver.get(self.base_url)
        time.sleep(3)  # fixed pause, presumably to let the listing render

        if "403" in self.driver.title:
            print("❌ Site blocked access - make sure you can access it in your browser")
            return []

        print("✅ Site accessed successfully")

        inspections = []
        seen_ids = set()
        for link in self.driver.find_elements(By.TAG_NAME, "a"):
            href = link.get_attribute("href") or ""
            inspection_id = self._extract_inspection_id(href)
            if inspection_id is None:
                continue
            # The same inspection may be linked more than once; collecting it
            # again would only repeat the download and overwrite its files.
            key = inspection_id.upper()
            if key in seen_ids:
                continue
            seen_ids.add(key)
            inspections.append({
                'id': inspection_id,
                'url': href,
                'name': link.text.strip(),
            })

        print(f"📋 Found {len(inspections)} inspections")
        return inspections

    def _download_pdf(self, inspection_id):
        """Best-effort download of the printable PDF for *inspection_id*.

        The browser's cookies are copied into a short-lived requests session.
        A response is accepted only when it has status 200 and starts with
        the ``%PDF`` signature; it is then written to the ``pdfs`` directory.

        Returns:
            bytes | None: The PDF content, or None when nothing usable was
            downloaded (failures are printed, never raised).
        """
        pdf_url = f"https://inspections.myhealthdepartment.com/san-francisco/print/?task=getPrintable&path=san-francisco&pKey={inspection_id},{inspection_id}"

        try:
            # The context manager closes the session's connections afterwards.
            with requests.Session() as session:
                for cookie in self.driver.get_cookies():
                    session.cookies.set(cookie['name'], cookie['value'])
                r = session.get(pdf_url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=30)
                content = r.content
                status = r.status_code

            if status != 200 or not content.startswith(b'%PDF'):
                return None

            pdf_path = os.path.join(self.data_dir, "pdfs", f"{inspection_id}.pdf")
            with open(pdf_path, 'wb') as f:
                f.write(content)
            print(f"    ✅ PDF downloaded ({len(content):,} bytes)")
            return content
        except Exception as e:
            # Deliberately broad: a missing PDF must not discard the page data.
            print(f"    ⚠️  PDF failed: {e}")
            return None

    def collect_one(self, inspection):
        """Collect page data (and the PDF, if available) for one inspection.

        Args:
            inspection: Dict with ``'id'``, ``'url'`` and ``'name'`` as
                produced by ``get_inspections``.

        Returns:
            dict | None: The collected record, which is also written as JSON
            into the ``json`` directory, or None when collection failed (the
            error is printed).
        """
        try:
            print(f"  📥 Collecting: {inspection['name']}")

            self.driver.get(inspection['url'])
            time.sleep(2)

            data = {
                'inspection_id': inspection['id'],
                'facility_name': inspection['name'],
                'url': inspection['url'],
                'collected_at': datetime.now().isoformat(),
            }

            data['page_text'] = self.driver.find_element(By.TAG_NAME, "body").text
            data.update(self._extract_fields(data['page_text']))

            pdf_bytes = self._download_pdf(inspection['id'])
            if pdf_bytes is not None:
                data['pdf_size'] = len(pdf_bytes)
                data['pdf_data'] = base64.b64encode(pdf_bytes).decode('utf-8')

            json_path = os.path.join(self.data_dir, "json", f"{inspection['id']}.json")
            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(data, f, indent=2)

            return data

        except Exception as e:
            print(f"    ❌ Error: {e}")
            return None

    def upload(self, data):
        """Upload one collected record (and its PDF, if any) to the server.

        The inspection record is sent first, without the base64 PDF payload;
        the PDF follows in a second request only when the first one succeeded.

        Args:
            data: Record returned by ``collect_one``.

        Returns:
            bool: True when the server answered the inspection upload with
            HTTP 200, False otherwise. Failures are printed, never raised.
        """
        try:
            headers = {'X-API-Key': API_KEY, 'Content-Type': 'application/json'}

            upload_data = {
                'action': 'upload_inspection',
                'inspection_id': data['inspection_id'],
                'inspection_data': {k: v for k, v in data.items() if k != 'pdf_data'},
            }

            r = requests.post(SERVER_URL, json=upload_data, headers=headers, timeout=30)
            if r.status_code != 200:
                print(f"    ⚠️  Upload failed: HTTP {r.status_code}")
                return False

            print("    ☁️  Uploaded to server")

            if 'pdf_data' in data:
                pdf_upload = {
                    'action': 'upload_pdf',
                    'inspection_id': data['inspection_id'],
                    'pdf_data': data['pdf_data'],
                }
                pdf_response = requests.post(SERVER_URL, json=pdf_upload, headers=headers, timeout=30)
                if pdf_response.status_code != 200:
                    print(f"    ⚠️  PDF upload failed: HTTP {pdf_response.status_code}")

            return True

        except Exception as e:
            print(f"    ⚠️  Upload failed: {e}")
            return False

    def run(self):
        """Run the whole collection: setup, listing, per-item collect, summary.

        Setup and the server check happen inside the guarded section so that
        the browser is always closed and the final "Press Enter" pause is
        always shown, including when Chrome could not be started.
        """
        print("\n" + "="*60)
        print("🌁 SAN FRANCISCO INSPECTION COLLECTOR")
        print("="*60)

        uploaded = 0

        try:
            if not self.setup():
                return

            can_upload = self.test_server()

            print("\n📋 Step 1: Getting inspection list...")
            inspections = self.get_inspections()

            if not inspections:
                print("❌ No inspections found")
                return

            # A falsy COLLECT_LIMIT (None or 0) means "collect everything".
            if COLLECT_LIMIT:
                inspections = inspections[:COLLECT_LIMIT]

            print(f"\n📥 Step 2: Collecting {len(inspections)} inspections...")
            print("-" * 40)

            for i, inspection in enumerate(inspections, 1):
                print(f"\n[{i}/{len(inspections)}] {inspection['name']}")

                data = self.collect_one(inspection)
                if data:
                    self.collected.append(data)
                    if can_upload and self.upload(data):
                        uploaded += 1

                time.sleep(1)  # short pause between inspections

            print("\n" + "="*60)
            print("✅ COLLECTION COMPLETE!")
            print("="*60)
            print(f"📊 Collected: {len(self.collected)} inspections")
            print(f"📄 PDFs: {sum(1 for d in self.collected if 'pdf_size' in d)}")
            print(f"📁 Saved to: {os.path.abspath(self.data_dir)}")

            if can_upload:
                # Report what actually reached the server, not just that
                # uploading was attempted.
                print(f"☁️  Uploaded to server: {uploaded}/{len(self.collected)}")

        except Exception as e:
            print(f"\n❌ Error: {e}")
            import traceback
            traceback.print_exc()

        finally:
            if self.driver:
                self.driver.quit()

            print("\n" + "-"*40)
            input("Press Enter to exit...")

if __name__ == "__main__":
    # Script entry point: build a collector and run it. Ctrl+C is reported
    # briefly; any other error is printed with its traceback, and the console
    # is held open until the user presses Enter.
    try:
        SFCollector().run()
    except KeyboardInterrupt:
        print("\n\n⚠️  Cancelled by user")
    except Exception as exc:
        import traceback

        print(f"\n❌ Error: {exc}")
        traceback.print_exc()
        input("\nPress Enter to exit...")