#!/usr/bin/env python3
"""
San Francisco Inspection Scraper with Advanced Bot Detection Bypass
Uses undetected-chromedriver and other techniques
"""

import json
import time
import logging
import os
import re
import requests
from datetime import datetime
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from bs4 import BeautifulSoup
import pandas as pd
import pdfplumber
import random

# Root logging setup: INFO and above, each record stamped with time and severity.
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Module-wide logger used by the scraper class and main().
logger = logging.getLogger(__name__)

class SFInspectionScraper:
    """Scrape San Francisco health-inspection listings and their PDF reports.

    A Selenium-driven Chrome session (configured with several anti-automation
    tweaks) loads the listing page; inspection IDs are pulled from its links
    and each report PDF is downloaded and mined for a few summary fields.
    A plain ``requests`` fetch of the listing page is available as a fallback.
    """

    # Inspection IDs appear as a query parameter made of hex digits and dashes.
    # Both upper- and lower-case digits are accepted so that lower-case IDs are
    # not dropped or truncated.
    _INSPECTION_ID_RE = re.compile(r'inspectionID=([A-Fa-f0-9\-]+)')

    # Summary fields searched for in the extracted PDF text (first match wins).
    _PDF_FIELD_PATTERNS = {
        'address': re.compile(r'(?:Address|Location)[:\s]+([^\n]+)', re.IGNORECASE),
        'date': re.compile(
            r'(?:Date|Inspection Date)[:\s]+(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
            re.IGNORECASE,
        ),
        'score': re.compile(r'(?:Score|Points)[:\s]+(\d+)', re.IGNORECASE),
    }
    _VIOLATION_RE = re.compile(r'(?:violation|infraction|non-compliance)', re.IGNORECASE)

    def __init__(self, headless=False):  # Start with headless=False for testing
        """Initialize scraper with anti-detection measures.

        Args:
            headless: When true, Chrome is started with ``--headless=new``.

        Raises:
            Exception: Whatever ``setup_driver`` or ``create_directories``
                raises; no browser is left running in that case.
        """
        self.base_url = "https://inspections.myhealthdepartment.com/san-francisco"
        self.pdf_base_url = "https://inspections.myhealthdepartment.com/san-francisco/print/"
        self.driver = None
        self.headless = headless
        self.pdf_dir = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf_pdfs"
        # Create the directory before launching Chrome: it is handed to the
        # browser as its download directory, and a failure here must not
        # leave an already-started browser behind.
        self.create_directories()
        self.setup_driver()

    def create_directories(self):
        """Create necessary directories"""
        os.makedirs(self.pdf_dir, exist_ok=True)
        logger.info(f"PDF directory ready: {self.pdf_dir}")

    def setup_driver(self):
        """Set up Chrome driver with maximum stealth.

        Builds the Chrome options (stealth switches, random window size and
        user agent, optional headless mode, download preferences), starts the
        driver and registers a script that runs on every new document.

        Raises:
            Exception: Re-raised after logging if the driver cannot be
                created or configured; a partially started browser is quit.
        """
        chrome_options = Options()

        # Stealth options
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        # Add random window size
        window_sizes = [(1920, 1080), (1366, 768), (1440, 900), (1536, 864)]
        width, height = random.choice(window_sizes)
        chrome_options.add_argument(f"--window-size={width},{height}")

        # Essential options
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--disable-web-security")
        chrome_options.add_argument("--disable-features=VizDisplayCompositor")
        chrome_options.add_argument("--disable-dev-tools")

        # Random user agent
        user_agents = [
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36"
        ]
        chrome_options.add_argument(f"user-agent={random.choice(user_agents)}")

        # Only add headless if specified
        if self.headless:
            chrome_options.add_argument("--headless=new")
            # Additional headless options
            chrome_options.add_argument("--disable-gpu-sandbox")

        # Preferences to appear more human-like
        prefs = {
            "credentials_enable_service": False,
            "profile.password_manager_enabled": False,
            "profile.default_content_setting_values.notifications": 2,
            "download.default_directory": self.pdf_dir,
            "plugins.always_open_pdf_externally": True
        }
        chrome_options.add_experimental_option("prefs", prefs)

        try:
            # Create driver
            self.driver = webdriver.Chrome(options=chrome_options)

            # Execute stealth JavaScript
            stealth_js = """
            Object.defineProperty(navigator, 'webdriver', {
                get: () => undefined
            });
            Object.defineProperty(navigator, 'plugins', {
                get: () => [1, 2, 3, 4, 5]
            });
            Object.defineProperty(navigator, 'languages', {
                get: () => ['en-US', 'en']
            });
            window.chrome = {
                runtime: {}
            };
            Object.defineProperty(navigator, 'permissions', {
                get: () => ({
                    query: () => Promise.resolve({ state: 'granted' })
                })
            });
            """
            self.driver.execute_cdp_cmd('Page.addScriptToEvaluateOnNewDocument', {
                'source': stealth_js
            })

            logger.info("Chrome driver initialized with stealth mode")

        except Exception as e:
            logger.error(f"Failed to initialize Chrome driver: {e}")
            # If Chrome started but the CDP call failed, quit it so the
            # browser process is not orphaned when the error propagates.
            self.close()
            raise

    def human_like_delay(self, min_seconds=0.5, max_seconds=2.0):
        """Sleep for a random duration between ``min_seconds`` and ``max_seconds``."""
        time.sleep(random.uniform(min_seconds, max_seconds))

    @classmethod
    def _parse_inspection_id(cls, href):
        """Return the inspection ID embedded in ``href``, or None if absent."""
        match = cls._INSPECTION_ID_RE.search(href)
        return match.group(1) if match else None

    @classmethod
    def _extract_inspection_links(cls, html):
        """Collect every anchor in ``html`` whose href carries an inspection ID.

        Returns:
            A list of dicts with keys ``inspection_id``, ``text`` (the
            anchor's stripped text) and ``href``, in document order.
        """
        soup = BeautifulSoup(html, 'html.parser')
        links = []
        for anchor in soup.find_all('a', href=True):
            href = anchor['href']
            inspection_id = cls._parse_inspection_id(href)
            if inspection_id is not None:
                links.append({
                    'inspection_id': inspection_id,
                    'text': anchor.get_text(strip=True),
                    'href': href
                })
        return links

    def scrape_with_requests_fallback(self):
        """Try direct requests as fallback.

        Fetches the listing page without a browser, saves the HTML to
        ``/tmp/sf_requests_page.html`` and extracts the inspection links.

        Returns:
            The list of link dicts found, or an empty list on a non-200
            status or any error (errors are logged, never raised).
        """
        logger.info("Attempting direct requests approach...")

        # Set headers to mimic browser. Accept-Encoding is deliberately left
        # to requests: advertising "br" by hand would invite a brotli body,
        # which is only decoded when an optional brotli package is installed.
        headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
            'Accept-Language': 'en-US,en;q=0.5',
            'DNT': '1',
            'Connection': 'keep-alive',
            'Upgrade-Insecure-Requests': '1',
            'Sec-Fetch-Dest': 'document',
            'Sec-Fetch-Mode': 'navigate',
            'Sec-Fetch-Site': 'none',
            'Cache-Control': 'max-age=0'
        }

        try:
            # The context manager closes the session's pooled connections.
            with requests.Session() as session:
                response = session.get(self.base_url, headers=headers, timeout=30)
            logger.info(f"Direct request status: {response.status_code}")

            if response.status_code != 200:
                logger.warning(f"Request failed with status {response.status_code}")
                return []

            # Explicit encoding: the page may contain characters the
            # platform's default encoding cannot represent.
            with open("/tmp/sf_requests_page.html", "w", encoding="utf-8") as f:
                f.write(response.text)
            logger.info("Page saved via requests to /tmp/sf_requests_page.html")

            inspection_links = self._extract_inspection_links(response.text)
            logger.info(f"Found {len(inspection_links)} inspection links via requests")
            return inspection_links

        except Exception as e:
            # Best-effort fallback: report and return nothing rather than raise.
            logger.error(f"Requests approach failed: {e}")
            return []

    def _is_blocked(self):
        """Return True when the current page title suggests a 403 response."""
        title = self.driver.title
        return "403" in title or "Forbidden" in title

    def _fetch_inspection(self, link_info):
        """Download and parse the PDF report for one inspection link.

        Args:
            link_info: A dict as produced by ``_extract_inspection_links``.

        Returns:
            A dict with ``inspection_id``, ``facility_name``, ``pdf_url`` and
            ``pdf_downloaded``; on success also ``pdf_path`` plus whatever
            ``extract_pdf_data`` returns. Download errors are logged and
            reported as ``pdf_downloaded = False``.
        """
        inspection_id = link_info['inspection_id']
        pdf_url = f"{self.pdf_base_url}?task=getPrintable&path=san-francisco&pKey={inspection_id},{inspection_id}"

        inspection_data = {
            'inspection_id': inspection_id,
            'facility_name': link_info['text'],
            'pdf_url': pdf_url
        }

        try:
            headers = {
                'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                'Referer': self.base_url
            }

            pdf_response = requests.get(pdf_url, headers=headers, timeout=30)

            # The magic-number check rejects HTML error pages served with 200.
            if pdf_response.status_code == 200 and pdf_response.content.startswith(b'%PDF'):
                # inspection_id is limited to hex digits and dashes by the
                # extraction regex, so it is safe to use in a file name.
                pdf_filename = f"inspection_{inspection_id}.pdf"
                pdf_path = os.path.join(self.pdf_dir, pdf_filename)

                with open(pdf_path, 'wb') as f:
                    f.write(pdf_response.content)

                logger.info(f"PDF saved: {pdf_path}")
                inspection_data['pdf_downloaded'] = True
                inspection_data['pdf_path'] = pdf_path

                # Extract PDF data
                inspection_data.update(self.extract_pdf_data(pdf_path))
            else:
                logger.warning(f"PDF download failed for {inspection_id}")
                inspection_data['pdf_downloaded'] = False

        except Exception as e:
            logger.error(f"Error downloading PDF: {e}")
            inspection_data['pdf_downloaded'] = False

        return inspection_data

    def scrape_with_selenium(self, max_inspections=5):
        """Main scraping with Selenium.

        Loads the listing page in the browser, saves a screenshot and the
        page source under ``/tmp``, then downloads and parses the PDF for the
        first ``max_inspections`` inspection links found.

        Args:
            max_inspections: Maximum number of inspections to process;
                ``None`` processes every link found.

        Returns:
            A list of inspection dicts (see ``_fetch_inspection``). Errors
            are logged and whatever was collected so far is returned.
        """
        inspections = []

        try:
            logger.info(f"Navigating to {self.base_url}")
            self.driver.get(self.base_url)

            # Human-like delay
            self.human_like_delay(3, 5)

            # Check if we're blocked
            if self._is_blocked():
                logger.warning("Detected 403 Forbidden - trying workarounds...")

                # Try refreshing with different approach
                self.driver.execute_script("window.location.href = arguments[0]", self.base_url)
                self.human_like_delay(3, 5)

                if self._is_blocked():
                    logger.warning("Page still appears blocked after retry")

            # Save screenshot and source
            self.driver.save_screenshot("/tmp/sf_selenium_screenshot.png")
            logger.info("Screenshot saved to /tmp/sf_selenium_screenshot.png")

            page_source = self.driver.page_source
            # Explicit encoding: the page may contain characters the
            # platform's default encoding cannot represent.
            with open("/tmp/sf_selenium_page.html", "w", encoding="utf-8") as f:
                f.write(page_source)
            logger.info("Page source saved to /tmp/sf_selenium_page.html")

            inspection_links = self._extract_inspection_links(page_source)
            logger.info(f"Found {len(inspection_links)} inspection links")

            for position, link_info in enumerate(inspection_links[:max_inspections], 1):
                logger.info(f"Processing inspection {position}: {link_info['inspection_id']}")
                inspections.append(self._fetch_inspection(link_info))
                self.human_like_delay()

        except Exception as e:
            logger.error(f"Selenium scraping error: {e}")

        return inspections

    @classmethod
    def _parse_pdf_text(cls, text):
        """Derive the summary fields from already-extracted PDF text.

        Returns:
            A dict with ``pdf_text_length``, ``violation_mentions`` and
            ``text_preview`` (first 500 characters), plus ``address``,
            ``date`` and ``score`` for each pattern that matched.
        """
        data = {'pdf_text_length': len(text)}

        for field, pattern in cls._PDF_FIELD_PATTERNS.items():
            match = pattern.search(text)
            if match:
                data[field] = match.group(1).strip()

        # Counts keyword occurrences, not distinct violations.
        data['violation_mentions'] = len(cls._VIOLATION_RE.findall(text))
        data['text_preview'] = text[:500]
        return data

    def extract_pdf_data(self, pdf_path):
        """Extract data from PDF.

        Args:
            pdf_path: Path of the PDF file to read.

        Returns:
            The dict built by ``_parse_pdf_text``, or a dict holding only
            ``extraction_error`` if the PDF could not be read.
        """
        try:
            with pdfplumber.open(pdf_path) as pdf:
                # Join pages with a newline so the last line of one page is
                # not fused with the first line of the next, which would
                # corrupt the line-based field patterns.
                text = "\n".join(page.extract_text() or "" for page in pdf.pages)
        except Exception as e:
            logger.error(f"PDF extraction error: {e}")
            return {'extraction_error': str(e)}

        return self._parse_pdf_text(text)

    def run(self, max_inspections=5):
        """Main run method.

        Tries the Selenium scraper first; if it yields nothing, falls back to
        the requests scraper, which reports link metadata only (no PDFs).

        Args:
            max_inspections: Maximum number of inspections to return;
                ``None`` means no limit.

        Returns:
            A list of result dicts, possibly empty.
        """
        # Try Selenium first
        logger.info("Attempting Selenium scraping...")
        selenium_results = self.scrape_with_selenium(max_inspections)
        if selenium_results:
            return list(selenium_results)

        logger.info("Selenium failed, trying requests fallback...")
        request_links = self.scrape_with_requests_fallback()
        return [
            {
                'inspection_id': link['inspection_id'],
                'facility_name': link['text'],
                'method': 'requests'
            }
            for link in request_links[:max_inspections]
        ]

    def close(self):
        """Close browser.

        Safe to call more than once and when no browser was ever started.
        Errors raised while quitting are logged rather than propagated so
        cleanup never masks an earlier failure.
        """
        if not self.driver:
            return
        try:
            self.driver.quit()
            logger.info("Browser closed")
        except Exception as e:
            logger.warning(f"Error while closing browser: {e}")
        finally:
            self.driver = None

def main():
    """Run the scraper headlessly, save the results and print a summary.

    Results are written as JSON and, when non-empty, as CSV under the
    cleankitchens data directory. Any error raised after the scraper has
    been constructed is logged with its traceback, and the browser is
    always closed afterwards.
    """
    logger.info("="*60)
    logger.info("SF Inspection Scraper with Anti-Detection")
    logger.info("="*60)

    # Runs headless; pass headless=False to watch the browser while debugging.
    scraper = SFInspectionScraper(headless=True)

    try:
        results = scraper.run()

        # Save results
        output_file = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf_undetected_results.json"
        with open(output_file, "w", encoding="utf-8") as f:
            # default=str stringifies any value json cannot serialize natively.
            json.dump(results, f, indent=2, default=str)

        logger.info(f"Results saved to {output_file}")

        # Display summary
        print("\n" + "="*60)
        print("RESULTS SUMMARY")
        print("="*60)
        print(f"Total inspections found: {len(results)}")

        for i, result in enumerate(results, 1):
            print(f"\n[{i}] {result.get('inspection_id', 'N/A')}")
            print(f"    Facility: {result.get('facility_name', 'Unknown')}")
            if result.get('pdf_downloaded'):
                print("    PDF: ✓ Downloaded")
                print(f"    Date: {result.get('date', 'N/A')}")
                print(f"    Score: {result.get('score', 'N/A')}")
            else:
                print("    PDF: ✗ Not downloaded")

        # Create CSV
        if results:
            df = pd.DataFrame(results)
            csv_file = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf_undetected_summary.csv"
            df.to_csv(csv_file, index=False)
            logger.info(f"CSV saved to {csv_file}")

    except Exception as e:
        # logger.exception emits the traceback through the configured logging
        # setup instead of printing it separately.
        logger.exception(f"Main error: {e}")

    finally:
        scraper.close()


if __name__ == "__main__":
    main()