#!/usr/bin/env python3
"""
San Francisco Restaurant Inspection Data Scraper
Uses Selenium with headless Chrome to scrape inspection data from
https://inspections.myhealthdepartment.com/san-francisco
"""

import json
import logging
import re
import time
from datetime import datetime
from pathlib import Path

import pandas as pd
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.common.exceptions import TimeoutException, NoSuchElementException
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from webdriver_manager.chrome import ChromeDriverManager

# Emit INFO-and-above records as "<timestamp> - <level> - <message>".
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s', level=logging.INFO)

# Module-level logger used by the scraper class and main().
logger = logging.getLogger(__name__)

class SFInspectionScraper:
    """Heuristic Selenium scraper for the San Francisco inspection portal.

    The scraper does not rely on a fixed page schema. It probes the page with
    generic strategies (keyword-bearing text nodes first, clickable elements
    second) and returns loosely structured records whose ``possible_*`` fields
    are best-effort guesses, not validated data.
    """

    # Lower-case fragments that suggest a text node names a food facility.
    _FACILITY_KEYWORDS = ('restaurant', 'cafe', 'kitchen', 'grill', 'deli', 'bakery')
    # Link captions that may lead to a listing of inspections.
    _LISTING_LINK_KEYWORDS = ('recent', 'view all', 'inspections', 'search')
    # Captions of site-navigation controls that are never facility names.
    _NAVIGATION_WORDS = ('search', 'home', 'about', 'contact', 'login')
    # Tags whose text content is code/styling rather than visible page text.
    _NON_CONTENT_TAGS = ('script', 'style', 'noscript')
    _CLICKABLE_SELECTOR = "a, button, div[onclick], tr[onclick]"
    _MAX_TEXT_NODES = 100
    _MAX_CLICKABLES = 20

    # Compiled once at class creation instead of on every loop iteration.
    _DATE_RE = re.compile(r'\d{1,2}[/-]\d{1,2}[/-]\d{2,4}')
    # A number only counts as a score when it is explicitly labelled
    # ("Score: 95", "95 points", "95%"); a bare number could be anything.
    _SCORE_RE = re.compile(
        r'\bscore\s*[:\-]?\s*\d{1,3}\b|\b\d{1,3}\s*(?:score|points|%)',
        re.IGNORECASE,
    )
    # Whole-word match so that e.g. "st" does not fire inside "restaurant".
    _ADDRESS_RE = re.compile(
        r'\b(?:street|st|ave|avenue|blvd|boulevard|rd|road)\b',
        re.IGNORECASE,
    )

    def __init__(self, headless=True):
        """Create the scraper and start the browser.

        Args:
            headless: When true, Chrome is started with ``--headless=new``.

        Raises:
            Exception: Whatever ``setup_driver`` raises when no Chrome driver
                can be started.
        """
        self.base_url = "https://inspections.myhealthdepartment.com/san-francisco"
        self.driver = None
        self.headless = headless
        self.setup_driver()

    def setup_driver(self):
        """Start Chrome and store the driver on ``self.driver``.

        The driver available on the system is tried first; if that fails, a
        driver is downloaded through webdriver-manager. A failure of the
        fallback propagates to the caller.
        """
        chrome_options = Options()

        if self.headless:
            chrome_options.add_argument("--headless=new")

        # Essential options for headless scraping
        chrome_options.add_argument("--no-sandbox")
        chrome_options.add_argument("--disable-dev-shm-usage")
        chrome_options.add_argument("--disable-gpu")
        chrome_options.add_argument("--window-size=1920,1080")
        chrome_options.add_argument("--start-maximized")
        chrome_options.add_argument("--disable-blink-features=AutomationControlled")
        chrome_options.add_experimental_option("excludeSwitches", ["enable-automation"])
        chrome_options.add_experimental_option('useAutomationExtension', False)

        # User agent to appear more like a real browser
        chrome_options.add_argument("user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36")

        try:
            # Try to use system Chrome/Chromium first
            self.driver = webdriver.Chrome(options=chrome_options)
            logger.info("Using system Chrome/Chromium driver")
        except Exception as e:
            # ``Exception`` (not a bare except) so Ctrl-C / SystemExit still
            # abort instead of triggering a driver download.
            logger.info(f"System Chrome driver unavailable ({e})")
            logger.info("Downloading Chrome driver...")
            service = Service(ChromeDriverManager().install())
            self.driver = webdriver.Chrome(service=service, options=chrome_options)
            logger.info("Using downloaded Chrome driver")

    def wait_for_element(self, by, value, timeout=10):
        """Wait until an element is present in the DOM.

        Args:
            by: Selenium locator strategy (a ``By`` constant).
            value: Locator value for that strategy.
            timeout: Maximum number of seconds to wait.

        Returns:
            The located element, or ``None`` if the wait timed out.
        """
        try:
            element = WebDriverWait(self.driver, timeout).until(
                EC.presence_of_element_located((by, value))
            )
            return element
        except TimeoutException:
            logger.warning(f"Element not found: {value}")
            return None

    def scrape_recent_inspections(self, limit=5):
        """Scrape up to ``limit`` candidate inspection records.

        Args:
            limit: Maximum number of records to return. Zero or a negative
                value yields an empty list.

        Returns:
            A list of dicts. Records found via page text carry
            ``facility_name``, ``scraped_at``, ``raw_html`` and optional
            ``possible_date`` / ``possible_score`` / ``possible_address``;
            records found via clickable elements carry ``facility_name``,
            ``element_tag``, ``element_href``, ``scraped_at`` and optional
            ``has_inspection_details`` / ``detail_preview``. Errors are logged
            and whatever was collected so far is returned.
        """
        inspections = []

        try:
            logger.info(f"Navigating to {self.base_url}")
            self.driver.get(self.base_url)

            # Fixed pause to let the page finish loading
            time.sleep(3)

            logger.info("Looking for recent inspections or search functionality...")

            # Screenshot for debugging; save_screenshot reports failure via
            # its return value rather than raising.
            if self.driver.save_screenshot("/tmp/sf_inspection_page.png"):
                logger.info("Screenshot saved to /tmp/sf_inspection_page.png")
            else:
                logger.warning("Could not save screenshot to /tmp/sf_inspection_page.png")

            self._log_page_structure(BeautifulSoup(self.driver.page_source, 'html.parser'))
            self._follow_listing_link()

            # Re-parse: following a link may have changed the page.
            soup = BeautifulSoup(self.driver.page_source, 'html.parser')
            inspections.extend(self._records_from_text(soup, limit))

            # If we didn't find enough, try a different approach
            if len(inspections) < limit:
                logger.info("Trying alternative extraction method...")
                self._records_from_clickables(inspections, limit)

            logger.info(f"Scraped {len(inspections)} inspection records")

        except Exception as e:
            logger.error(f"Error during scraping: {e}")
            self._save_debug_page_source()

        return inspections

    def close(self):
        """Quit the browser; safe to call more than once."""
        # Drop the reference first so a repeated call never re-quits.
        driver, self.driver = self.driver, None
        if driver:
            driver.quit()
            logger.info("Browser closed")

    @classmethod
    def _looks_like_facility(cls, text):
        """Return True if ``text`` contains a food-facility keyword."""
        lowered = text.lower()
        return any(keyword in lowered for keyword in cls._FACILITY_KEYWORDS)

    @classmethod
    def _is_navigation_text(cls, text):
        """Return True if ``text`` contains a site-navigation word."""
        lowered = text.lower()
        return any(word in lowered for word in cls._NAVIGATION_WORDS)

    @classmethod
    def _extract_details(cls, parent_html, parent_text):
        """Guess date, score and address fields for one candidate record.

        Args:
            parent_html: Markup of the element containing the facility text.
            parent_text: Visible text of that same element.

        Returns:
            A dict holding any of ``possible_date`` (first date-like token in
            the markup), ``possible_score`` (first explicitly labelled score
            in the text) and ``possible_address`` (first 200 characters of the
            text when it contains a street-type word).
        """
        details = {}

        date_match = cls._DATE_RE.search(parent_html)
        if date_match:
            details['possible_date'] = date_match.group(0)

        # Searched in visible text, not markup, so attribute values such as
        # class="col-12" are never mistaken for a score.
        score_match = cls._SCORE_RE.search(parent_text)
        if score_match:
            details['possible_score'] = score_match.group(0)

        if cls._ADDRESS_RE.search(parent_text):
            details['possible_address'] = parent_text[:200]

        return details

    def _log_page_structure(self, soup):
        """Log which generic containers the parsed page holds (diagnostics only)."""
        recent_sections = soup.find_all(
            ['div', 'section'],
            class_=lambda c: bool(c) and 'recent' in c.lower(),
        )
        if recent_sections:
            logger.info("Found recent inspections section")

        for table in soup.find_all('table'):
            logger.info(f"Found table with {len(table.find_all('tr'))} rows")

        cards = soup.find_all(
            ['div', 'article', 'li'],
            class_=lambda c: bool(c) and any(
                word in str(c).lower() for word in ('inspection', 'restaurant', 'facility')
            ),
        )
        if cards:
            logger.info(f"Found {len(cards)} candidate inspection elements")

    def _follow_listing_link(self):
        """Click the first link whose caption suggests an inspection listing."""
        try:
            for link in self.driver.find_elements(By.TAG_NAME, "a"):
                link_text = link.text.lower()
                if any(word in link_text for word in self._LISTING_LINK_KEYWORDS):
                    logger.info(f"Found link: {link.text}")
                    link.click()
                    time.sleep(3)
                    break
        except Exception as e:
            logger.warning(f"Error clicking links: {e}")

    def _records_from_text(self, soup, limit):
        """Build records from visible text nodes that look like facility names."""
        records = []
        text_nodes = soup.find_all(string=lambda s: s and len(s) > 10)

        for node in text_nodes[:self._MAX_TEXT_NODES]:
            if len(records) >= limit:
                break

            parent = node.parent
            # Script/style contents are not page text and would otherwise be
            # reported as facility names whenever they mention a keyword.
            if parent is None or parent.name in self._NON_CONTENT_TAGS:
                continue

            text = str(node).strip()
            if not self._looks_like_facility(text):
                continue

            record = {
                'facility_name': text,
                'scraped_at': datetime.now().isoformat(),
                'raw_html': str(parent)[:500],  # First 500 chars of HTML
            }
            record.update(self._extract_details(str(parent), parent.get_text()))
            records.append(record)

        return records

    def _records_from_clickables(self, inspections, limit):
        """Append records built from clickable elements until ``limit`` is reached."""
        for index in range(self._MAX_CLICKABLES):
            if len(inspections) >= limit:
                break

            try:
                # Re-locate on every pass: a click that navigates invalidates
                # element references obtained before it.
                candidates = self.driver.find_elements(By.CSS_SELECTOR, self._CLICKABLE_SELECTOR)
                if index >= len(candidates):
                    break
                element = candidates[index]

                label = element.text.strip()
                if len(label) <= 5 or self._is_navigation_text(label):
                    continue

                record = {
                    'facility_name': label[:100],
                    'element_tag': element.tag_name,
                    'element_href': element.get_attribute('href'),
                    'scraped_at': datetime.now().isoformat(),
                }
            except Exception as e:
                logger.debug(f"Skipping clickable element {index}: {e}")
                continue

            self._collect_click_details(element, record)
            inspections.append(record)

    def _collect_click_details(self, element, record):
        """Click ``element`` and add detail fields to ``record`` (best effort)."""
        try:
            url_before = self.driver.current_url
            element.click()
            time.sleep(2)

            detail_text = BeautifulSoup(self.driver.page_source, 'html.parser').get_text()
            if 'inspection' in detail_text.lower():
                record['has_inspection_details'] = True
                record['detail_preview'] = detail_text[:500]

            # Only go back if the click actually navigated; otherwise back()
            # would leave the page being scraped.
            if self.driver.current_url != url_before:
                self.driver.back()
                time.sleep(2)
        except Exception as e:
            logger.warning(f"Could not click element: {e}")

    def _save_debug_page_source(self):
        """Dump the current page source to /tmp without ever raising."""
        if not self.driver:
            return
        try:
            with open("/tmp/sf_page_source.html", "w", encoding="utf-8") as f:
                f.write(self.driver.page_source)
            logger.info("Page source saved to /tmp/sf_page_source.html")
        except Exception as e:
            # Runs inside an error handler: a second failure here must not
            # discard the records already collected.
            logger.warning(f"Could not save page source: {e}")

def main(output_dir="/var/www/twin-digital-media/public_html/_sites/cleankitchens/data"):
    """Run the scraper once and write a JSON (and, if non-empty, CSV) sample.

    Args:
        output_dir: Directory that receives ``sf_inspection_sample.json`` and
            ``sf_inspection_sample.csv``. It is created if it does not exist.

    Errors raised while scraping or writing are logged with a traceback and
    not re-raised; the browser is always closed.
    """
    logger.info("Starting San Francisco inspection scraper...")

    output_path = Path(output_dir)
    scraper = SFInspectionScraper(headless=True)

    try:
        # Scrape the 5 most recent inspections
        inspections = scraper.scrape_recent_inspections(limit=5)

        # Make sure a missing directory does not throw away a finished scrape.
        output_path.mkdir(parents=True, exist_ok=True)

        # Save results to JSON
        output_file = output_path / "sf_inspection_sample.json"
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(inspections, f, indent=2)
        logger.info(f"Results saved to {output_file}")

        # Display summary
        print("\n" + "="*60)
        print("SCRAPING SUMMARY")
        print("="*60)
        print(f"Total records scraped: {len(inspections)}")
        print("\nSample of scraped data:")

        for i, inspection in enumerate(inspections, 1):
            print(f"\n--- Inspection {i} ---")
            for key, value in inspection.items():
                if key not in ('raw_html', 'detail_preview'):  # Skip long fields
                    print(f"  {key}: {value}")

        # Create a simple CSV if we have data
        if inspections:
            csv_file = output_path / "sf_inspection_sample.csv"
            pd.DataFrame(inspections).to_csv(csv_file, index=False)
            logger.info(f"CSV saved to {csv_file}")

    except Exception as e:
        # logger.exception keeps the traceback, which logger.error dropped.
        logger.exception(f"Error in main: {e}")

    finally:
        scraper.close()

# Run the scraper only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()