#!/usr/bin/env python3
"""
San Francisco Inspection Data Collector - LOCAL VERSION
Run this script on your local machine where you can access the site.
It collects the data, always saves it locally, and can optionally upload it to your server.
"""

import csv
import json
import logging
import os
import re
import time
from datetime import datetime

import requests
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# Logging setup: records at INFO and above, each line prefixed with its
# timestamp and severity.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

# Module-level logger used by the collector class below.
logger = logging.getLogger(__name__)

class LocalSFCollector:
    """Collect San Francisco inspection records with a Selenium-driven Chrome.

    The collector opens the listing page at ``base_url``, gathers every link
    whose URL carries an ``inspectionID`` parameter, visits each inspection
    page and stores what it finds under ``data_dir``:

    * ``json/<id>.json``        - one record per inspection
    * ``pdfs/<id>.pdf``         - the linked report, when one is found
    * ``screenshots/<id>.png``  - a screenshot of the inspection page
    * ``collection_summary.json`` and ``inspections.csv`` - run summaries
    """

    # Placeholder endpoint; pass ``server_url=...`` (or edit this) before uploading.
    DEFAULT_SERVER_URL = "http://your-server.com/upload"
    DEFAULT_DATA_DIR = "sf_inspection_data"

    # Identifier inside an inspection link. Matching is case-insensitive so
    # that lower-case hex identifiers are not silently skipped.
    _INSPECTION_ID_PATTERN = re.compile(r'inspectionID=([A-F0-9\-]+)', re.IGNORECASE)

    # Best-effort patterns applied to the visible page text; a field is only
    # recorded when its pattern matches.
    _FIELD_PATTERNS = {
        'address': re.compile(r'(?:Address|Location)[:\s]+([^\n]+)', re.IGNORECASE),
        'date': re.compile(
            r'(?:Date|Inspection Date)[:\s]+(\d{1,2}[/-]\d{1,2}[/-]\d{2,4})',
            re.IGNORECASE,
        ),
        'score': re.compile(r'(?:Score|Points)[:\s]+(\d+)', re.IGNORECASE),
        'type': re.compile(r'(?:Type|Inspection Type)[:\s]+([^\n]+)', re.IGNORECASE),
    }

    def __init__(self, headless=False, server_upload=False, *, server_url=None, data_dir=None):
        """
        Initialize the local collector and create its output directories.

        Args:
            headless: Run browser in headless mode (might get blocked)
            server_upload: Upload collected data to ``server_url`` after a run
            server_url: Upload endpoint; defaults to ``DEFAULT_SERVER_URL``
            data_dir: Output directory; defaults to ``DEFAULT_DATA_DIR``
        """
        self.base_url = "https://inspections.myhealthdepartment.com/san-francisco"
        self.driver = None
        self.headless = headless
        self.server_upload = server_upload
        self.server_url = server_url or self.DEFAULT_SERVER_URL
        self.data_dir = data_dir or self.DEFAULT_DATA_DIR
        self.create_directories()

    def create_directories(self):
        """Create the data directory and every sub-directory a run writes to."""
        os.makedirs(self.data_dir, exist_ok=True)
        for subdir in ("pdfs", "json", "screenshots"):
            os.makedirs(f"{self.data_dir}/{subdir}", exist_ok=True)
        logger.info(f"Data directory: {self.data_dir}")

    def setup_browser(self):
        """Start Chrome (headless when requested) and store it in ``self.driver``."""
        chrome_options = Options()

        if self.headless:
            chrome_options.add_argument("--headless=new")
            logger.warning("Running in headless mode - might get blocked!")

        chrome_options.add_argument("--window-size=1920,1080")

        self.driver = webdriver.Chrome(options=chrome_options)
        logger.info("Browser started")

    @classmethod
    def _extract_inspection_id(cls, href):
        """Return the inspection ID embedded in *href*, or None when there is none."""
        match = cls._INSPECTION_ID_PATTERN.search(href or "")
        return match.group(1) if match else None

    @classmethod
    def _parse_fields(cls, body_text):
        """Return the fields from ``_FIELD_PATTERNS`` that can be found in *body_text*."""
        fields = {}
        for field, pattern in cls._FIELD_PATTERNS.items():
            match = pattern.search(body_text)
            if match:
                fields[field] = match.group(1).strip()
        return fields

    @staticmethod
    def _select_inspections(inspection_info, limit):
        """Return the inspections a run should process.

        ``None`` means no limit. Any other value keeps at most that many
        entries from the front; zero or a negative number selects nothing.
        """
        if limit is None:
            return inspection_info
        return inspection_info[:max(limit, 0)]

    def collect_inspection_list(self):
        """Collect the inspection links shown on the main page.

        Returns:
            A list of dicts with ``inspection_id``, ``url``, ``text`` and
            ``collected_at``, one per distinct inspection ID in page order.
            Empty when the site is blocked or nothing is found.
        """
        logger.info(f"Navigating to {self.base_url}")
        self.driver.get(self.base_url)

        # Give the page a moment to load before inspecting it.
        time.sleep(3)

        if "403" in self.driver.title:
            logger.error("❌ BLOCKED: Site returned 403 Forbidden")
            logger.error("Please run this script on a machine that can access the site")
            return []

        logger.info("✅ Successfully accessed the site")

        # Keyed by inspection ID so the same inspection linked more than once
        # is only processed once; dicts keep first-seen order.
        found = {}

        try:
            WebDriverWait(self.driver, 10).until(
                EC.presence_of_element_located((By.TAG_NAME, "a"))
            )

            for link in self.driver.find_elements(By.TAG_NAME, "a"):
                href = link.get_attribute("href") or ""
                inspection_id = self._extract_inspection_id(href)
                if inspection_id is None:
                    continue

                text = link.text.strip()
                known = found.get(inspection_id)
                if known is None:
                    found[inspection_id] = {
                        'inspection_id': inspection_id,
                        'url': href,
                        'text': text,
                        'collected_at': datetime.now().isoformat()
                    }
                elif not known['text'] and text:
                    # A later link for the same inspection carries the label
                    # that the first one lacked.
                    known['text'] = text

            logger.info(f"Found {len(found)} inspection links")

        except Exception as e:
            # Best effort: keep whatever was gathered before the failure.
            logger.error(f"Error collecting inspection list: {e}")

        return list(found.values())

    def collect_inspection_details(self, inspection_info, limit=None):
        """Visit each inspection page and store its data.

        Args:
            inspection_info: Records as returned by ``collect_inspection_list``
            limit: Maximum number of inspections to process; ``None`` for all

        Returns:
            The list of collected inspection records. An inspection whose
            processing raises is logged and left out.
        """
        all_data = []

        inspections_to_process = self._select_inspections(inspection_info, limit)
        total = len(inspections_to_process)

        for i, info in enumerate(inspections_to_process, 1):
            inspection_id = info['inspection_id']
            logger.info(f"\nProcessing inspection {i}/{total}")
            logger.info(f"ID: {inspection_id}")

            try:
                self.driver.get(info['url'])
                time.sleep(2)

                inspection_data = {
                    'inspection_id': inspection_id,
                    'facility_name': info['text'],
                    'url': info['url'],
                    'collected_at': datetime.now().isoformat()
                }

                # Page text is optional: a failure here must not lose the record.
                try:
                    body_text = self.driver.find_element(By.TAG_NAME, "body").text
                    inspection_data['page_text'] = body_text
                    inspection_data.update(self._parse_fields(body_text))
                except Exception as e:
                    logger.warning(f"Error extracting text: {e}")

                # Look for a PDF link, falling back to any "print" link.
                pdf_links = self.driver.find_elements(By.PARTIAL_LINK_TEXT, "PDF")
                if not pdf_links:
                    pdf_links = self.driver.find_elements(By.CSS_SELECTOR, "a[href*='print']")

                if pdf_links:
                    pdf_url = pdf_links[0].get_attribute("href")
                    inspection_data['pdf_url'] = pdf_url

                    pdf_path = self.download_pdf(inspection_id, pdf_url)
                    if pdf_path:
                        inspection_data['pdf_downloaded'] = True
                        inspection_data['pdf_path'] = pdf_path

                # save_screenshot reports failure through its return value, so
                # only record the path when the file was actually written.
                screenshot_path = f"{self.data_dir}/screenshots/{inspection_id}.png"
                if self.driver.save_screenshot(screenshot_path):
                    inspection_data['screenshot'] = screenshot_path
                else:
                    logger.warning(f"Could not save screenshot: {screenshot_path}")

                all_data.append(inspection_data)

                json_path = f"{self.data_dir}/json/{inspection_id}.json"
                with open(json_path, 'w', encoding='utf-8') as f:
                    json.dump(inspection_data, f, indent=2)

                logger.info(f"✅ Collected data for {inspection_id}")

                # Small delay between requests
                time.sleep(1)

            except Exception as e:
                logger.error(f"Error processing inspection {inspection_id}: {e}")
                continue

        return all_data

    def download_pdf(self, inspection_id, pdf_url):
        """Download the report at *pdf_url* into ``pdfs/<inspection_id>.pdf``.

        The browser's cookies are copied into the HTTP session used for the
        download.

        Returns:
            The path of the saved file, or None when there is no URL, the
            server does not answer 200, or any error occurs.
        """
        if not pdf_url:
            logger.warning(f"No PDF URL for inspection {inspection_id}")
            return None

        try:
            # The context manager closes the session's connections afterwards.
            with requests.Session() as session:
                for cookie in self.driver.get_cookies():
                    session.cookies.set(cookie['name'], cookie['value'])

                headers = {
                    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36',
                    'Referer': self.base_url
                }

                response = session.get(pdf_url, headers=headers, timeout=30)

                if response.status_code != 200:
                    logger.warning(
                        f"PDF download failed for {inspection_id}: HTTP {response.status_code}"
                    )
                    return None

                # The fallback "print" link may not be a PDF; still save it,
                # but say so.
                if not response.content.startswith(b"%PDF"):
                    logger.warning(
                        f"Downloaded content for {inspection_id} does not look like a PDF"
                    )

                pdf_path = f"{self.data_dir}/pdfs/{inspection_id}.pdf"
                with open(pdf_path, 'wb') as f:
                    f.write(response.content)
                logger.info(f"PDF downloaded: {pdf_path}")
                return pdf_path

        except Exception as e:
            # Best effort: a failed download must not abort the inspection.
            logger.error(f"Error downloading PDF: {e}")

        return None

    def save_summary(self, all_data):
        """Write ``collection_summary.json`` and, when there is data, ``inspections.csv``.

        Both files are written as UTF-8 so that non-ASCII facility names do
        not depend on the platform's default encoding.
        """
        summary = {
            'collection_date': datetime.now().isoformat(),
            'total_inspections': len(all_data),
            'pdfs_downloaded': sum(1 for d in all_data if d.get('pdf_downloaded')),
            'inspections': all_data
        }

        summary_path = f"{self.data_dir}/collection_summary.json"
        with open(summary_path, 'w', encoding='utf-8') as f:
            json.dump(summary, f, indent=2)

        logger.info(f"Summary saved to {summary_path}")

        if not all_data:
            return

        # CSV for easy viewing: a fixed set of columns; other keys are
        # ignored and missing ones are left blank.
        csv_path = f"{self.data_dir}/inspections.csv"
        keys = ['inspection_id', 'facility_name', 'address', 'date', 'score', 'type', 'pdf_downloaded']
        with open(csv_path, 'w', newline='', encoding='utf-8') as f:
            writer = csv.DictWriter(f, fieldnames=keys, extrasaction='ignore')
            writer.writeheader()
            writer.writerows(all_data)

        logger.info(f"CSV saved to {csv_path}")

    def upload_to_server(self, data):
        """POST *data* as JSON to ``server_url``; does nothing unless uploads are enabled."""
        if not self.server_upload:
            return

        try:
            logger.info(f"Uploading to server: {self.server_url}")
            response = requests.post(
                self.server_url,
                json=data,
                timeout=30
            )

            # Any non-error status counts as success (e.g. 201 or 204 as well as 200).
            if response.ok:
                logger.info("✅ Data uploaded successfully")
            else:
                logger.warning(f"Upload failed: {response.status_code}")

        except Exception as e:
            logger.error(f"Upload error: {e}")

    def run(self, limit=5):
        """Run a full collection: list, details, summary, optional upload.

        Args:
            limit: Maximum number of inspections to collect; ``None`` for all
        """
        try:
            self.setup_browser()

            inspection_list = self.collect_inspection_list()

            if not inspection_list:
                logger.error("No inspections found")
                return

            # Derived from the same selection the detail pass uses, so the
            # number is right for limit=None as well.
            planned = len(self._select_inspections(inspection_list, limit))
            logger.info(f"\nCollecting details for {planned} inspections...")
            all_data = self.collect_inspection_details(inspection_list, limit=limit)

            self.save_summary(all_data)

            if self.server_upload:
                self.upload_to_server(all_data)

            print("\n" + "="*60)
            print("COLLECTION COMPLETE")
            print("="*60)
            print(f"Total inspections found: {len(inspection_list)}")
            print(f"Details collected: {len(all_data)}")
            print(f"PDFs downloaded: {sum(1 for d in all_data if d.get('pdf_downloaded'))}")
            print(f"\nData saved in: {os.path.abspath(self.data_dir)}/")
            print("\nFiles created:")
            print(f"  - {self.data_dir}/collection_summary.json")
            print(f"  - {self.data_dir}/inspections.csv")
            print(f"  - {self.data_dir}/json/ (individual inspection JSONs)")
            print(f"  - {self.data_dir}/pdfs/ (PDF reports)")
            print(f"  - {self.data_dir}/screenshots/ (page screenshots)")

        except Exception as e:
            # Top-level boundary: log with traceback instead of crashing.
            logger.exception(f"Collection failed: {e}")

        finally:
            if self.driver:
                self.driver.quit()
                # Drop the dead handle so it cannot be reused after quit().
                self.driver = None

def main():
    """Print the start-up banner, then run one collection with the settings below."""
    rule = "=" * 60
    banner_lines = (
        rule,
        "SF INSPECTION DATA COLLECTOR - LOCAL VERSION",
        rule,
        "\nThis script should be run on your LOCAL machine",
        "where you can access the SF inspection site.\n",
    )
    for line in banner_lines:
        print(line)

    # Run settings - edit these values to change how the collection behaves.
    run_headless = False    # True runs the browser in the background (might get blocked)
    upload_results = False  # True uploads the collected data to your server
    max_inspections = 5     # Number of inspections to collect

    # Build the collector and run the collection in one go.
    LocalSFCollector(
        headless=run_headless,
        server_upload=upload_results,
    ).run(limit=max_inspections)

# Start a collection only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()