#!/usr/bin/env python3
"""
San Francisco Restaurant Inspection Scraper
Attempts to scrape recent inspection data from the SF Health Department
"""

import requests
from bs4 import BeautifulSoup
import json
import time
from datetime import datetime

def scrape_sf_inspections():
    """
    Try to fetch a small sample of SF restaurant inspection data.

    Each candidate data.sfgov.org endpoint is requested in order. An endpoint
    contributes to the result only when it answers HTTP 200 with a non-empty
    JSON array; at most the first five records of that array are kept.

    If no endpoint produced data, the MyHealthDepartment page is requested as
    a fallback. That fallback only prints the HTTP status and the page title;
    it does not add anything to the returned mapping.

    Progress and failures are reported with ``print``. Request failures and
    undecodable JSON bodies are reported and skipped rather than raised.

    Returns:
        dict: Maps each successful endpoint URL to a list of up to five
        decoded JSON records. Empty when no endpoint returned usable data.
    """

    # Candidate data.sfgov.org API endpoints, tried in order
    api_endpoints = [
        "https://data.sfgov.org/resource/pyih-qa8i.json?$limit=5&$order=inspection_date DESC",  # Old LIVES endpoint
        "https://data.sfgov.org/resource/y8fp-fbf5.json?$limit=5",  # Potential new endpoint
        "https://data.sfgov.org/resource/tvy3-wexg.json?$limit=5",  # Health Inspection Scores (2023-Present)
    ]

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36',
        'Accept': 'application/json',
    }

    results = {}

    for endpoint in api_endpoints:
        print(f"\nTrying endpoint: {endpoint}")
        # Keep the try body limited to the calls that can actually fail:
        # the HTTP request and the JSON decode.
        try:
            response = requests.get(endpoint, headers=headers, timeout=10)
            if response.status_code != 200:
                print(f"Failed with status code: {response.status_code}")
                continue
            data = response.json()
        except (requests.RequestException, ValueError) as e:
            # RequestException: connection, timeout, or URL problems.
            # ValueError: the body was not valid JSON.
            print(f"Error: {e}")
            continue

        if not data:
            print("Empty response")
        elif not isinstance(data, list):
            # A JSON object (or scalar) cannot be sliced into records;
            # report it instead of claiming success and then failing.
            print(f"Unexpected response format: {type(data).__name__}")
        else:
            print(f"Success! Found {len(data)} records")
            results[endpoint] = data[:5]  # Keep only the first 5 records

    # Try direct HTML scraping as fallback
    if not results:
        print("\nTrying HTML scraping...")
        try:
            # Try to get the main page
            url = "https://inspections.myhealthdepartment.com/san-francisco"
            response = requests.get(url, headers=headers, timeout=10)
            print(f"MyHealthDepartment status: {response.status_code}")

            if response.status_code == 200:
                soup = BeautifulSoup(response.text, 'html.parser')
                # Look for inspection data in the HTML
                # This would need to be adjusted based on actual HTML structure
                print("Page title:", soup.title.string if soup.title else "No title")
        except Exception as e:
            # Deliberately broad: this is a best-effort diagnostic probe, and
            # HTML parser failures are not limited to request exceptions.
            print(f"HTML scraping error: {e}")

    return results

if __name__ == "__main__":
    # Script entry point: run the scraper once, echo a preview of whatever
    # came back, and persist the full sample as JSON.
    divider = "=" * 50
    print("Starting San Francisco inspection data scraper...")
    print(divider)

    results = scrape_sf_inspections()

    if not results:
        print("\nNo data could be retrieved from any source")
    else:
        print(f"\n{divider}")
        print("RESULTS FOUND:")
        for source_url, records in results.items():
            print(f"\nFrom {source_url}:")
            for position, record in enumerate(records, start=1):
                print(f"\nRecord {position}:")
                # Preview only the first ten fields of each record
                preview_fields = list(record)[:10]
                for field in preview_fields:
                    # Cap each value at 100 characters to keep output readable
                    shown = str(record[field])[:100]
                    print(f"  {field}: {shown}")

        # Write the collected sample to disk
        serialized = json.dumps(results, indent=2, default=str)
        with open('sf_inspection_sample.json', 'w') as out_file:
            out_file.write(serialized)
        print("\nData saved to sf_inspection_sample.json")