@echo off
REM Installer entry point: turn off command echo, set the window title and
REM console colours, then print the start-up banner.
title SF Inspection Collector - Full Historical Scraper
color 0A

REM Each banner line is quoted so its spacing survives; the tilde modifier
REM removes the quotes again when the line is printed.
for %%L in (
    "============================================"
    "  SF INSPECTION COLLECTOR"
    "  Full Historical Data Scraper"
    "  With Date Search and Pagination"
    "============================================"
) do echo %%~L
echo(

REM Set up paths
REM PYTHON_EXE defaults to the per-user Python 3.13 location of whoever runs
REM this installer rather than one fixed profile folder. When that file is
REM missing, the first python.exe reported by the where command is used.
REM If nothing is found the default stays in place and the existence check
REM further down reports the problem.
REM NOTE for review: where can also return the Microsoft Store alias stub;
REM confirm the fallback on target machines.
set "PYTHON_EXE=%LOCALAPPDATA%\Programs\Python\Python313\python.exe"
if exist "%PYTHON_EXE%" goto :python_path_resolved
for /f "delims=" %%P in ('where python.exe 2^>nul') do (
    set "PYTHON_EXE=%%P"
    goto :python_path_resolved
)
:python_path_resolved
REM Quoted assignments keep stray trailing spaces out of the values.
set "INSTALL_DIR=%LOCALAPPDATA%\SFCollector"
set "SCRIPT_FILE=%INSTALL_DIR%\sf_full_scraper.py"

REM Verify that the configured interpreter exists before doing anything else
echo Checking Python...
if exist "%PYTHON_EXE%" goto :python_present
REM Interpreter missing: report, wait for a key press, exit with code 1
echo ERROR: Python not found
pause
exit /b 1
:python_present
REM Print the interpreter version followed by a blank line
"%PYTHON_EXE%" --version
echo.

REM Create directory
echo Creating installation directory...
if not exist "%INSTALL_DIR%" mkdir "%INSTALL_DIR%"
REM Stop when the directory is still missing, because every later step
REM writes into it or runs from it.
if not exist "%INSTALL_DIR%\" (
    echo ERROR: Could not create installation directory
    pause
    exit /b 1
)

REM Write the full scraper Python script
REM Every echo line inside the group that opens below becomes one line of
REM the generated Python file.
echo Creating full scraper script...
(
echo # Standard-library imports
echo import base64
echo import hashlib
echo import json
echo import os
echo import sys
echo import time
echo import warnings
echo from datetime import datetime, timedelta
echo #
echo # Suppress Python warnings; the WDM variables are presumably read by
echo # webdriver-manager to quiet its own output - confirm against its docs
echo warnings.filterwarnings('ignore'^)
echo os.environ.update({'WDM_LOG': '0', 'WDM_PROGRESS': '0'}^)
echo #
echo print('SF INSPECTION COLLECTOR - FULL HISTORICAL SCRAPER'^)
echo print('=' * 50^)
echo print(^)
echo #
echo # Third-party imports. On the first ImportError the packages are installed
echo # with pip and the imports are attempted once more; a second failure
echo # propagates as ImportError.
echo for _import_attempt in range(2^):
echo     try:
echo         from selenium import webdriver
echo         from selenium.webdriver.common.by import By
echo         from selenium.webdriver.common.keys import Keys
echo         from selenium.webdriver.chrome.options import Options
echo         from selenium.webdriver.chrome.service import Service
echo         from selenium.webdriver.support.ui import WebDriverWait
echo         from selenium.webdriver.support import expected_conditions as EC
echo         from webdriver_manager.chrome import ChromeDriverManager
echo         import requests
echo     except ImportError:
echo         if _import_attempt:
echo             raise
echo         print('Installing required packages...'^)
echo         import subprocess
echo         subprocess.run([sys.executable, '-m', 'pip', 'install', 'selenium', 'requests', 'webdriver-manager']^)
echo     else:
echo         break
echo #
echo # CONFIGURATION
echo SERVER_URL = 'https://cleankitchens.org/data/sf/api_receiver_v2.php'
echo # NOTE for review: a credential is embedded in this generated file. It can
echo # now be overridden with the SF_COLLECTOR_API_KEY environment variable;
echo # consider dropping the literal fallback and rotating the key.
echo API_KEY = os.environ.get('SF_COLLECTOR_API_KEY'^) or 'sk-sf-inspections-2025'  # Your actual API key
echo UPLOAD_ENABLED = True
echo START_DATE = '01/01/2025'  # MM/DD/YYYY format
echo END_DATE = datetime.now(^).strftime('%%m/%%d/%%Y'^)  # Today
echo MAX_SHOW_MORE_CLICKS = 50  # Safety limit
echo WAIT_TIME = 3  # Seconds between actions
echo #
echo # Tracking and log files live in the directory the script is started from
echo TRACKING_FILE = os.path.join(os.getcwd(^), 'processed_inspections.json'^)
echo LOG_FILE = os.path.join(os.getcwd(^), 'scraper_log.txt'^)
echo #
echo def log_message(message^):
echo     """Print a timestamped message and append it to the scraper log file"""
echo     # The log is written as UTF-8 so that facility names outside the default
echo     # Windows code page cannot raise UnicodeEncodeError while logging.
echo     timestamp = datetime.now(^).strftime('%%Y-%%m-%%d %%H:%%M:%%S'^)
echo     log_entry = f'[{timestamp}] {message}'
echo     print(log_entry^)
echo     with open(LOG_FILE, 'a', encoding='utf-8'^) as f:
echo         f.write(log_entry + '\n'^)
echo #
echo def load_processed_ids(^):
echo     """Return the set of inspection IDs recorded in the tracking file"""
echo     # No tracking file yet means nothing has been processed
echo     if not os.path.exists(TRACKING_FILE^):
echo         return set(^)
echo     try:
echo         with open(TRACKING_FILE, 'r'^) as f:
echo             data = json.load(f^)
echo         return set(data.get('processed_ids', []^)^)
echo     except Exception as e:
echo         # Unreadable or malformed tracking data still counts as empty, but it
echo         # is reported so a damaged file does not go unnoticed. Catching
echo         # Exception instead of everything lets Ctrl+C stop the run.
echo         log_message(f'Could not read tracking file: {str(e^)[:100]}'^)
echo         return set(^)
echo #
echo def save_processed_id(inspection_id^):
echo     """Add one inspection ID to the tracking file"""
echo     processed = load_processed_ids(^)
echo     processed.add(inspection_id^)
echo     data = {
echo         'processed_ids': list(processed^),
echo         'last_update': datetime.now(^).isoformat(^),
echo         'total_processed': len(processed^)
echo     }
echo     # Write a sibling temp file and swap it in with os.replace so that an
echo     # interrupted run cannot leave a truncated tracking file behind.
echo     temp_file = TRACKING_FILE + '.tmp'
echo     with open(temp_file, 'w'^) as f:
echo         json.dump(data, f, indent=2^)
echo     os.replace(temp_file, TRACKING_FILE^)
echo #
echo def upload_to_server(data, pdf_path=None^):
echo     """Upload inspection data and optional PDF to server and return True on success"""
echo     # data must provide the inspection_id and facility_name keys.
echo     # Returns False when uploads are disabled, when the server answers with a
echo     # status other than 200, when the JSON reply has no truthy success field,
echo     # or when any exception occurs. Each failure reason is logged.
echo     if not UPLOAD_ENABLED:
echo         return False
echo     try:
echo         payload = {
echo             'action': 'upload_inspection',
echo             'inspection_id': data['inspection_id'],
echo             'facility_name': data['facility_name'],
echo             'inspection_data': data
echo         }
echo         # Attach the PDF as base64 text when a file was actually saved
echo         if pdf_path and os.path.exists(pdf_path^):
echo             with open(pdf_path, 'rb'^) as f:
echo                 pdf_content = f.read(^)
echo             payload['pdf_base64'] = base64.b64encode(pdf_content^).decode('utf-8'^)
echo             payload['pdf_filename'] = os.path.basename(pdf_path^)
echo         headers = {'X-API-Key': API_KEY, 'Content-Type': 'application/json'}
echo         response = requests.post(SERVER_URL, json=payload, headers=headers, timeout=30^)
echo         if response.status_code != 200:
echo             log_message(f'Upload rejected: HTTP {response.status_code}'^)
echo             return False
echo         if response.json(^).get('success'^):
echo             return True
echo         log_message('Upload rejected: server did not report success'^)
echo     except Exception as e:
echo         log_message(f'Upload error: {str(e^)[:100]}'^)
echo     return False
echo #
echo # Per-run output folder named after the start time, holding one folder
echo # for JSON records and one for PDFs
echo timestamp = datetime.now(^).strftime('%%Y%%m%%d_%%H%%M%%S'^)
echo data_dir = os.path.join(os.getcwd(^), 'sf_data_' + timestamp^)
echo os.makedirs(data_dir, exist_ok=True^)
echo for _subfolder in ('json', 'pdfs'^):
echo     os.makedirs(os.path.join(data_dir, _subfolder^), exist_ok=True^)
echo #
echo log_message('Starting Chrome browser...'^)
echo _pdf_download_dir = os.path.join(data_dir, 'pdfs'^)
echo options = Options(^)
echo for _chrome_flag in ('--log-level=3', '--disable-logging'^):
echo     options.add_argument(_chrome_flag^)
echo options.add_experimental_option('excludeSwitches', ['enable-logging']^)
echo # Chrome download preferences, pointed at the pdfs folder of this run
echo prefs = {}
echo prefs['download.default_directory'] = _pdf_download_dir
echo prefs['download.prompt_for_download'] = False
echo prefs['download.directory_upgrade'] = True
echo prefs['plugins.always_open_pdf_externally'] = True
echo prefs['safebrowsing.enabled'] = False
echo options.add_experimental_option('prefs', prefs^)
echo #
echo # Start the browser; any failure is logged and ends the script with code 1
echo try:
echo     _driver_binary = ChromeDriverManager(^).install(^)
echo     service = Service(_driver_binary^)
echo     service.log_path = os.devnull
echo     driver = webdriver.Chrome(service=service, options=options^)
echo     _download_behavior = {'behavior': 'allow', 'downloadPath': _pdf_download_dir}
echo     driver.execute_cdp_cmd('Page.setDownloadBehavior', _download_behavior^)
echo     log_message('Browser started successfully!'^)
echo except Exception as e:
echo     log_message(f'Error starting Chrome: {e}'^)
echo     input('Press Enter to exit...'^)
echo     sys.exit(1^)
echo #
echo log_message('Accessing SF inspection site...'^)
echo driver.get('https://inspections.myhealthdepartment.com/san-francisco'^)
echo time.sleep(WAIT_TIME^)
echo #
echo # Give up when the page title contains 403
echo _page_title = driver.title
echo if _page_title.find('403'^) != -1:
echo     log_message('ERROR: Site blocked access'^)
echo     driver.quit(^)
echo     input('Press Enter to exit...'^)
echo     sys.exit(1^)
echo #
echo log_message('Site accessed successfully!'^)
echo #
echo # Type the configured date range into the search form
echo log_message(f'Setting date range: {START_DATE} to {END_DATE}'^)
echo try:
echo     # The selectors below may need adjusting to match the actual site
echo     wait = WebDriverWait(driver, 10^)
echo     date_inputs = driver.find_elements(By.CSS_SELECTOR, 'input[type="date"], input[placeholder*="date" i]'^)
echo     if len(date_inputs^) ^< 2:
echo         log_message('Warning: Could not find date input fields'^)
echo     else:
echo         # The first two matches receive the start and end date in that
echo         # order, with a one second pause after each
echo         for _date_field, _date_text in zip(date_inputs, (START_DATE, END_DATE^)^):
echo             _date_field.clear(^)
echo             _date_field.send_keys(_date_text^)
echo             time.sleep(1^)
echo         # Submit by pressing Enter in the end-date field
echo         date_inputs[1].send_keys(Keys.RETURN^)
echo         time.sleep(WAIT_TIME^)
echo         log_message('Date range set successfully'^)
echo except Exception as e:
echo     log_message(f'Could not set date range: {e}'^)
echo #
echo # Load processed IDs
echo processed_ids = load_processed_ids(^)
echo log_message(f'Loaded {len(processed_ids^)} previously processed inspections'^)
echo #
echo # Collect all inspections with pagination
echo from selenium.common.exceptions import NoSuchElementException
echo all_inspections = []
echo seen_ids = set(^)
echo show_more_clicks = 0
echo #
echo # Each pass scans the page first and clicks afterwards, so results loaded
echo # by the last permitted click are still collected before the loop ends.
echo while True:
echo     current_count = 0
echo     for link in driver.find_elements(By.TAG_NAME, 'a'^):
echo         href = link.get_attribute('href'^) or ''
echo         if 'inspectionID=' not in href:
echo             continue
echo         name = link.text.strip(^)
echo         inspection_id = href.split('inspectionID='^)[1]
echo         # Links without text are skipped; a set keeps the duplicate check
echo         # cheap however many results are loaded
echo         if name and inspection_id not in seen_ids:
echo             seen_ids.add(inspection_id^)
echo             all_inspections.append({'id': inspection_id, 'url': href, 'name': name}^)
echo             current_count += 1
echo #
echo     log_message(f'Found {current_count} new inspections (total: {len(all_inspections^)}^)'^)
echo #
echo     if show_more_clicks ^>= MAX_SHOW_MORE_CLICKS:
echo         log_message(f'Reached Show More click limit ({MAX_SHOW_MORE_CLICKS}^)'^)
echo         break
echo #
echo     # Look for the Show More control. The XPath and the JavaScript snippet
echo     # are single-quoted Python strings: this file is generated by a batch
echo     # installer, and escape carets placed inside double quotes would be
echo     # written out literally and corrupt both expressions.
echo     try:
echo         show_more_button = driver.find_element(By.XPATH, '//button[contains(text(^), "Show More"^)] ^| //a[contains(text(^), "Show More"^)]'^)
echo         if show_more_button.is_displayed(^) and show_more_button.is_enabled(^):
echo             driver.execute_script('arguments[0].scrollIntoView(^);', show_more_button^)
echo             time.sleep(1^)
echo             show_more_button.click(^)
echo             show_more_clicks += 1
echo             log_message(f'Clicked Show More button ({show_more_clicks}^)'^)
echo             time.sleep(WAIT_TIME^)
echo         else:
echo             log_message('Show More button not clickable'^)
echo             break
echo     except NoSuchElementException:
echo         log_message('No more Show More button found'^)
echo         break
echo     except Exception as e:
echo         # Any other failure is reported instead of being mistaken for the
echo         # normal end of the result list
echo         log_message(f'Show More pagination stopped: {str(e^)[:100]}'^)
echo         break
echo #
echo log_message(f'Total inspections found: {len(all_inspections^)}'^)
echo #
echo # Keep only the inspections whose ID is not in the tracking data yet
echo new_inspections = []
echo for _candidate in all_inspections:
echo     if _candidate['id'] not in processed_ids:
echo         new_inspections.append(_candidate^)
echo log_message(f'New inspections to process: {len(new_inspections^)}'^)
echo #
echo # Nothing left to do: close the browser and exit with code 0
echo if len(new_inspections^) == 0:
echo     log_message('No new inspections to process'^)
echo     driver.quit(^)
echo     input('Press Enter to exit...'^)
echo     sys.exit(0^)
echo #
echo # Process inspections
echo import glob
echo log_message('Starting collection...'^)
echo log_message('-'*50^)
echo #
echo collected = 0
echo pdfs_downloaded = 0
echo uploaded = 0
echo pdf_dir = os.path.join(data_dir, 'pdfs'^)
echo pdf_pattern = os.path.join(pdf_dir, '*.pdf'^)
echo #
echo # For each new inspection: scrape the page text, fetch the printable PDF,
echo # save a JSON record, then upload. A failure is logged and the loop moves
echo # on to the next inspection.
echo for i, insp in enumerate(new_inspections, 1^):
echo     try:
echo         log_message(f'[{i}/{len(new_inspections^)}] Processing: {insp["name"][:50]}'^)
echo #
echo         # Get inspection page
echo         driver.get(insp['url']^)
echo         time.sleep(2^)
echo         inspection_page_text = driver.find_element(By.TAG_NAME, 'body'^).text
echo         log_message('  - Inspection page scraped'^)
echo #
echo         # Download PDF
echo         pdf_url = f'https://inspections.myhealthdepartment.com/san-francisco/print/?task=getPrintable^&path=san-francisco^&pKey={insp["id"]},{insp["id"]}'
echo         pdf_path = os.path.join(pdf_dir, f'{insp["id"]}.pdf'^)
echo #
echo         # Snapshot the folder before the download so that only a file which
echo         # appears during this download can be claimed. Picking the newest
echo         # PDF in the folder would hand an earlier inspection's file to this
echo         # one whenever the download fails.
echo         # NOTE for review: a download that needs longer than the fixed wait
echo         # is still missed here.
echo         pdfs_before = set(glob.glob(pdf_pattern^)^)
echo         driver.get(pdf_url^)
echo         time.sleep(5^)
echo #
echo         # Check for downloaded PDF
echo         pdf_downloaded = False
echo         fresh_pdfs = [p for p in glob.glob(pdf_pattern^) if p not in pdfs_before]
echo         if fresh_pdfs:
echo             latest_pdf = max(fresh_pdfs, key=os.path.getctime^)
echo             if latest_pdf != pdf_path:
echo                 # os.replace overwrites an existing target, which os.rename
echo                 # refuses to do on Windows
echo                 os.replace(latest_pdf, pdf_path^)
echo             log_message(f'  - PDF saved as {insp["id"]}.pdf'^)
echo             pdfs_downloaded += 1
echo             pdf_downloaded = True
echo #
echo         # Save data
echo         data = {
echo             'inspection_id': insp['id'],
echo             'facility_name': insp['name'],
echo             'inspection_page_text': inspection_page_text[:10000],
echo             'pdf_url': pdf_url,
echo             'pdf_downloaded': pdf_downloaded,
echo             'collected_at': datetime.now(^).isoformat(^),
echo             'url': insp['url']
echo         }
echo #
echo         json_file = os.path.join(data_dir, 'json', f'{insp["id"]}.json'^)
echo         with open(json_file, 'w', encoding='utf-8'^) as f:
echo             json.dump(data, f, indent=2, ensure_ascii=False^)
echo         log_message('  - Data saved locally'^)
echo         collected += 1
echo #
echo         # Upload to server
echo         if UPLOAD_ENABLED:
echo             if upload_to_server(data, pdf_path if pdf_downloaded else None^):
echo                 log_message('  - Uploaded to server!'^)
echo                 uploaded += 1
echo                 # Mark as processed only if uploaded successfully
echo                 save_processed_id(insp['id']^)
echo             else:
echo                 log_message('  - Server upload failed'^)
echo         else:
echo             # Mark as processed even without upload
echo             save_processed_id(insp['id']^)
echo #
echo     except Exception as e:
echo         log_message(f'  ERROR: {str(e^)[:100]}'^)
echo #
echo driver.quit(^)
echo print(^)
echo # Final summary, sent through log_message so it is printed and logged
echo _rule = '='*50
echo for _summary_line in (_rule, 'COLLECTION COMPLETE', _rule^):
echo     log_message(_summary_line^)
echo log_message(f'Inspections collected: {collected}/{len(new_inspections^)}'^)
echo log_message(f'PDFs downloaded: {pdfs_downloaded}'^)
echo if UPLOAD_ENABLED:
echo     log_message(f'Uploaded to server: {uploaded}'^)
echo _data_dir_abs = os.path.abspath(data_dir^)
echo log_message(f'Data saved to: {_data_dir_abs}'^)
echo log_message(f'Total processed inspections tracked: {len(load_processed_ids(^)^)}'^)
echo #
echo # Best effort: open the output folder; any failure is ignored
echo try:
echo     os.startfile(_data_dir_abs^)
echo except:
echo     pass
echo #
echo input('\nPress Enter to exit...'^)
) > "%SCRIPT_FILE%"

REM Create desktop shortcut
REM The launcher is a small batch file on the desktop that changes into the
REM install directory, runs the generated script and then pauses. The first
REM redirect creates or truncates the file; the rest append one line each.
echo Creating desktop shortcut...
set "SHORTCUT_FILE=%USERPROFILE%\Desktop\SF_Full_Scraper.bat"
> "%SHORTCUT_FILE%" echo @echo off
>> "%SHORTCUT_FILE%" echo title SF Full Scraper
>> "%SHORTCUT_FILE%" echo cd /d "%INSTALL_DIR%"
>> "%SHORTCUT_FILE%" echo "%PYTHON_EXE%" "%SCRIPT_FILE%"
>> "%SHORTCUT_FILE%" echo pause

REM Install packages
REM pip output stays hidden, but a failure is reported instead of being
REM discarded. The generated script attempts the same installation itself
REM when its imports fail, so this is only a warning.
echo Installing required packages...
"%PYTHON_EXE%" -m pip install selenium requests webdriver-manager >nul 2>&1
if errorlevel 1 echo WARNING: pip could not install the packages. The scraper will try again when it starts.

REM Completion banner
echo.
echo ============================================
echo   INSTALLATION COMPLETE!
echo ============================================
echo.
REM Tell the user where the files went
echo Installed to: %INSTALL_DIR%
echo Desktop shortcut created: SF_Full_Scraper
echo.
REM Feature overview; the start date shown matches START_DATE in the
REM generated script
echo Features:
echo   - Date range search (01/01/2025 to today)
echo   - Automatic "Show More" pagination
echo   - Duplicate detection
echo   - Progress tracking
echo   - Server upload with API key
echo   - Daily incremental updates
echo.
REM Outline of what the scraper run that follows will do
echo The script will:
echo   1. Search for all 2025 inspections
echo   2. Click "Show More" to load all results
echo   3. Skip already processed inspections
echo   4. Download PDFs and data
echo   5. Upload to your server
echo   6. Track progress for next run
echo.
REM Wait five seconds before launching; the countdown output is discarded
echo Starting in 5 seconds...
timeout /t 5 >nul

REM Run the scraper
echo.
echo Running full scraper...
REM The generated script builds its tracking, log and data paths from the
REM current directory, so it must not be started from anywhere else.
cd /d "%INSTALL_DIR%" || (
    echo ERROR: Could not change to the installation directory
    pause
    exit /b 1
)
"%PYTHON_EXE%" "%SCRIPT_FILE%"

REM Keep the window open until a key is pressed; the pause prompt is hidden
echo.
echo Press any key to close...
pause >nul