#!/home/chris/cleankitchens-env/bin/python3
"""
Process SF inspection PDFs into Weaviate and generate stories
Direct processing without MySQL middleman
"""

import os
import json
import PyPDF2
import re
import weaviate
from datetime import datetime
from anthropic import Anthropic
from dotenv import load_dotenv
import hashlib

# Load environment variables
load_dotenv('/home/chris/.env')

class SFWeaviateProcessor:
    """Parse SF inspection PDFs and store the extracted fields in Weaviate.

    The constructor connects to a local Weaviate instance, creates an
    Anthropic client and makes sure the ``SFTemp`` collection exists.
    Call :meth:`close` (or use the instance as a context manager) to
    release the Weaviate connection when done.
    """

    COLLECTION_NAME = "SFTemp"
    PDF_DIR = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf/inspection_data/pdfs"
    JSON_DIR = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf/inspection_data/json"

    # Schema of the SFTemp collection, in creation order. Every property is
    # TEXT except the ones listed in _INT_PROPERTIES.
    _PROPERTY_NAMES = (
        "facility_name",
        "address",
        "inspection_date",
        "inspection_status",
        "inspector_name",
        "violations",
        "violation_count",
        "corrective_actions",
        "observations",
        "pdf_text",
        "inspection_id",
    )
    _INT_PROPERTIES = frozenset({"violation_count"})

    # Console marker per inspection status; anything else gets a neutral icon.
    _STATUS_ICONS = {
        'CLOSURE': "🔴",
        'CONDITIONAL PASS': "🟡",
        'PASS': "🟢",
    }

    # The section terminators are lookaheads so that a terminator which is
    # itself the start of the next violation ("2 - ...") is not consumed and
    # the next violation can still be matched.
    _VIOLATION_PATTERN = (
        r'(\d{1,2})\s*-\s*([A-Z][A-Z\s,:;&]+?)'
        r'(?=Corrective Action:|Observation:|California Retail Food Code:|\d{1,2}\s*-|$)'
    )
    _CORRECTIVE_PATTERN = r'Corrective Action:\s*([^:]+?)(?:Observation:|California Retail Food Code:|$)'
    _OBSERVATION_PATTERN = r'Observation:\s*([^:]+?)(?:Corrective Action:|California Retail Food Code:|$)'

    def __init__(self):
        # Stays None when setup_collection() fails, so later code can detect
        # the failure instead of hitting an AttributeError.
        self.collection = None

        # Connect to Weaviate
        self.client = weaviate.connect_to_local()

        # Initialize Anthropic client
        self.anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))

        # Create SF collection if not exists
        self.setup_collection()

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc, tb):
        self.close()

    def close(self):
        """Close the Weaviate connection opened by the constructor."""
        self.client.close()

    def setup_collection(self):
        """Create the SFTemp collection if it is missing and bind it.

        On success ``self.collection`` refers to the collection. Errors are
        printed and leave ``self.collection`` unchanged.
        """
        try:
            if self.COLLECTION_NAME not in self.client.collections.list_all():
                config = weaviate.classes.config
                self.client.collections.create(
                    name=self.COLLECTION_NAME,
                    properties=[
                        config.Property(
                            name=prop,
                            data_type=(
                                config.DataType.INT
                                if prop in self._INT_PROPERTIES
                                else config.DataType.TEXT
                            ),
                        )
                        for prop in self._PROPERTY_NAMES
                    ],
                )
                print("✓ Created SFTemp collection")

            self.collection = self.client.collections.get(self.COLLECTION_NAME)

        except Exception as e:
            print(f"Collection setup error: {e}")

    @staticmethod
    def _read_pdf_text(pdf_path):
        """Return the concatenated text of every page of the PDF."""
        with open(pdf_path, 'rb') as file:
            reader = PyPDF2.PdfReader(file)
            # Guard against a page yielding no text at all.
            return "".join(page.extract_text() or "" for page in reader.pages)

    @staticmethod
    def _detect_status(text):
        """Classify the report text as CONDITIONAL PASS, CLOSURE, PASS or UNKNOWN."""
        # Any whitespace run (spaces, line breaks) may separate the two words
        # in extracted PDF text.
        if re.search(r'CONDITIONAL\s+PASS', text):
            return 'CONDITIONAL PASS'
        has_pass = 'PASS' in text
        if 'CLOSURE' in text and not has_pass:
            return 'CLOSURE'
        if has_pass and 'CONDITIONAL' not in text:
            return 'PASS'
        return 'UNKNOWN'

    @staticmethod
    def _collect_sections(pattern, text):
        """Join all whitespace-normalised, non-empty matches of pattern with ' | '."""
        cleaned = (
            re.sub(r'\s+', ' ', chunk).strip()
            for chunk in re.findall(pattern, text, re.DOTALL)
        )
        return ' | '.join(chunk for chunk in cleaned if chunk)

    @classmethod
    def _parse_inspection_text(cls, text):
        """Parse the extracted report text into a property dict.

        Header fields (date, facility, address, inspector) are only present
        in the result when their pattern matches. ``violations`` is a
        JSON-encoded list of "Code N: DESCRIPTION" strings and ``pdf_text``
        holds the first 10000 characters of the text.
        """
        data = {}

        header_fields = (
            ('inspection_date', r'Insp(?:ection)?\s+Date\s*(\d{2}/\d{2}/\d{4})', re.IGNORECASE),
            ('facility_name', r'Permit Name\s*([^\n]+)', re.IGNORECASE),
            ('address', r'Address\s*(\d+[^,\n]+(?:,\s*[^,\n]+)*,?\s*SAN FRANCISCO[^,\n]*CA\s*\d{5})', re.IGNORECASE),
            ('inspector_name', r'Inspector\s+([A-Za-z\s]+?)(?:Inspector Email|Inspector Phone|\n)', 0),
        )
        for key, pattern, flags in header_fields:
            if match := re.search(pattern, text, flags):
                data[key] = match.group(1).strip()

        data['inspection_status'] = cls._detect_status(text)

        violations = []
        for code, description in re.findall(cls._VIOLATION_PATTERN, text, re.MULTILINE | re.DOTALL):
            desc_clean = re.sub(r'\s+', ' ', description).strip()
            # Very short captures are treated as noise rather than violations.
            if len(desc_clean) > 10:
                violations.append(f"Code {code}: {desc_clean[:200]}")

        data['violations'] = json.dumps(violations)
        data['violation_count'] = len(violations)
        data['corrective_actions'] = cls._collect_sections(cls._CORRECTIVE_PATTERN, text)
        data['observations'] = cls._collect_sections(cls._OBSERVATION_PATTERN, text)
        data['pdf_text'] = text[:10000]  # Limit size

        return data

    def extract_pdf_data(self, pdf_path):
        """Extract text and parse inspection data from PDF.

        Returns the property dict (including ``inspection_id``, the PDF file
        name without extension), or None after printing the error when the
        file cannot be read or parsed.
        """
        try:
            data = self._parse_inspection_text(self._read_pdf_text(pdf_path))
            data['inspection_id'] = os.path.splitext(os.path.basename(pdf_path))[0]
            return data

        except Exception as e:
            print(f"Error extracting PDF {pdf_path}: {e}")
            return None

    @staticmethod
    def _merge_sidecar_json(data, json_path):
        """Fill a missing facility_name in data from the sidecar JSON file, if any.

        A missing, unreadable or malformed sidecar file only produces a
        warning; it never prevents the inspection from being loaded.
        """
        if 'facility_name' in data or not os.path.exists(json_path):
            return
        try:
            with open(json_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
        except (OSError, ValueError) as e:
            print(f"Warning: could not read {json_path}: {e}")
            return
        if isinstance(json_data, dict) and 'facility_name' in json_data:
            data['facility_name'] = json_data['facility_name']

    def load_pdfs_to_weaviate(self, limit=5, pdf_dir=None, json_dir=None):
        """Load inspection PDFs into Weaviate.

        Args:
            limit: maximum number of PDFs to process (None for all). PDFs
                are taken in sorted file-name order.
            pdf_dir: directory containing the PDFs; defaults to PDF_DIR.
            json_dir: directory containing the sidecar JSON files;
                defaults to JSON_DIR.

        Returns:
            The number of inspections inserted.

        NOTE(review): every run inserts new objects, so re-running over the
        same PDFs creates duplicates in the collection.
        """
        pdf_dir = pdf_dir or self.PDF_DIR
        json_dir = json_dir or self.JSON_DIR

        if self.collection is None:
            print("Error loading to Weaviate: SFTemp collection is not available")
            return 0

        pdf_files = sorted(f for f in os.listdir(pdf_dir) if f.lower().endswith('.pdf'))
        batch = pdf_files if limit is None else pdf_files[:limit]

        print(f"Loading {len(batch)} of {len(pdf_files)} PDFs to Weaviate...")

        loaded = 0
        for pdf_file in batch:
            data = self.extract_pdf_data(os.path.join(pdf_dir, pdf_file))
            if not data:
                continue

            # Sidecar JSON shares the PDF's base name.
            stem = os.path.splitext(pdf_file)[0]
            self._merge_sidecar_json(data, os.path.join(json_dir, stem + '.json'))

            try:
                self.collection.data.insert(data)
            except Exception as e:
                print(f"Error loading to Weaviate: {e}")
                continue

            loaded += 1
            status = data['inspection_status']
            status_icon = self._STATUS_ICONS.get(status, "⚪")
            print(f"{status_icon} Loaded: {data.get('facility_name', 'Unknown')[:30]} - {status}")

        print(f"✓ Loaded {loaded} inspections to Weaviate")
        return loaded

class SFStoryGenerator:
    """Turn inspections stored in the SFTemp collection into posted articles.

    For each inspection the generator asks one model for metadata, a second
    model for the article, inserts the article into the ``Articles``
    collection and writes a local JSON copy. Call :meth:`close` to release
    the Weaviate connection when done.
    """

    ANALYSIS_MODEL = "claude-3-haiku-20240307"
    ARTICLE_MODEL = "claude-3-5-sonnet-20241022"
    OUTPUT_DIR = "/var/www/twin-digital-media/public_html/_sites/cleankitchens/data/sf"

    def __init__(self):
        # Connect to Weaviate
        self.client = weaviate.connect_to_local()
        self.collection = self.client.collections.get("SFTemp")

        # Initialize Anthropic client
        self.anthropic = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))

        # Articles collection
        self.articles = self.client.collections.get("Articles")

    def close(self):
        """Close the Weaviate connection opened by the constructor."""
        self.client.close()

    @staticmethod
    def _extract_json(raw):
        """Parse a model reply as JSON, tolerating code fences or prose around it.

        The reply is first parsed as-is; if that fails, the span from the
        first '{' to the last '}' is parsed instead.

        Raises:
            ValueError: when no JSON object can be parsed from the reply.
        """
        text = (raw or "").strip()
        try:
            return json.loads(text)
        except ValueError:
            # Not bare JSON; fall through and look for an embedded object.
            pass
        start, end = text.find('{'), text.rfind('}')
        if start == -1 or end <= start:
            raise ValueError("no JSON object found in model response")
        return json.loads(text[start:end + 1])

    def phase1_analyze(self, inspection):
        """Phase 1: Analyze inspection and extract metadata.

        Returns the parsed JSON metadata from the model, or a small default
        dict (after printing the error) when the call or the parsing fails.
        """
        props = inspection.properties

        # The stored value may be missing, None or not valid JSON.
        try:
            violations = json.loads(props.get('violations') or '[]')
        except (TypeError, ValueError):
            violations = []

        prompt = f"""Analyze this San Francisco restaurant inspection and extract structured metadata.

INSPECTION DATA:
Facility: {props.get('facility_name', 'Unknown')}
Address: {props.get('address', 'Unknown')}
Date: {props.get('inspection_date', 'Unknown')}
Status: {props.get('inspection_status', 'Unknown')}
Violations: {violations}
Corrective Actions: {props.get('corrective_actions', 'None')}

Return ONLY a JSON object:
{{
  "cuisine_type": "detect from name",
  "neighborhood": "SF neighborhood",
  "violation_severity": "Critical/Major/Minor",
  "key_concerns": ["list main issues"],
  "education_focus": "main safety topic",
  "risk_level": "High/Medium/Low"
}}"""

        try:
            response = self.anthropic.messages.create(
                model=self.ANALYSIS_MODEL,
                max_tokens=500,
                temperature=0.1,
                messages=[{"role": "user", "content": prompt}]
            )
            return self._extract_json(response.content[0].text)
        except Exception as e:
            # Best effort: generation continues with generic metadata.
            print(f"Analysis error: {e}")
            return {"violation_severity": "Major", "education_focus": "Food safety"}

    def phase2_generate(self, inspection, metadata):
        """Phase 2: Generate article using Sonnet.

        Returns the article dict produced by the model, extended with the
        inspection fields and fixed publishing fields, or None (after
        printing the error) when generation or parsing fails.
        """
        props = inspection.properties
        status = props.get('inspection_status', 'Unknown')
        facility = props.get('facility_name', 'Unknown Restaurant')

        prompt = f"""Write an educational news article about this San Francisco restaurant inspection.

FACILITY: {facility}
ADDRESS: {props.get('address', 'Unknown')}
DATE: {props.get('inspection_date', 'Unknown')}
STATUS: {status}
VIOLATIONS: {props.get('violation_count', 0)}

DETAILS:
{props.get('corrective_actions', 'None noted')}

METADATA: {json.dumps(metadata)}

Write an engaging, educational article that:
1. Reports the inspection factually
2. Educates about food safety importance
3. Explains why violations matter
4. Uses proper HTML with <p>, <h2>, <ul> tags
5. Minimum 400 words

Return ONLY valid JSON:
{{
  "title": "SEO-friendly title with restaurant name and SF",
  "content": "Full HTML article",
  "excerpt": "2-3 sentence summary",
  "meta_description": "Under 160 chars",
  "tags": ["san-francisco", "food-safety", etc],
  "slug": "url-friendly-slug"
}}"""

        try:
            response = self.anthropic.messages.create(
                model=self.ARTICLE_MODEL,
                max_tokens=2000,
                temperature=0.3,
                messages=[{"role": "user", "content": prompt}]
            )

            article = self._extract_json(response.content[0].text)
            if not isinstance(article, dict):
                raise ValueError("model response is not a JSON object")

            # Add inspection data
            article.update({
                'facility_name': facility,
                'inspection_date': props.get('inspection_date', ''),
                'inspection_status': status,
                'violation_count': props.get('violation_count', 0),
                'city': 'san-francisco',
                'state': 'california',
                'country': 'usa',
                'language': 'en',
                'status': 'published',
                'featured': False,
                'author_id': 2,  # Claude author
                'created_at': datetime.now().isoformat(),
            })

            return article

        except Exception as e:
            print(f"Generation error: {e}")
            return None

    def post_to_website(self, article):
        """Post article to Articles collection.

        Returns the result of the insert, or None (after printing the
        error) when the insert fails.
        """
        try:
            result = self.articles.data.insert(article)
        except Exception as e:
            print(f"  ❌ Post failed: {e}")
            return None
        # Reported outside the try so a missing title cannot turn a
        # successful insert into a reported failure.
        print(f"  ✅ Posted: {str(article.get('title', ''))[:60]}...")
        return result

    def _save_local_copy(self, article, index, output_dir):
        """Write the article as JSON into output_dir; failures only warn."""
        filename = f"sf_article_{datetime.now().strftime('%Y%m%d_%H%M%S')}_{index}.json"
        try:
            with open(os.path.join(output_dir, filename), 'w') as f:
                json.dump(article, f, indent=2)
        except OSError as e:
            print(f"  Warning: could not save local copy {filename}: {e}")

    def generate_and_post_stories(self, limit=5, output_dir=None):
        """Generate and post stories to website.

        Args:
            limit: maximum number of inspections to fetch from Weaviate.
            output_dir: directory for the local JSON copies; defaults to
                OUTPUT_DIR.

        Returns:
            The list of articles that were posted successfully.
        """
        output_dir = output_dir or self.OUTPUT_DIR

        # Get inspections from Weaviate
        inspections = self.collection.query.fetch_objects(limit=limit).objects
        # Fewer objects than `limit` may exist, so report against the real count.
        total = len(inspections)

        print(f"\n{'='*60}")
        print("GENERATING SF STORIES WITH SONNET")
        print(f"{'='*60}\n")

        posted = []
        for i, inspection in enumerate(inspections, 1):
            facility = inspection.properties.get('facility_name') or 'Unknown'
            print(f"\n[{i}/{total}] {facility[:40]}...")

            # Phase 1: Analyze
            print("  Phase 1: Analyzing...")
            metadata = self.phase1_analyze(inspection)

            # Phase 2: Generate with Sonnet
            print("  Phase 2: Generating with Sonnet...")
            article = self.phase2_generate(inspection, metadata)
            if not article:
                continue

            # Post to website
            if not self.post_to_website(article):
                continue
            posted.append(article)

            # Save local copy
            self._save_local_copy(article, i, output_dir)

        print(f"\n{'='*60}")
        print(f"COMPLETE: Posted {len(posted)}/{total} articles")
        print(f"{'='*60}\n")

        return posted

if __name__ == "__main__":
    # Step 1: Load PDFs to Weaviate
    processor = SFWeaviateProcessor()
    try:
        loaded = processor.load_pdfs_to_weaviate()
    finally:
        # Release the Weaviate connection even if loading raised.
        processor.client.close()

    if loaded > 0:
        # Step 2: Generate and post stories
        generator = SFStoryGenerator()
        try:
            articles = generator.generate_and_post_stories(limit=5)
        finally:
            generator.client.close()

        # Only announce success when something was actually posted.
        if articles:
            print("\n📰 Articles generated and posted to website!")
            print("Visit https://cleankitchens.org to review")
        else:
            print("\nNo articles were posted.")
    else:
        print("\nNo inspections were loaded; skipping story generation.")