#!/usr/bin/env python3
"""
Two-Phase Article Generation Test
Phase 1: Analyze and extract metadata
Phase 2: Generate educational article
"""

import os
import json
import weaviate
import re
from datetime import datetime
from anthropic import Anthropic
from dotenv import load_dotenv

# Load environment variables. Set DOTENV_PATH to point at a different .env
# file; when it is unset the original hard-coded location is used.
load_dotenv(os.getenv('DOTENV_PATH', '/home/chris/.env'))

class TwoPhaseGenerator:
    def __init__(self):
        """Create the service clients and the static reference catalogue.

        Opens a local Weaviate connection (kept on ``self.client``) and looks
        up the "ChicagoTemp" collection, builds the Anthropic client from the
        ``ANTHROPIC_API_KEY`` environment variable, and stores the government
        reference links on ``self.gov_references`` as
        ``{topic: {key: (label, url)}}``.
        """
        # Weaviate connection and the collection the inspections are read from
        self.client = weaviate.connect_to_local()
        self.collection = self.client.collections.get("ChicagoTemp")

        # Anthropic client; the key comes from the environment
        api_key = os.getenv('ANTHROPIC_API_KEY')
        self.anthropic = Anthropic(api_key=api_key)

        # Government references, one (label, url) tuple per entry
        temperature_refs = {
            'danger_zone': (
                'USDA Danger Zone',
                'https://www.fsis.usda.gov/food-safety/safe-food-handling-and-preparation/food-safety-basics/danger-zone-40f-140f',
            ),
            'cold_holding': (
                'FDA Cold Storage',
                'https://www.fda.gov/food/buy-store-serve-safe-food/safe-food-storage',
            ),
            'hot_holding': (
                'FDA Food Code 3-501.16',
                'https://www.fda.gov/food/fda-food-code/food-code-2022',
            ),
        }
        hygiene_refs = {
            'handwashing': (
                'CDC Handwashing',
                'https://www.cdc.gov/handwashing/when-how-handwashing.html',
            ),
            'employee_health': (
                'Chicago Employee Health',
                'https://www.chicago.gov/content/dam/city/depts/cdph/FoodProtection/EmployeeHealthPolicySample.pdf',
            ),
        }
        pest_refs = {
            'rodents': (
                'CDC Rodent Diseases',
                'https://www.cdc.gov/rodents/diseases/index.html',
            ),
            'prevention': (
                'EPA IPM',
                'https://www.epa.gov/safepestcontrol/integrated-pest-management-ipm-principles',
            ),
        }
        chicago_refs = {
            'inspections': (
                'Chicago Food Inspections',
                'https://data.cityofchicago.org/Health-Human-Services/Food-Inspections/4ijn-s7e5',
            ),
            'health_dept': (
                'Chicago Health Department',
                'https://www.chicago.gov/city/en/depts/cdph.html',
            ),
        }

        self.gov_references = {
            'temperature': temperature_refs,
            'hygiene': hygiene_refs,
            'pests': pest_refs,
            'chicago': chicago_refs,
        }
    
    def validate_article_json(self, data):
        """Check that a decoded article payload has the required fields.

        Args:
            data: The object decoded from the model's article response.

        Returns:
            tuple[bool, str]: ``(True, "Valid")`` when every check passes,
            otherwise ``(False, reason)`` naming the first failed check.
        """
        # Only a JSON object can carry the article fields; reject lists,
        # strings, numbers and None up front.
        if not isinstance(data, dict):
            return False, "Article must be a JSON object"

        required_fields = ['title', 'content', 'excerpt', 'meta_description', 'tags']
        for field in required_fields:
            if field not in data or not data[field]:
                return False, f"Missing or empty {field}"

        # The text fields must really be strings: len() on a number raises,
        # and len() on a list/dict would measure the wrong thing.
        for field in ('title', 'content', 'excerpt', 'meta_description'):
            if not isinstance(data[field], str):
                return False, f"{field} must be a string"

        # Additional validation
        if len(data['title']) < 10:
            return False, "Title too short"
        if len(data['content']) < 100:
            return False, "Content too short"
        if not isinstance(data['tags'], list) or len(data['tags']) == 0:
            return False, "Tags must be non-empty list"

        return True, "Valid"
    
    def extract_json_manually(self, text):
        """Pull the first JSON object out of free-form response text.

        Decoding starts at the first ``{`` and stops at the end of that
        object, so leading or trailing prose (even prose that itself
        contains braces) does not break the extraction.

        Args:
            text: Raw response text that may embed a JSON object.

        Returns:
            dict: The decoded object, or a fixed placeholder article when no
            object can be decoded. This method never raises for bad input;
            callers rely on always getting a dict back.
        """
        try:
            # Find where the JSON object begins
            start = text.find('{')
            if start == -1:
                raise ValueError("No JSON found in response")

            # raw_decode parses one complete value and ignores what follows
            parsed, _end = json.JSONDecoder().raw_decode(text, start)
            if not isinstance(parsed, dict):
                raise ValueError("Decoded JSON is not an object")
            return parsed
        except (ValueError, TypeError, AttributeError):
            # json.JSONDecodeError is a ValueError; TypeError/AttributeError
            # cover a non-string ``text``. Anything else (KeyboardInterrupt,
            # SystemExit, ...) is deliberately allowed to propagate.
            # Last resort: create minimal valid JSON
            return {
                "title": "Restaurant Inspection Results",
                "content": "<p>Inspection details unavailable due to parsing error.</p>",
                "excerpt": "Restaurant inspection results",
                "meta_description": "Chicago restaurant inspection results",
                "tags": ["chicago", "inspection", "food-safety"],
                "education_summary": "Inspection completed with violations noted."
            }
    
    def parse_json_with_retry(self, response_text, phase_name, max_retries=2):
        """Decode a model response into a dict, validating Phase 2 articles.

        Two strategies are tried in order: parse the text as-is, then extract
        an embedded JSON object with ``extract_json_manually``. Every
        candidate for "Phase 2" must pass ``validate_article_json`` before it
        is returned.

        Args:
            response_text: Raw text returned by the model.
            phase_name: "Phase 2" enables article validation and the
                placeholder fallback; any other value is treated as a
                metadata phase.
            max_retries: Upper bound on extra attempts after the first. Only
                one extra strategy exists, so values above 1 behave like 1.

        Returns:
            dict | None: The decoded object. On failure, "Phase 2" gets the
            placeholder article from ``extract_json_manually("")`` and every
            other phase gets ``None``.
        """
        text = response_text if isinstance(response_text, str) else ""

        # extract_json_manually never raises: it reports failure by returning
        # a fixed placeholder article. Fetch it once so that a failed
        # extraction can be told apart from a real result instead of being
        # mistaken for model output.
        placeholder = self.extract_json_manually("")

        # Both strategies are deterministic, so repeating one cannot succeed
        # where it already failed.
        attempts = max(0, min(max_retries, 1) + 1)

        for attempt in range(attempts):
            data = None

            if attempt == 0:
                # Strategy 1: the response is exactly one JSON object
                if text.startswith('{'):
                    try:
                        data = json.loads(text)
                    except json.JSONDecodeError as e:
                        print(f"⚠️  {phase_name} Attempt {attempt + 1}: JSON parsing error - {e}")
                else:
                    print(f"⚠️  {phase_name} Attempt {attempt + 1}: Response doesn't start with curly brace")
            else:
                # Strategy 2: dig the object out of surrounding text
                extracted = self.extract_json_manually(text)
                if isinstance(extracted, dict) and extracted != placeholder:
                    data = extracted
                else:
                    print(f"⚠️  {phase_name} Attempt {attempt + 1}: No JSON object could be extracted")

            if not isinstance(data, dict):
                continue

            # Only Phase 2 (articles) has a schema to validate against
            if phase_name != "Phase 2":
                return data

            is_valid, error = self.validate_article_json(data)
            if is_valid:
                return data
            print(f"⚠️  {phase_name} Attempt {attempt + 1}: {error}")

        # Final fallback
        print(f"❌ {phase_name} Failed after {attempts} attempts - using fallback")
        if phase_name == "Phase 2":
            # NOTE(review): callers treat this placeholder as a real article;
            # confirm that publishing it is intended.
            return placeholder
        return None
    
    def get_recent_failed_inspections(self, limit=10):
        """Return a random sample of inspections that list violations.

        Despite the method name, neither recency nor the inspection result is
        considered: up to 1000 objects are fetched from ``self.collection``
        and every one with a non-empty ``violations`` property is a candidate.

        Args:
            limit: Maximum number of inspections to return.

        Returns:
            list: At most ``limit`` inspection objects, randomly chosen when
            more candidates exist. Empty when the fetch or filtering fails.
        """
        try:
            import random

            # Fetch a large page so that enough objects with violations turn up
            response = self.collection.query.fetch_objects(
                limit=1000
            )

            violation_inspections = []
            for obj in response.objects:
                # The property may be missing or stored as null; neither may
                # abort the scan of the remaining objects.
                violations = obj.properties.get('violations')
                if isinstance(violations, str):
                    violations = violations.strip()
                if violations:
                    violation_inspections.append(obj)

            # Randomly select when there are more candidates than requested
            if len(violation_inspections) > limit:
                return random.sample(violation_inspections, limit)
            return violation_inspections

        except Exception as e:
            # Best effort: callers treat an empty list as "nothing to process"
            print(f"Error fetching inspections: {e}")
            return []
    
    def phase1_analyze_metadata(self, inspection):
        """Phase 1: ask the model for structured metadata about an inspection.

        Builds a prompt from the inspection's properties, sends it to the
        Anthropic messages API and decodes the reply with
        ``parse_json_with_retry``.

        Args:
            inspection: Object whose ``properties`` mapping holds the
                inspection fields read below.

        Returns:
            The decoded metadata, or ``None`` when the request fails or no
            usable JSON comes back.
        """
        props = inspection.properties

        # NOTE(review): the "Example:" line near the end of this prompt shows
        # title/content keys, which are not part of the schema requested here.
        prompt = f"""Analyze this Chicago restaurant inspection and extract structured metadata.

INSPECTION DATA:
Facility: {props.get('facility_name', 'Unknown')}
Type: {props.get('facility_type', 'Unknown')}
Address: {props.get('address', 'Unknown')}, Chicago, IL {props.get('zip_code', '')}
Date: {props.get('inspection_date', 'Unknown')}
Risk Level: {props.get('risk_level', 'Unknown')}
Result: {props.get('results', 'Unknown')}
Violations: {props.get('violations', 'None')}

Analyze and return ONLY a JSON object with these fields:
{{
  "cuisine_type": "Detect from name/type (e.g., Mexican, Fast Food, etc.)",
  "neighborhood": "Chicago neighborhood from address",
  "service_style": "Quick Service/Full Service/Takeout/etc.",
  "price_range": "$, $$, or $$$",
  "is_chain": true/false,
  "chain_name": "Parent company if chain, empty if not",
  "violation_categories": ["temperature", "hygiene", "structural", etc.],
  "violation_severity": "Critical/Major/Minor",
  "critical_violation_count": number,
  "risk_score": 0-100,
  "compliance_score": 0-100,
  "key_concerns": ["specific food safety issues"],
  "education_focus": "Main topic to educate about"
}}

CRITICAL: Return ONLY valid JSON. No other text before or after.
Example: {{"title": "text", "content": "text"}}
Do not include markdown, explanations, or formatting outside the JSON."""

        try:
            reply = self.anthropic.messages.create(
                model="claude-3-haiku-20240307",
                max_tokens=500,
                temperature=0.1,  # near-deterministic output for analysis
                messages=[{"role": "user", "content": prompt}],
            )

            # Decode the first content block of the reply
            raw_text = reply.content[0].text.strip()
            metadata = self.parse_json_with_retry(raw_text, "Phase 1")

            if not metadata:
                print("❌ Phase 1 Failed - Could not extract valid JSON")
                return None

            print("✅ Phase 1 Complete - Metadata extracted")
            return metadata

        except Exception as e:
            print(f"❌ Phase 1 Error: {e}")
            return None
    
    def get_image_url(self, metadata):
        """Pick a stock image path from the first violation category.

        The metadata is model-generated, so ``violation_categories`` is
        normalised first: a bare string counts as a single category, and a
        missing, empty or non-list value falls back to the general image.

        Args:
            metadata: Mapping that may contain ``violation_categories``.

        Returns:
            str: Site-relative path of the image to show.
        """
        general_image = '/assets/images/violations/general_1.jpg'

        categories = metadata.get('violation_categories') or []
        if isinstance(categories, str):
            # Indexing a string would yield its first character, not a category
            categories = [categories]
        if not isinstance(categories, (list, tuple)) or not categories:
            return general_image

        # str() keeps a non-string entry (None, number) from raising
        category = str(categories[0]).lower()

        # Checked in order; the first rule with a matching keyword wins
        image_rules = (
            (('temperature',), '/assets/images/violations/temperature_1.jpg'),
            (('hygiene', 'hand'), '/assets/images/violations/handwashing_1.jpg'),
            (('pest', 'rodent'), '/assets/images/violations/rodent_1.jpg'),
            (('structural',), '/assets/images/violations/structural_1.jpg'),
        )
        for keywords, image_path in image_rules:
            if any(keyword in category for keyword in keywords):
                return image_path
        return general_image
    
    def get_relevant_references(self, metadata):
        """Select government references that match the violation categories.

        Each topical reference appears at most once, in the order its topic
        is first seen. The Chicago inspections reference is always the last
        entry, so the size limit can never push it out.

        Args:
            metadata: Mapping that may contain ``violation_categories``.

        Returns:
            list: Up to 4 ``(label, url)`` tuples taken from
            ``self.gov_references``.
        """
        max_refs = 4

        categories = metadata.get('violation_categories') or []
        if isinstance(categories, str):
            # A bare string is one category, not a sequence of characters
            categories = [categories]
        if not isinstance(categories, (list, tuple)):
            categories = []

        refs = []

        def add(ref):
            # Several categories can map to the same topic; list it once
            if ref not in refs:
                refs.append(ref)

        for cat in categories:
            name = str(cat).lower()
            if 'temp' in name:
                add(self.gov_references['temperature']['danger_zone'])
            if 'hygiene' in name or 'hand' in name:
                add(self.gov_references['hygiene']['handwashing'])
            if 'pest' in name or 'rodent' in name:
                add(self.gov_references['pests']['rodents'])

        # Always include the Chicago reference: reserve the final slot for it
        chicago_ref = self.gov_references['chicago']['inspections']
        topical = [ref for ref in refs if ref != chicago_ref][:max_refs - 1]
        return topical + [chicago_ref]
    
    def phase2_generate_article(self, inspection, metadata):
        """Phase 2: ask the model to write the article for an inspection.

        Combines the inspection properties, the Phase 1 metadata and the
        matching government references into one prompt, sends it to the
        Anthropic messages API and decodes the reply with
        ``parse_json_with_retry``.

        Args:
            inspection: Object whose ``properties`` mapping holds the
                inspection fields read below.
            metadata: Phase 1 metadata mapping.

        Returns:
            The decoded article, or ``None`` when the request fails or no
            article could be decoded.
        """
        # Render the selected government references as a list of links
        ref_lines = []
        for label, url in self.get_relevant_references(metadata):
            ref_lines.append(f'- <a href="{url}">{label}</a>')
        ref_html = '\n'.join(ref_lines)

        props = inspection.properties

        prompt = f"""You are a neutral local news reporter writing for CleanKitchens.org about a restaurant health inspection.

INSPECTION DATA:
Facility: {props.get('facility_name', 'Unknown')}
Address: {props.get('address', 'Unknown')}, Chicago, IL
Date: {props.get('inspection_date', 'Unknown')}
Result: {props.get('results', 'Unknown')}
Violations: {props.get('violations', 'None')}

CONTEXT:
Neighborhood: {metadata.get('neighborhood', 'Chicago')}
Cuisine Type: {metadata.get('cuisine_type', 'Restaurant')}

GOVERNMENT RESOURCES:
{ref_html}

Write a comprehensive SEO-optimized news article (1,500-2,000 words / 10,000-12,000 characters):

OPENING (3-4 paragraphs, 2500+ chars): Paint a detailed picture of the location. Include the full address, describe the neighborhood character, mention 2-3 specific nearby landmarks (exact L/CTA stations like "Belmont Red Line", major intersections like "Clark and Diversey", nearby schools or landmarks). Describe the type of establishment, how long it's been there if relevant, what the area is like. Make Chicago residents feel like they know exactly where this is.

VIOLATIONS DETAIL (4-5 paragraphs, 4000+ chars): Thoroughly report EVERY violation using the complete inspection text. Quote extensively from inspector comments. Include all specific details: exact temperatures measured, specific equipment problems, precise locations of issues ("the three-compartment sink in the rear prep area"). Don't summarize - use the full violation descriptions. Explain what inspectors observed, what they instructed, and any citations issued.

EDUCATION & CONTEXT (3-4 paragraphs, 3500+ chars): Provide deep educational value about food safety. For each type of violation, explain the science behind why it matters. Naturally weave in government sources as hyperlinks: "The <a href='[url]'>CDC's food safety guidelines</a> explain that bacteria can double every 20 minutes between 40°F and 140°F..." Include specific statistics, explain potential illnesses, discuss proper procedures. Make this genuinely educational for readers.

TAGS TO GENERATE:
- Neighborhood tag (e.g., "lincoln-park", "loop", "uptown")
- Violation type tags (e.g., "temperature-violation", "pest-control", "handwashing")
- Cuisine tag (e.g., "chinese", "mexican", "fast-food")
- Additional: "chicago", "health-inspection", inspection year

Format response EXACTLY as JSON with no other text:
{{
  "title": "factual headline 60-80 chars",
  "content": "<p>First paragraph...</p><p>Second paragraph...</p>",
  "excerpt": "150 char summary",
  "meta_description": "160 char SEO description",
  "tags": ["specific", "relevant", "tags"],
  "education_summary": "One sentence about key food safety lesson"
}}

CRITICAL: Return ONLY valid JSON. No other text before or after.
Do not include markdown, explanations, or formatting outside the JSON.
Ensure all strings are properly escaped and quoted."""

        try:
            reply = self.anthropic.messages.create(
                model="claude-3-haiku-20240307",
                max_tokens=4000,
                temperature=0.3,
                messages=[{"role": "user", "content": prompt}],
            )

            # Decode the first content block of the reply
            raw_text = reply.content[0].text.strip()
            article = self.parse_json_with_retry(raw_text, "Phase 2")

            if not article:
                print("❌ Phase 2 Failed - Could not extract valid JSON")
                return None

            print("✅ Phase 2 Complete - Article generated")
            return article

        except Exception as e:
            print(f"❌ Phase 2 Error: {e}")
            return None
    
    def save_article_to_weaviate(self, article_data, metadata, inspection):
        """Save the generated article to the Weaviate "Articles" collection.

        Args:
            article_data: Article mapping; ``title``, ``content``,
                ``excerpt`` and ``meta_description`` are required, ``tags``
                is optional.
            metadata: Phase 1 metadata mapping; every key is optional.
            inspection: Object whose ``properties`` mapping holds the
                original inspection fields.

        Returns:
            The value returned by the collection's ``data.insert`` call, or
            ``None`` when anything fails (the error is printed).
        """
        try:
            articles = self.client.collections.get("Articles")
            props = inspection.properties

            # Work out the inspection date; it drives the slug and the
            # published date.
            raw_date = props.get('inspection_date', '')
            inspection_date = None
            if isinstance(raw_date, datetime):
                # In case the property arrives as a datetime rather than
                # text: use it instead of silently substituting "now".
                inspection_date = raw_date
            elif isinstance(raw_date, str) and raw_date:
                # Accepts "12/23/2019" or ISO-like "2019-12-23..."
                try:
                    if '/' in raw_date:
                        inspection_date = datetime.strptime(raw_date, '%m/%d/%Y')
                    else:
                        inspection_date = datetime.strptime(raw_date[:10], '%Y-%m-%d')
                except ValueError:
                    inspection_date = None
            if inspection_date is None:
                inspection_date = datetime.now()  # Fallback to current date

            # Create slug with date folders and title
            title = article_data['title']
            slug_text = title.lower()
            slug_text = re.sub(r'[^a-z0-9\s-]', '', slug_text)
            slug_text = re.sub(r'\s+', '-', slug_text)
            slug_text = slug_text[:80].strip('-')

            # Format: YYYY/MM/DD/article-title
            slug = f"{inspection_date.strftime('%Y/%m/%d')}/{slug_text}"

            # The category list is model-generated: it may be missing, null,
            # empty, or a bare string. Normalise once so that joining and
            # taking the first entry below cannot raise.
            categories = metadata.get('violation_categories') or []
            if isinstance(categories, str):
                categories = [categories]
            categories = [str(category) for category in categories]

            # Combine all data
            article_object = {
                # Article content
                'title': article_data['title'],
                'slug': slug,
                'content': article_data['content'],
                'excerpt': article_data['excerpt'],
                'meta_description': article_data['meta_description'],

                # From original inspection
                'facility_name': props.get('facility_name', ''),
                'establishment_name': props.get('facility_name', ''),  # Duplicate for homepage
                'address': props.get('address', ''),
                'city': 'Chicago',
                'state': 'IL',
                'inspection_date': props.get('inspection_date', ''),
                'inspection_id': props.get('inspection_id', ''),

                # From metadata analysis
                'cuisine_type': metadata.get('cuisine_type', ''),
                'neighborhood': metadata.get('neighborhood', ''),
                'service_style': metadata.get('service_style', ''),
                'price_range': metadata.get('price_range', ''),
                'is_chain': metadata.get('is_chain', False),
                'chain_name': metadata.get('chain_name', ''),
                'violation_category': ', '.join(categories),
                'violation_severity': metadata.get('risk_score', 0),  # Use risk score as severity number
                'critical_violations': metadata.get('key_concerns', []),  # Use key concerns as text array
                'risk_score': metadata.get('risk_score', 0),
                'compliance_score': metadata.get('compliance_score', 0),

                # Processing metadata
                'status': 'published',
                'source_city': 'Chicago',
                'published_date': inspection_date.isoformat(),  # Use inspection date for historical articles
                'created_at': datetime.now().isoformat(),

                # Image fields for homepage
                'image_url': self.get_image_url(metadata),
                # An empty category list used to raise IndexError here and
                # abort the save
                'image_category': categories[0] if categories else 'general',

                # Tags
                'tags': article_data.get('tags', [])
            }

            # Insert into Weaviate
            result = articles.data.insert(article_object)

            print(f"✅ Article saved to Weaviate with ID: {result}")
            return result

        except Exception as e:
            print(f"❌ Error saving to Weaviate: {e}")
            return None
    
    def process_inspection(self, inspection):
        """Run one inspection through analysis, article generation and saving.

        Progress is printed at every stage.

        Args:
            inspection: Object whose ``properties`` mapping holds the
                inspection fields.

        Returns:
            bool: ``True`` only when both phases succeed and the save step
            returns a truthy id; ``False`` otherwise.
        """
        divider = '=' * 60
        props = inspection.properties

        print(f"\n{divider}")
        print(f"Processing: {props.get('facility_name', 'Unknown')}")
        print(f"Date: {props.get('inspection_date', 'Unknown')}")
        print(f"Violations: {props.get('violation_count', 0)}")
        print(f"{divider}")

        # Phase 1: Analyze metadata
        print("\n📊 PHASE 1: Analyzing inspection...")
        metadata = self.phase1_analyze_metadata(inspection)
        if not metadata:
            print("Failed at Phase 1")
            return False

        for label, key in (
            ("Cuisine", 'cuisine_type'),
            ("Neighborhood", 'neighborhood'),
            ("Severity", 'violation_severity'),
            ("Risk Score", 'risk_score'),
        ):
            print(f"{label}: {metadata.get(key)}")

        # Phase 2: Generate article
        print("\n📝 PHASE 2: Generating article...")
        article = self.phase2_generate_article(inspection, metadata)
        if not article:
            print("Failed at Phase 2")
            return False

        content_length = len(article.get('content', ''))
        print(f"Title: {article.get('title')}")
        print(f"Content length: {content_length} chars")

        # Save to Weaviate
        print("\n💾 Saving to Articles collection...")
        article_id = self.save_article_to_weaviate(article, metadata, inspection)
        if not article_id:
            return False

        print("\n✅ SUCCESS! Article created and saved")
        print(f"View at: https://cleankitchens.org/article/{article_id}")
        return True
    
    def run_test(self):
        """Fetch up to five inspections, process each one and print a tally.

        Returns ``None``; all results are reported on stdout.
        """
        print("🔍 Fetching recent inspections with violations from ChicagoTemp...")

        inspections = self.get_recent_failed_inspections(limit=5)
        if not inspections:
            print("❌ No inspections with violations found")
            return

        print(f"Found {len(inspections)} recent inspections with violations")

        # Process sequentially, remembering the outcome of each inspection
        outcomes = [bool(self.process_inspection(item)) for item in inspections]
        successes = outcomes.count(True)
        failures = outcomes.count(False)

        divider = '=' * 60
        success_rate = successes / (successes + failures) * 100
        print(f"\n{divider}")
        print("FINAL RESULTS:")
        print(f"✅ Successful: {successes}")
        print(f"❌ Failed: {failures}")
        print(f"Success rate: {success_rate:.1f}%")
        print(f"{divider}")

if __name__ == "__main__":
    generator = TwoPhaseGenerator()
    try:
        generator.run_test()
    finally:
        # Release the Weaviate connection even when the run raises
        generator.client.close()