#!/usr/bin/env python3
"""
Article Generator V2 for CleanKitchens - Educational Version with Weaviate
Combines best of both generators:
- Educational components and government references from old generator
- Image selection and meta data from new system
- Haiku model for cost savings
- Fixed vectorizer deprecation
"""

import os
import sys
import weaviate
import weaviate.classes.config as wvc
import requests
import json
import time
from datetime import datetime
import re

# Put the production scripts directory first on the import path so that
# project-local modules resolve from there. Presumably this is where
# `image_selector` (imported lazily in `select_image`) lives - TODO confirm.
sys.path.insert(0, '/var/www/twin-digital-media/public_html/_sites/cleankitchens/production/scripts')

class ArticleGeneratorV2:
    def __init__(self):
        """Set up API settings, the Weaviate connection and static lookup tables.

        The Anthropic API key is read from the ``ANTHROPIC_API_KEY``
        environment variable. If the variable is unset a warning is written
        to stderr and the key is left empty, so later API calls will be
        rejected by the service instead of the constructor raising.

        Side effects:
            Opens a connection to a local Weaviate instance on port 8080;
            the caller is responsible for closing ``self.weaviate_client``.
        """
        # SECURITY: credentials must not live in source control.
        # NOTE(review): a key used to be hard-coded on this line; treat that
        # key as compromised and rotate it.
        self.claude_api_key = os.environ.get('ANTHROPIC_API_KEY', '')
        if not self.claude_api_key:
            print(
                "Warning: ANTHROPIC_API_KEY is not set; Claude API calls will fail",
                file=sys.stderr,
            )
        self.claude_model = 'claude-3-haiku-20240307'  # Cheaper model
        self.claude_api_url = 'https://api.anthropic.com/v1/messages'

        # Weaviate client
        self.weaviate_client = weaviate.connect_to_local(host="localhost", port=8080)

        # Educational reference library: category -> topic -> (title, url)
        self.educational_references = {
            'temperature': {
                'danger_zone': ('USDA Danger Zone', 'https://www.fsis.usda.gov/food-safety/safe-food-handling-and-preparation/food-safety-basics/danger-zone-40f-140f'),
                'hot_holding': ('FDA Food Code 3-501.16', 'https://www.fda.gov/food/fda-food-code/food-code-2022'),
                'cold_holding': ('FDA Cold Storage Requirements', 'https://www.fda.gov/food/buy-store-serve-safe-food/safe-food-storage'),
                'cooking_temps': ('USDA Safe Minimum Temperatures', 'https://www.fsis.usda.gov/food-safety/safe-food-handling-and-preparation/food-safety-basics/safe-temperature-chart')
            },
            'hygiene': {
                'handwashing': ('CDC Handwashing Guidelines', 'https://www.cdc.gov/handwashing/when-how-handwashing.html'),
                'gloves': ('FDA Glove Use Guidelines', 'https://www.fda.gov/food/fda-food-code/food-code-2022'),
                'illness': ('FDA Employee Health Policy', 'https://www.fda.gov/food/retail-food-protection/employee-health-and-personal-hygiene-handbook'),
                'contamination': ('FDA Preventing Cross-Contamination', 'https://www.fda.gov/food/buy-store-serve-safe-food/preventing-cross-contamination')
            },
            'pests': {
                'rodents': ('CDC Rodent Disease Information', 'https://www.cdc.gov/rodents/diseases/index.html'),
                'insects': ('FDA Pest Control Requirements', 'https://www.fda.gov/media/164194/download'),
                'prevention': ('EPA IPM Principles', 'https://www.epa.gov/safepestcontrol/integrated-pest-management-ipm-principles'),
                'diseases': ('CDC Disease Transmission', 'https://www.cdc.gov/rodents/diseases/direct.html')
            },
            'storage': {
                'general': ('FDA Safe Food Storage', 'https://www.fda.gov/food/buy-store-serve-safe-food/safe-food-storage'),
                'refrigeration': ('USDA Refrigeration Guidelines', 'https://www.fsis.usda.gov/food-safety/safe-food-handling-and-preparation/food-safety-basics/refrigeration'),
                'labeling': ('FDA Date Labeling', 'https://www.fda.gov/food/food-labeling-nutrition/food-product-dating')
            }
        }

        # Chicago landmarks and transit for local context
        self.chicago_context = {
            'neighborhoods': {
                'ROSELAND': {
                    'description': 'far South Side neighborhood',
                    'transit': 'CTA Red Line at 95th Street',
                    'landmarks': ['Fernwood Park', 'Roseland Community Hospital']
                },
                'LOOP': {
                    'description': 'downtown business district',
                    'transit': 'multiple CTA lines converge',
                    'landmarks': ['Millennium Park', 'Willis Tower']
                },
                'LINCOLN PARK': {
                    'description': 'North Side lakefront neighborhood',
                    'transit': 'Brown and Red Lines',
                    'landmarks': ['Lincoln Park Zoo', 'DePaul University']
                }
            }
        }
    
    def extract_field_from_raw(self, raw_data, field_name):
        """Return the value that follows ``field_name:`` in ``raw_data``.

        The label is matched case-insensitively and literally; the value is
        everything after the colon up to (not including) the next comma,
        with surrounding whitespace removed.

        Args:
            raw_data: Flat text holding ``Label:value`` pairs; may be empty
                or None.
            field_name: The label to look for (treated as plain text).

        Returns:
            The stripped value, or ``''`` when ``raw_data`` is empty or the
            label is not present.
        """
        if not raw_data:
            return ''
        # Escape the label so characters such as '.', '+' or '(' are matched
        # literally instead of being interpreted as regex syntax.
        pattern = re.escape(field_name) + r":([^,]+)"
        match = re.search(pattern, raw_data, re.IGNORECASE)
        return match.group(1).strip() if match else ''
    
    def extract_violations_from_raw(self, raw_data):
        """Pull the text that follows the first ``Violations:`` label.

        The label is matched case-insensitively. The returned text runs from
        the label up to and including the next colon, or to the end of the
        string when no further colon exists; it is whitespace-stripped.
        Returns ``''`` for empty input or when the label is absent.
        """
        # NOTE(review): the capture keeps the terminating ':' and stops at the
        # first colon after the label, so violation text that itself contains
        # colons is cut short. Confirm this matches the raw_data format.
        found = None
        if raw_data:
            found = re.search(r"Violations:([^:]+(?::|$))", raw_data, re.IGNORECASE)
        return found.group(1).strip() if found else ''
    
    def get_relevant_references(self, violations_text):
        """Pick the government references that best fit the violation text.

        Keywords are looked up case-insensitively. Short keywords that occur
        inside unrelated words ('rat' in "temperature", 'raw' in "drawer",
        'hot', 'cold') must appear as whole words; the longer keywords keep
        plain substring matching.

        Args:
            violations_text: Free-text violations; may be empty or None.

        Returns:
            A list of at most three ``(title, url)`` tuples taken from
            ``self.educational_references``, ordered temperature, hygiene,
            pests, contamination.
        """
        text = (violations_text or '').lower()

        def contains_any(*terms):
            # Substring test, used for keywords long enough to be unambiguous.
            return any(term in text for term in terms)

        def has_word(pattern):
            # Whole-word test for keywords that are substrings of common words.
            return re.search(pattern, text) is not None

        mentions_hot = has_word(r'\bhot\b')
        mentions_cold = has_word(r'\bcold\b')
        refs = self.educational_references
        relevant_refs = []

        # Check for temperature violations
        if mentions_hot or mentions_cold or contains_any('temperature', 'holding', 'cooling'):
            relevant_refs.append(refs['temperature']['danger_zone'])
            if mentions_hot:
                relevant_refs.append(refs['temperature']['hot_holding'])
            if mentions_cold:
                relevant_refs.append(refs['temperature']['cold_holding'])

        # Check for hygiene violations
        if contains_any('hand', 'wash', 'glove', 'hygiene', 'employee'):
            relevant_refs.append(refs['hygiene']['handwashing'])
            if 'glove' in text:
                relevant_refs.append(refs['hygiene']['gloves'])

        # Check for pest violations ('rat' only as a word: it is a substring
        # of "temperature", "operation", "refrigeration", ...)
        if has_word(r'\brats?\b') or contains_any('rodent', 'mouse', 'mice', 'roach', 'cockroach', 'pest'):
            relevant_refs.append(refs['pests']['rodents'])
            relevant_refs.append(refs['pests']['diseases'])

        # Check for contamination ('raw' only as a word: see "drawer", "straw")
        if has_word(r'\braw\b') or contains_any('contamination', 'cross', 'cooked'):
            relevant_refs.append(refs['hygiene']['contamination'])

        return relevant_refs[:3]  # Return top 3 most relevant
    
    def build_claude_prompt(self, inspection):
        """Assemble the article-writing prompt for one inspection.

        Args:
            inspection: Dict with a required ``facility_name`` and optional
                ``address``, ``inspection_date``, ``results`` and
                ``violations``. Optional values that are missing, empty or
                None fall back to 'Chicago, IL', 'Recent', 'Failed' and no
                violations section respectively.

        Returns:
            The full prompt text, ending with the expected output format.

        Raises:
            KeyError: If ``facility_name`` is missing.
        """
        # `or ''` also covers an explicit None, which `.get(key, '')` would pass on.
        violations = inspection.get('violations') or ''
        relevant_refs = self.get_relevant_references(violations)

        # Collect the pieces and join once instead of repeated `+=`.
        parts = [
            "You are a food safety educator and journalist creating educational content about restaurant health violations.\n\n",
            "Create an article that TEACHES while it INFORMS about food safety using real inspection data.\n\n",
            "CRITICAL: You must follow the EXACT format specified below.\n\n",
            "Your article should:\n",
            "1. Have a compelling, specific headline that includes the restaurant name\n",
            "2. Report the violations factually and objectively\n",
            "3. Focus on what readers can learn from these violations\n",
            "4. Maintain an educational tone throughout\n",
            "5. Be between 400-600 words\n",
            "6. Include specific details about violations found\n\n",
            "IMPORTANT FORMATTING:\n",
            "- Use <h2> for main sections, <h3> for subsections\n",
            "- Use <p> tags for paragraphs\n",
            "- Include hyperlinks to government sources\n\n",
            "INCLUDE 'The Food Safety Lesson' section with:\n",
            "- 2-3 sentences explaining the food safety principles\n",
            "- Links to government sources (FDA, CDC, USDA)\n",
            "- One practical tip for home food safety\n\n",
            "INCLUDE 'Frequently Asked Questions' section with 2-3 Q&As\n\n",
        ]

        # Add available references
        if relevant_refs:
            parts.append("Use these government references:\n")
            parts.extend(f"- {ref[0]}: {ref[1]}\n" for ref in relevant_refs)
            parts.append("\n")

        # Add the specific violation data. `.get(key) or default` is used so
        # that an empty string (e.g. an address that could not be parsed)
        # still receives the default instead of producing a blank line.
        parts.extend([
            "RESTAURANT DETAILS:\n",
            f"Name: {inspection['facility_name']}\n",
            f"Location: {inspection.get('address') or 'Chicago, IL'}\n",
            f"Inspection Date: {inspection.get('inspection_date') or 'Recent'}\n",
            f"Result: {inspection.get('results') or 'Failed'}\n",
        ])

        if violations:
            parts.append(f"\nVIOLATIONS:\n{violations}\n\n")

        # Output format
        parts.extend([
            "OUTPUT FORMAT:\n",
            "===META_DATA===\n",
            "title: [55-60 char title]\n",
            "description: [150-160 char description]\n",
            "===META_DATA===\n\n",
            "===CONTENT===\n",
            "[Your complete article HTML]\n",
            "===CONTENT===\n",
        ])

        return "".join(parts)
    
    def call_claude_api(self, prompt):
        """Send ``prompt`` to the configured Claude model over HTTP.

        Posts a single user message to ``self.claude_api_url`` with a 60
        second timeout. Returns the decoded JSON body on HTTP 200; on any
        other status, or on any exception, prints a diagnostic and returns
        None.
        """
        request_headers = {
            'x-api-key': self.claude_api_key,
            'anthropic-version': '2023-06-01',
            'content-type': 'application/json'
        }
        user_message = {'role': 'user', 'content': prompt}
        payload = {
            'model': self.claude_model,
            'max_tokens': 2000,
            'messages': [user_message],
            'temperature': 0.7
        }

        try:
            reply = requests.post(
                self.claude_api_url,
                headers=request_headers,
                json=payload,
                timeout=60
            )
            # Guard clause: anything other than 200 is reported and dropped.
            if reply.status_code != 200:
                print(f"Claude API error: {reply.status_code} - {reply.text}")
                return None
            return reply.json()
        except Exception as e:
            print(f"Claude API exception: {str(e)}")
            return None
    
    def extract_sections(self, claude_response):
        """Split a Claude reply into its metadata dict and article HTML.

        Reads the text of the first content item, then looks for a block
        delimited by two ``===META_DATA===`` markers and one delimited by two
        ``===CONTENT===`` markers. Each metadata line containing a colon
        becomes a ``key: value`` entry (split at the first colon, both sides
        stripped).

        Returns:
            ``(meta_data, content)``; a missing block yields ``{}`` or ``""``
            for that part. Any exception is printed and yields ``({}, "")``.
        """
        try:
            body = claude_response['content'][0]['text']

            meta_block = re.search(r'===META_DATA===\s*\n(.*?)\n===META_DATA===', body, re.DOTALL)
            article_block = re.search(r'===CONTENT===\s*\n(.*?)\n===CONTENT===', body, re.DOTALL)

            meta_lines = meta_block.group(1).strip().split('\n') if meta_block else []
            meta_data = {}
            for entry in meta_lines:
                name, separator, value = entry.partition(':')
                if separator:  # only lines that actually contain a colon
                    meta_data[name.strip()] = value.strip()

            content = "" if article_block is None else article_block.group(1).strip()
            return meta_data, content

        except Exception as e:
            print(f"Error extracting sections: {e}")
            return {}, ""
    
    def select_image(self, inspection):
        """Choose an image for the article via the project's ``ImageSelector``.

        The selector is imported lazily and handed the facility name,
        violations and results. If the import, the lookup of
        ``facility_name`` or the selector itself raises, the error is printed
        and a generic default image description is returned instead.
        """
        default_image = {
            'image_url': '/assets/images/violations/general_1.jpg',
            'image_alt': 'Restaurant health inspection violation',
            'category': 'general'
        }
        try:
            from image_selector import ImageSelector
            picker = ImageSelector()

            # Only the fields the selector is given here are forwarded.
            selector_input = {
                'facility_name': inspection['facility_name'],
                'violations': inspection.get('violations', ''),
                'results': inspection.get('results', '')
            }
            return picker.select_image_for_article(selector_input)

        except Exception as e:
            print(f"Error selecting image: {e}")
            return default_image
    
    def generate_schema(self, article_data):
        """Serialize schema.org structured data for an article as JSON.

        Always emits a ``NewsArticle`` built from the article's title,
        meta description, published date, image URL and slug. When the
        lower-cased ``content`` contains the substring ``faq``, an additional
        ``FAQPage`` entry is appended.

        Returns:
            A JSON string encoding the list of schema objects.

        Raises:
            KeyError: If one of the required article fields is missing.
        """
        headline = article_data['title']
        summary = article_data['meta_description']
        published = article_data['published_date']
        image_href = f"https://cleankitchens.com{article_data['image_url']}"
        page_href = f"https://cleankitchens.com/{article_data['slug']}"

        author = {
            "@type": "Organization",
            "name": "CleanKitchens Editorial Team"
        }
        publisher = {
            "@type": "Organization",
            "name": "CleanKitchens",
            "logo": {
                "@type": "ImageObject",
                "url": "https://cleankitchens.com/logo.png"
            }
        }

        # NewsArticle schema (the modified date mirrors the published date)
        schemas = [{
            "@context": "https://schema.org",
            "@type": "NewsArticle",
            "headline": headline,
            "description": summary,
            "datePublished": published,
            "dateModified": published,
            "author": author,
            "publisher": publisher,
            "image": image_href,
            "mainEntityOfPage": {
                "@type": "WebPage",
                "@id": page_href
            }
        }]

        # FAQPage schema if FAQ exists
        # NOTE(review): `mainEntity` is always left empty here, and the 'faq'
        # substring will not match a heading spelled out as "Frequently Asked
        # Questions" - confirm whether that is intended.
        body_text = article_data.get('content', '').lower()
        if 'faq' in body_text:
            schemas.append({
                "@context": "https://schema.org",
                "@type": "FAQPage",
                "mainEntity": []
            })

        return json.dumps(schemas)
    
    def save_article_to_weaviate(self, inspection, meta_data, content, image_data):
        """Store a generated article in the ``Articles`` collection.

        The collection is created (without a vectorizer) when it does not
        exist yet. If an article with the same ``inspection_id`` is already
        stored, nothing is inserted.

        Args:
            inspection: Dict with ``inspection_id`` and ``facility_name``
                (required) plus optional ``violations``, ``results``,
                ``city`` and ``address``.
            meta_data: Dict that may hold ``title`` and ``description``.
            content: Article HTML.
            image_data: Dict with ``image_url`` and ``image_alt``.

        Returns:
            The inserted article properties on success; the fetched
            properties of the existing object (only ``inspection_id`` is
            requested) when the article already exists; None when anything
            fails, after printing the error.
        """
        # Local import: the file only imports `weaviate.classes.config` at
        # module level.
        from weaviate.classes.query import Filter

        try:
            # The schema stores inspection_id as TEXT, so compare and store a string.
            inspection_id = str(inspection['inspection_id'])
            collections = self.weaviate_client.collections

            if collections.exists("Articles"):
                collection = collections.get("Articles")

                # Look up this specific inspection. Fetching one arbitrary
                # object and comparing it would miss almost every duplicate.
                existing = collection.query.fetch_objects(
                    filters=Filter.by_property("inspection_id").equal(inspection_id),
                    limit=1,
                    return_properties=["inspection_id"]
                )
                if existing.objects:
                    print(f"⚠️ Article already exists for inspection {inspection_id}")
                    return existing.objects[0].properties
            else:
                print("Collection doesn't exist, creating...")
                collection = collections.create(
                    name="Articles",
                    description="Restaurant inspection articles",
                    vectorizer_config=None,  # No vectorizer for now
                    properties=[
                        wvc.Property(name="title", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="slug", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="content", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="excerpt", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="meta_title", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="meta_description", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="city", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="state", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="establishment_name", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="published_date", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="image_url", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="image_alt", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="tags", data_type=wvc.DataType.TEXT_ARRAY),
                        wvc.Property(name="inspection_id", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="article_type", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="status", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="schema_json", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="latitude", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="longitude", data_type=wvc.DataType.TEXT),
                        wvc.Property(name="address", data_type=wvc.DataType.TEXT),
                    ]
                )

            title = meta_data.get('title', '')
            facility_name = inspection['facility_name']

            # Generate slug; fall back to facility name + id when the title
            # is missing so the article URL is never empty.
            slug = re.sub(r'[^a-z0-9]+', '-', title.lower()).strip('-')
            if not slug:
                slug = re.sub(r'[^a-z0-9]+', '-', f"{facility_name} {inspection_id}".lower()).strip('-')
            if len(slug) > 100:
                # Cut at the last complete word within the limit.
                slug = slug[:100].rsplit('-', 1)[0]

            # Create excerpt: first 30 words of the tag-stripped text, with an
            # ellipsis only when something was actually cut off.
            words = re.sub(r'<[^>]+>', '', content).split()
            excerpt = ' '.join(words[:30])
            if len(words) > 30:
                excerpt += '...'

            # Determine tags based on violations
            tags = []
            violations_lower = (inspection.get('violations') or '').lower()
            if 'temperature' in violations_lower:
                tags.append('Temperature Violations')
            if 'rodent' in violations_lower or 'pest' in violations_lower:
                tags.append('Pest Control')
            if 'hand' in violations_lower:
                tags.append('Hygiene')
            if inspection.get('results') == 'Fail':
                tags.append('Failed Inspection')
            tags.append('Chicago')

            # Prepare article data
            article_data = {
                'title': title,
                'slug': slug,
                'content': content,
                'excerpt': excerpt,
                'meta_title': title,
                'meta_description': meta_data.get('description', ''),
                'city': inspection.get('city', 'Chicago'),
                'state': 'IL',
                'establishment_name': facility_name,
                'published_date': datetime.now().isoformat(),
                'image_url': image_data['image_url'],
                'image_alt': image_data['image_alt'],
                'tags': tags,
                'inspection_id': inspection_id,
                'article_type': 'inspection_report',
                'status': 'published',
                'latitude': '41.8781',  # Chicago default
                'longitude': '-87.6298',
                'address': inspection.get('address', '')
            }

            # Generate schema
            article_data['schema_json'] = self.generate_schema(article_data)

            # Save to Weaviate
            collection.data.insert(article_data)

            print(f"✅ Article saved successfully: {slug}")
            return article_data

        except Exception as e:
            # Boundary with the database: report and signal failure via None.
            print(f"Error saving to Weaviate: {e}")
            return None
    
    def process_inspection(self, inspection):
        """Turn one inspection record into a stored article.

        Runs the pipeline in order: build the prompt, call Claude, split the
        reply into metadata and content, pick an image, save to Weaviate.

        Returns:
            True when the save step returned something other than None;
            False when any step yields nothing or raises (the reason is
            printed).
        """
        try:
            # Build prompt
            request_text = self.build_claude_prompt(inspection)

            # Call Claude API
            print("Calling Claude API (Haiku)...")
            api_reply = self.call_claude_api(request_text)
            if not api_reply:
                print("Failed to get Claude response")
                return False

            # Extract sections; both parts are needed to continue
            meta_data, content = self.extract_sections(api_reply)
            if not (meta_data and content):
                print("Failed to extract content from response")
                return False

            # Select image, then save to Weaviate
            chosen_image = self.select_image(inspection)
            saved = self.save_article_to_weaviate(inspection, meta_data, content, chosen_image)
            return saved is not None

        except Exception as e:
            print(f"Error processing inspection: {e}")
            return False

def main():
    """Fetch up to 10 raw inspections and generate an article for each.

    Reads objects from the ``RawInspection`` collection, reshapes each one
    into the dict the generator expects (parsing address and violations out
    of ``raw_data``), and processes them one by one with a 2 second pause
    between records. Both Weaviate connections are closed on exit, including
    when an exception interrupts the run.
    """
    client = weaviate.connect_to_local(host="localhost", port=8080)
    generator = None  # created inside the try so it can be cleaned up in finally

    try:
        # Get all inspections
        inspections_collection = client.collections.get("RawInspection")

        all_inspections = inspections_collection.query.fetch_objects(
            return_properties=["inspection_id", "dba_name", "city", "results", "raw_data", "inspection_date"],
            limit=10
        )

        print(f"Found {len(all_inspections.objects)} inspections")

        # Initialize generator
        generator = ArticleGeneratorV2()
        generated_count = 0

        for obj in all_inspections.objects:
            inspection = obj.properties
            inspection_id = inspection.get('inspection_id')
            facility_name = inspection.get('dba_name')
            results = inspection.get('results', '')
            raw_data = inspection.get('raw_data') or ''

            # A record without an id or a name cannot become an article;
            # skip it instead of aborting the whole batch with a KeyError.
            if not inspection_id or not facility_name:
                print("⚠️ Skipping inspection with missing id or name")
                continue

            print(f"\n{'='*60}")
            print(f"Processing: {facility_name}")
            print(f"Inspection ID: {inspection_id}")
            print(f"Result: {results}")

            # Format for generator
            formatted_inspection = {
                'inspection_id': inspection_id,
                'facility_name': facility_name,
                'city': inspection.get('city', 'CHICAGO'),
                'results': results,
                'inspection_date': inspection.get('inspection_date', ''),
                'raw_data': raw_data,
                'address': generator.extract_field_from_raw(raw_data, 'Address'),
                'violations': generator.extract_violations_from_raw(raw_data)
            }

            # Process
            if generator.process_inspection(formatted_inspection):
                generated_count += 1
                print("✅ Article generated successfully")
            else:
                print("❌ Failed to generate article")

            # Rate limiting
            time.sleep(2)

        print(f"\n{'='*60}")
        print("✅ Generation complete!")
        print(f"   Generated: {generated_count} articles")

    finally:
        # Clean up both connections even if processing failed part-way.
        try:
            if generator is not None:
                generator.weaviate_client.close()
        finally:
            client.close()

# Run the batch only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()