#!/usr/bin/env python3
"""
Test Article Generator for CleanKitchens - Weaviate Version
Processes 2-3 inspection records through complete pipeline:
1. Get inspection data from Weaviate
2. Send to Claude for article generation
3. Select appropriate image
4. Save complete article back to Weaviate
5. Display on live site
"""

import os
import sys
import weaviate
import requests
import json
import time
from datetime import datetime

# Put the production scripts directory first on the import path so that
# project-local modules can be imported by name.
# NOTE(review): presumably this is where `image_selector` (imported lazily in
# TestArticleGenerator.select_image) lives — confirm.
sys.path.insert(0, '/var/www/twin-digital-media/public_html/_sites/cleankitchens/production/scripts')

class TestArticleGenerator:
    def __init__(self):
        # API configuration from environment
        self.claude_api_key = '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'
        self.claude_model = 'claude-3-5-sonnet-20241022'
        self.claude_api_url = 'https://api.anthropic.com/v1/messages'
        
        # Weaviate client
        self.weaviate_client = weaviate.connect_to_local(host="localhost", port=8080)
        
        # Educational reference library (from your original)
        self.educational_references = {
            'temperature': {
                'danger_zone': ('USDA Danger Zone', 'https://www.fsis.usda.gov/food-safety/safe-food-handling-and-preparation/food-safety-basics/danger-zone-40f-140f'),
                'hot_holding': ('FDA Food Code 3-501.16', 'https://www.fda.gov/food/fda-food-code/food-code-2022'),
                'cold_holding': ('FDA Cold Storage Requirements', 'https://www.fda.gov/food/buy-store-serve-safe-food/safe-food-storage'),
            },
            'hygiene': {
                'handwashing': ('CDC Handwashing Guidelines', 'https://www.cdc.gov/handwashing/when-how-handwashing.html'),
                'contamination': ('FDA Preventing Cross-Contamination', 'https://www.fda.gov/food/buy-store-serve-safe-food/preventing-cross-contamination')
            },
            'pests': {
                'rodents': ('CDC Rodent Disease Information', 'https://www.cdc.gov/rodents/diseases/index.html'),
                'diseases': ('CDC Disease Transmission', 'https://www.cdc.gov/rodents/diseases/direct.html')
            }
        }
        
        # Chicago landmarks and transit for local context
        self.chicago_context = {
            'neighborhoods': {
                'ROSELAND': {
                    'description': 'far South Side neighborhood',
                    'transit': 'CTA Red Line at 95th Street and Electric District line at 103rd Street/Roseland',
                    'landmarks': ['Fernwood Park', 'Roseland Community Hospital', 'Michigan Avenue corridor']
                },
                'DOWNTOWN': {
                    'description': 'Loop business district',
                    'transit': 'multiple CTA lines converge in the Loop',
                    'landmarks': ['Millennium Park', 'City Hall', 'Chicago River']
                },
                'LINCOLN PARK': {
                    'description': 'North Side lakefront neighborhood',
                    'transit': 'Brown and Red Lines',
                    'landmarks': ['Lincoln Park Zoo', 'DePaul University', 'North Avenue Beach']
                }
            }
        }
    
    def get_inspection_data(self, limit=2):
        """Get inspection records from Weaviate"""
        try:
            collection = self.weaviate_client.collections.get("RawInspection")
            
            # Get sample records
            response = collection.query.fetch_objects(
                limit=limit,
                return_properties=[
                    "inspection_id", "dba_name", "city", "results", 
                    "raw_data", "inspection_date", "source_api", "status"
                ]
            )
            
            inspections = []
            for obj in response.objects:
                # Parse the raw_data to extract key information
                raw_data = obj.properties.get('raw_data', '')
                
                # Extract address, violations, etc. from raw_data
                address = self.extract_field_from_raw(raw_data, 'Address')
                violations = self.extract_violations_from_raw(raw_data)
                facility_type = self.extract_field_from_raw(raw_data, 'Facility Type')
                zip_code = self.extract_field_from_raw(raw_data, 'Zip')
                
                inspection = {
                    'inspection_id': obj.properties.get('inspection_id'),
                    'facility_name': obj.properties.get('dba_name'),
                    'city': obj.properties.get('city'),
                    'results': obj.properties.get('results'),
                    'inspection_date': obj.properties.get('inspection_date'),
                    'address': address,
                    'violations': violations,
                    'facility_type': facility_type,
                    'zip_code': zip_code,
                    'raw_data': raw_data,
                    'uuid': str(obj.uuid)
                }
                inspections.append(inspection)
            
            return inspections
        
        except Exception as e:
            print(f"Error getting inspection data: {e}")
            return []
    
    def extract_field_from_raw(self, raw_data, field_name):
        """Return the value that follows ``"<field_name>:"`` in ``raw_data``.

        The value runs from the colon up to (not including) the next comma and
        is returned with surrounding whitespace removed.

        Args:
            raw_data: Text of the form ``"Field A:value, Field B:value"``.
                Anything that is not a string yields ``""``.
            field_name: Literal field label to look for. It is escaped before
                being placed in the pattern, so characters such as ``.`` or
                ``(`` are matched literally instead of as regex syntax.

        Returns:
            The stripped value, or ``""`` when the field is absent.
        """
        import re  # local import: `re` is not imported at file level

        if not isinstance(raw_data, str):
            return ""

        found = re.search(re.escape(str(field_name)) + r":([^,]+)", raw_data)
        return found.group(1).strip() if found else ""
    
    def extract_violations_from_raw(self, raw_data):
        """Return everything after the first ``"Violations:"`` marker in ``raw_data``.

        Args:
            raw_data: Raw inspection text. Anything that is not a string
                yields ``""``.

        Returns:
            The text following the first marker with surrounding whitespace
            removed, or ``""`` when the marker is absent.
        """
        # An explicit type check replaces the former catch-all handler, so
        # genuine programming errors are no longer silently swallowed.
        if not isinstance(raw_data, str):
            return ""

        _, marker, remainder = raw_data.partition("Violations:")
        return remainder.strip() if marker else ""
    
    def parse_inspection_result(self, raw_data, city):
        """Extract the inspection result and describe the city's rating system.

        Args:
            raw_data: Raw inspection text; the result is read from its
                ``"Results:"`` field via ``extract_field_from_raw``.
            city: City name. Matched case-insensitively and ignoring
                surrounding whitespace against the known systems (CHICAGO,
                NYC, BOSTON); anything else falls back to Chicago.

        Returns:
            A dict with the keys ``result``, ``system_type``,
            ``system_description`` and ``result_meaning``. The meaning is the
            city's "fail" text when the result contains "fail" (any case) and
            the "pass" text otherwise, including when no result was found.
        """
        # Extract the actual result
        result = self.extract_field_from_raw(raw_data, 'Results')

        # Get city-specific rating system info
        city_systems = {
            'CHICAGO': {
                'type': 'Pass/Fail',
                'description': 'Chicago uses a Pass/Fail system where restaurants either pass inspection or fail due to critical violations',
                'fail_meaning': 'indicates critical food safety violations that pose immediate health risks',
                'pass_meaning': 'indicates the restaurant met basic food safety standards during inspection'
            },
            'NYC': {
                'type': 'Letter Grade',
                'description': 'NYC uses A/B/C letter grades based on violation point totals',
                'fail_meaning': 'typically corresponds to a C grade with significant violations',
                'pass_meaning': 'typically corresponds to an A or B grade with few violations'
            },
            'BOSTON': {
                'type': 'Pass/Fail',
                'description': 'Boston uses a Pass/Fail system similar to Chicago',
                'fail_meaning': 'indicates critical violations requiring immediate attention',
                'pass_meaning': 'indicates compliance with health code requirements'
            }
        }

        # Normalise the lookup key so "Chicago", " nyc " etc. resolve to the
        # upper-case table keys instead of silently using the default.
        city_key = city.strip().upper() if isinstance(city, str) else city
        city_info = city_systems.get(city_key, city_systems['CHICAGO'])  # Default to Chicago

        failed = 'fail' in result.lower()
        return {
            'result': result,
            'system_type': city_info['type'],
            'system_description': city_info['description'],
            'result_meaning': city_info['fail_meaning'] if failed else city_info['pass_meaning']
        }
    
    def get_neighborhood_context(self, address, zip_code):
        """Return local-context details for the neighborhood an address appears to be in.

        The mapping is a simple heuristic, checked in this order:

        1. "103" in the address and "60628" in the zip code -> ROSELAND
        2. "clark" or "loop" in the address (any case) -> DOWNTOWN
        3. "lincoln" in the address (any case) -> LINCOLN PARK
        4. otherwise a generic Chicago context

        Args:
            address: Street address; ``None`` is treated as an empty string.
            zip_code: Zip code; ``None`` is treated as an empty string.

        Returns:
            A dict with ``description``, ``transit`` and ``landmarks`` keys.
        """
        address = str(address or '')
        zip_code = str(zip_code or '')
        address_lower = address.lower()
        neighborhoods = self.chicago_context['neighborhoods']

        if '103' in address and '60628' in zip_code:
            return neighborhoods['ROSELAND']
        if 'clark' in address_lower or 'loop' in address_lower:
            return neighborhoods['DOWNTOWN']
        if 'lincoln' in address_lower:
            # The table key contains a space; the former 'LINCOLN_PARK'
            # lookup raised KeyError for every matching address.
            return neighborhoods['LINCOLN PARK']

        # Default context
        return {
            'description': 'Chicago neighborhood',
            'transit': 'CTA bus and rail lines',
            'landmarks': ['local parks and community centers']
        }
    
    def get_relevant_references(self, violations_text):
        """Pick the educational references that match keywords in the violations text.

        Categories are checked in a fixed order (temperature, hand hygiene,
        pests, cross-contamination) and each contributes at most one
        ``(link text, URL)`` tuple from ``self.educational_references``.

        Args:
            violations_text: Free-text violations; ``None`` is treated as
                empty. Matching is case-insensitive.

        Returns:
            Up to three reference tuples, in category order.
        """
        import re  # local import: `re` is not imported at file level

        text = (violations_text or '').lower()
        library = self.educational_references

        def mentions(*terms):
            return any(term in text for term in terms)

        relevant_refs = []

        # Check for temperature violations
        if mentions('temperature', 'cold', 'hot', 'holding'):
            relevant_refs.append(library['temperature']['danger_zone'])

        # Check for hygiene violations
        if mentions('hand', 'wash', 'glove', 'hygiene'):
            relevant_refs.append(library['hygiene']['handwashing'])

        # Check for pest violations. "rat" is matched as a whole word because
        # as a plain substring it also hits "temperature", "operation", etc.,
        # which attached the rodent reference to unrelated violations.
        if mentions('rodent', 'mouse', 'mice', 'roach', 'pest') or re.search(r'\brats?\b', text):
            relevant_refs.append(library['pests']['rodents'])

        # Check for contamination
        if mentions('contamination', 'cross'):
            relevant_refs.append(library['hygiene']['contamination'])

        return relevant_refs[:3]  # At most three, in category order
    
    def build_claude_prompt(self, inspection):
        """Build Claude prompt with your educational format"""
        # Get neighborhood context
        neighborhood = self.get_neighborhood_context(inspection['address'], inspection['zip_code'])
        
        # Format inspection date
        try:
            if isinstance(inspection['inspection_date'], str):
                date_obj = datetime.strptime(inspection['inspection_date'], '%m/%d/%Y')
            else:
                date_obj = inspection['inspection_date']
            inspection_date = date_obj.strftime('%B %d, %Y')
        except:
            inspection_date = inspection['inspection_date']
        
        # Get relevant educational references
        relevant_refs = self.get_relevant_references(inspection.get('violations', ''))
        
        # Parse inspection result and get city-specific rating information
        rating_info = self.parse_inspection_result(inspection['raw_data'], inspection['city'])
        
        # Build prompt (using your exact format)
        prompt = "You are a food safety educator and journalist creating educational content about restaurant health violations.\n\n"
        prompt += "Create an article that TEACHES while it INFORMS about food safety using real inspection data.\n\n"
        prompt += "CRITICAL: You must follow the EXACT format specified below. Each section must be wrapped with the specified markers.\n\n"
        
        prompt += "Your article should:\n"
        prompt += "1. Have a compelling, specific headline that includes the restaurant name\n"
        prompt += "2. Report the violations factually and objectively with neutral news tone\n"
        prompt += "3. Focus on what readers can learn from these violations\n"
        prompt += "4. Be between 600-900 words total\n"
        prompt += "5. Include local context (neighborhood, transit, landmarks)\n"
        prompt += "6. Use neutral news reporter tone - NO hyperbole, NO fictional characters\n"
        prompt += "7. Include specific details about violations found\n\n"
        
        prompt += "IMPORTANT FORMATTING:\n"
        prompt += "- Start content with <h2> (NOT <h1>) as this goes into a template\n"
        prompt += "- Use proper heading hierarchy: <h2> for main sections, <h3> for subsections\n"
        prompt += "- Use <p> tags for paragraphs\n"
        prompt += "- This is article CONTENT ONLY, not a complete web page\n"
        prompt += "- Do NOT include <html>, <head>, <body> or any page structure tags\n\n"
        
        prompt += "REQUIRED: Add 'The Food Safety Lesson' section with:\n"
        prompt += "- 2-3 sentences explaining the food safety principles involved\n"
        prompt += "- MUST include inline hyperlinks to government sources using HTML: <a href=\"URL\">text</a>\n"
        prompt += "- Every educational statement should link to its source (FDA, CDC, USDA)\n"
        prompt += "- One practical tip for home food safety\n"
        prompt += "- Format section with <h3>The Food Safety Lesson</h3>\n\n"
        
        prompt += "REQUIRED: Add 'Frequently Asked Questions' section with EXACTLY these 4 questions in this exact order:\n"
        prompt += "QUESTION 1: 'When was this restaurant last inspected?'\n"
        prompt += "QUESTION 2: 'What was the inspection result?'\n"
        prompt += "QUESTION 3: 'What does this rating mean?'\n"
        prompt += "QUESTION 4: 'What violations were found?'\n"
        prompt += "- Use the inspection data provided to answer each question factually\n"
        prompt += "- For Question 3, use the rating system information provided\n"
        prompt += "- Include hyperlinks to government sources in answers\n"
        prompt += "- Format as: <h3>Frequently Asked Questions</h3>\n"
        prompt += "<p><strong>Q: When was this restaurant last inspected?</strong><br>A: [Answer]</p>\n"
        prompt += "<p><strong>Q: What was the inspection result?</strong><br>A: [Answer]</p>\n"
        prompt += "<p><strong>Q: What does this rating mean?</strong><br>A: [Answer]</p>\n"
        prompt += "<p><strong>Q: What violations were found?</strong><br>A: [Answer]</p>\n\n"
        
        # Add available references
        if relevant_refs:
            prompt += "Use these government references where relevant:\n"
            for ref in relevant_refs:
                prompt += f"- {ref[0]}: {ref[1]}\n"
            prompt += "\n"
        
        # Add local context
        prompt += f"LOCAL CONTEXT for {inspection['city']}:\n"
        prompt += f"- Neighborhood: {neighborhood['description']}\n"
        prompt += f"- Transit: {neighborhood['transit']}\n"
        prompt += f"- Landmarks: {', '.join(neighborhood['landmarks'])}\n\n"
        
        # Add the specific violation data
        prompt += "RESTAURANT DETAILS:\n"
        prompt += f"Restaurant Name: {inspection['facility_name']}\n"
        prompt += f"Address: {inspection['address']}, Chicago, IL\n"
        prompt += f"Inspection Date: {inspection_date}\n"
        prompt += f"Inspection Result: {inspection['results']}\n"
        
        if inspection.get('facility_type'):
            prompt += f"Facility Type: {inspection['facility_type']}\n"
        
        # Add rating system information
        prompt += f"\nRATING SYSTEM INFORMATION:\n"
        prompt += f"System Type: {rating_info['system_type']}\n"
        prompt += f"System Description: {rating_info['system_description']}\n"
        prompt += f"Result Meaning: {rating_info['result_meaning']}\n"
        
        prompt += f"\nVIOLATIONS FOUND:\n{inspection['violations']}\n\n"
        
        # Output format specifications
        prompt += "OUTPUT FORMAT - You must structure your response EXACTLY as follows:\n\n"
        
        prompt += "===META_DATA===\n"
        prompt += "title: [55-60 char title - restaurant name + violation type]\n"
        prompt += "description: [150-160 char description mentioning location and violation]\n"
        prompt += "===META_DATA===\n\n"
        
        prompt += "===CONTENT===\n"
        prompt += "[Your complete article HTML with Food Safety Lesson and FAQ sections]\n"
        prompt += "[Use HTML tags: <h2>, <h3>, <p>, <strong>, <a href=\"URL\">text</a>]\n"
        prompt += "===CONTENT===\n"
        
        return prompt
    
    def call_claude_api(self, prompt):
        """Send ``prompt`` to the Claude messages endpoint as a single user message.

        Returns:
            The decoded JSON body when the endpoint answers with HTTP 200;
            ``None`` for any other status code or when the request raises
            (the problem is printed in both cases).
        """
        request_headers = {
            'x-api-key': self.claude_api_key,
            'anthropic-version': '2023-06-01',
            'content-type': 'application/json'
        }
        user_message = {'role': 'user', 'content': prompt}
        payload = {
            'model': self.claude_model,
            'max_tokens': 4000,
            'messages': [user_message],
            'temperature': 0.7
        }

        try:
            reply = requests.post(
                self.claude_api_url,
                headers=request_headers,
                json=payload,
                timeout=60
            )
            if reply.status_code != 200:
                print(f"Claude API error: {reply.status_code} - {reply.text}")
                return None
            return reply.json()
        except Exception as e:
            print(f"Claude API exception: {str(e)}")
            return None
    
    def extract_sections(self, claude_response):
        """Split a Claude reply into its META_DATA mapping and CONTENT body.

        The reply text is read from ``claude_response['content'][0]['text']``.
        Each section is the text between a pair of identical ``===NAME===``
        marker lines.

        Returns:
            ``(meta_data, content)``: ``meta_data`` maps each ``key: value``
            line of the META_DATA section (split on the first colon) to its
            stripped value; ``content`` is the stripped CONTENT section, or
            ``""`` when it is missing. On any error the problem is printed
            and ``({}, "")`` is returned.
        """
        import re

        try:
            reply_text = claude_response['content'][0]['text']

            meta_section = re.search(r'===META_DATA===\s*\n(.*?)\n===META_DATA===', reply_text, re.DOTALL)
            body_section = re.search(r'===CONTENT===\s*\n(.*?)\n===CONTENT===', reply_text, re.DOTALL)

            parsed_meta = {}
            if meta_section is not None:
                for row in meta_section.group(1).strip().split('\n'):
                    # Only the first colon separates key from value, so
                    # values may themselves contain colons.
                    name, colon, remainder = row.partition(':')
                    if colon:
                        parsed_meta[name.strip()] = remainder.strip()

            article_html = "" if body_section is None else body_section.group(1).strip()
            return parsed_meta, article_html

        except Exception as e:
            print(f"Error extracting sections: {e}")
            return {}, ""
    
    def select_image(self, inspection):
        """Ask the project's ``ImageSelector`` for an image matching this inspection.

        The selector receives the facility name, the violations text, a
        closure flag (true when the lower-cased result is exactly "fail" or
        "closed") and the article type "violation".

        Returns:
            Whatever ``ImageSelector.select_image_for_article`` returns, or a
            generic fallback image dict when importing or calling the
            selector fails (the error is printed).
        """
        try:
            from image_selector import ImageSelector
            picker = ImageSelector()

            # Prepare data for image selector
            selector_input = {
                'facility_name': inspection['facility_name'],
                'violations': inspection['violations'],
                'is_closure': inspection['results'].lower() in ('fail', 'closed'),
                'article_type': 'violation'
            }
            chosen = picker.select_image_for_article(selector_input)
        except Exception as e:
            print(f"Error selecting image: {e}")
            chosen = {
                'image_url': '/assets/images/violations/general_1.jpg',
                'image_alt': 'Restaurant health inspection violation',
                'category': 'general'
            }
        return chosen
    
    def save_article_to_weaviate(self, inspection, meta_data, content, image_data):
        """Store the generated article in the "Articles" collection.

        The collection is created on first use. When it already exists, an
        article with the same ``inspection_id`` is looked up first so the
        same inspection is not stored twice.

        Args:
            inspection: Inspection dict; ``inspection_id``, ``city`` and
                ``facility_name`` are read from it.
            meta_data: Dict from ``extract_sections``; ``title`` and
                ``description`` are used when present.
            content: Article HTML.
            image_data: Dict with ``image_url`` and ``image_alt``.

        Returns:
            The properties of the existing article when one is found,
            otherwise the value returned by ``collection.data.insert``.
            ``None`` when anything fails (the error is printed).
        """
        import re

        try:
            import weaviate.classes as wvc

            collections = self.weaviate_client.collections

            # Ask the server whether the collection exists instead of
            # inferring it from a failed query: any unrelated query error
            # previously triggered a doomed attempt to re-create it.
            if collections.exists("Articles"):
                collection = collections.get("Articles")

                # Check if article already exists for this inspection_id.
                # The v4 client takes a Filter object via `filters=`; the v3
                # style `where={...}` dict is not accepted by fetch_objects.
                existing = collection.query.fetch_objects(
                    filters=wvc.query.Filter.by_property("inspection_id").equal(
                        inspection['inspection_id']
                    ),
                    limit=1
                )

                if existing.objects:
                    print(f"⚠️ Article already exists for inspection {inspection['inspection_id']}")
                    return existing.objects[0].properties
            else:
                # Create Articles collection
                collection = collections.create(
                    "Articles",
                    vectorizer_config=wvc.config.Configure.Vectorizer.text2vec_transformers(),
                    properties=[
                        wvc.config.Property(name="title", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="slug", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="content", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="excerpt", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="meta_description", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="city", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="state", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="establishment_name", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="published_date", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="image_url", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="image_alt", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="tags", data_type=wvc.config.DataType.TEXT_ARRAY),
                        wvc.config.Property(name="inspection_id", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="article_type", data_type=wvc.config.DataType.TEXT),
                        wvc.config.Property(name="status", data_type=wvc.config.DataType.TEXT)
                    ]
                )

            # Generate slug: lower-case title with every run of characters
            # outside a-z/0-9 collapsed to a single dash, capped at roughly
            # 100 characters by cutting at the last dash.
            slug = re.sub(r'[^a-z0-9]+', '-', meta_data.get('title', '').lower()).strip('-')
            if len(slug) > 100:
                slug = slug[:100].rsplit('-', 1)[0]

            # Create excerpt from content: first 30 words with tags removed
            text_only = re.sub(r'<[^>]+>', '', content)
            excerpt = ' '.join(text_only.split()[:30]) + '...'

            # Prepare article data
            article_data = {
                'title': meta_data.get('title', ''),
                'slug': slug,
                'content': content,
                'excerpt': excerpt,
                'meta_description': meta_data.get('description', ''),
                'city': inspection['city'],
                'state': 'IL',
                'establishment_name': inspection['facility_name'],
                'published_date': datetime.now().isoformat(),
                'image_url': image_data['image_url'],
                'image_alt': image_data['image_alt'],
                'tags': ['Chicago', 'Health Inspection', 'Food Safety'],
                'inspection_id': inspection['inspection_id'],
                'article_type': 'violation',
                'status': 'published'
            }

            # Insert into Weaviate
            result = collection.data.insert(article_data)

            print(f"✓ Saved article to Weaviate: {result}")
            return result

        except Exception as e:
            print(f"Error saving to Weaviate: {e}")
            return None
    
    def process_test_articles(self):
        """Run the whole pipeline for a small batch of inspections.

        For each of up to two inspection records: build the prompt, call
        Claude, parse the reply, pick an image and save the article. Progress
        is printed at every step. A failure in one record is reported and
        the loop moves on to the next record. After each fully processed
        record except the last, the method pauses three seconds.
        """
        print("=== Test Article Generator ===")
        print("Processing 2-3 inspection records through complete pipeline\n")

        # Get inspection data
        records = self.get_inspection_data(limit=2)
        if not records:
            print("No inspection data found!")
            return

        total = len(records)
        print(f"Found {total} inspection records to process\n")

        for position, record in enumerate(records, start=1):
            print(f"=== Processing Article {position}/{total} ===")
            print(f"Restaurant: {record['facility_name']}")
            print(f"Result: {record['results']}")
            print(f"Address: {record['address']}")

            try:
                # Build Claude prompt
                print("Building Claude prompt...")
                article_prompt = self.build_claude_prompt(record)

                # Call Claude API
                print("Calling Claude API...")
                reply = self.call_claude_api(article_prompt)
                if not reply:
                    print("Failed to get Claude response")
                    continue

                # Extract sections
                print("Extracting article sections...")
                meta_data, article_html = self.extract_sections(reply)
                if not article_html:
                    print("Failed to extract content")
                    continue

                print(f"✓ Article generated: {meta_data.get('title', 'Unknown')}")
                print(f"  Content length: {len(article_html)} characters")

                # Select image
                print("Selecting appropriate image...")
                chosen_image = self.select_image(record)
                print(f"✓ Selected image: {chosen_image['image_url']}")

                # Save to Weaviate
                print("Saving to Weaviate...")
                saved = self.save_article_to_weaviate(record, meta_data, article_html, chosen_image)

                if not saved:
                    print("Failed to save article")
                else:
                    print(f"✓ Article saved successfully!")
                    print(f"  Title: {meta_data.get('title')}")
                    print(f"  URL will be: /{meta_data.get('title', '').lower().replace(' ', '-')}")

                print()

                # Rate limiting: pause between records, not after the last one
                if position < total:
                    print("Waiting 3 seconds before next article...\n")
                    time.sleep(3)

            except Exception as e:
                print(f"Error processing article: {e}")

        print("=== Test Complete ===")
        print("Articles should now appear on your live site!")

def main():
    """Run the test pipeline once, then close the Weaviate connection.

    The client is closed even when processing raises.
    """
    article_generator = TestArticleGenerator()

    try:
        article_generator.process_test_articles()
    finally:
        # Release the connection regardless of how processing ended.
        article_generator.weaviate_client.close()


# Only run the pipeline when executed as a script, not on import.
if __name__ == "__main__":
    main()