#!/usr/bin/env python3
"""

Comprehensive CleanKitchens Content Processor
Handles: Bulk upload, Daily processing, Pattern story generation
Uses Claude Haiku for cost-effective content generation (~$0.001/article)
"""

import os
import sys
import json
import time
import hashlib
import requests
from datetime import datetime, timedelta
from pathlib import Path
import weaviate
from weaviate.classes.query import Filter
from anthropic import Anthropic
import pandas as pd

# Load environment variables
from dotenv import load_dotenv
load_dotenv('/home/chris/.env')

# Import violation lookup system
from violation_codes_lookup import violation_lookup

class CleanKitchensProcessor:
    def _safe_str(self, value, default=''):
        """Safely convert value to string, handling NaN, None, and floats"""
        if value is None:
            return default
        if isinstance(value, float):
            if pd.isna(value):
                return default
            return str(int(value)) if value == int(value) else str(value)
        return str(value).strip() if str(value).strip() else default
    
    def __init__(self):
        self.weaviate_client = weaviate.connect_to_local()
        self.anthropic_client = Anthropic(api_key=os.getenv('ANTHROPIC_API_KEY'))
        
        # Cost tracking
        self.articles_generated = 0
        self.total_cost = 0.0
        self.cost_per_article = 0.001  # Haiku cost estimate with caching
        
        # Collections
        self.articles = self.weaviate_client.collections.get("Articles")
        self.raw_inspections = self.weaviate_client.collections.get("RawInspections")
        self.tag_pages = self.weaviate_client.collections.get("TagPages")
        self.pattern_stories = self.weaviate_client.collections.get("PatternStories")
        
        # Cached system prompt for cost savings
        self.cached_system_prompt = self._build_cached_system_prompt()
    
    def __del__(self):
        if hasattr(self, 'weaviate_client'):
            self.weaviate_client.close()
    
    def _build_cached_system_prompt(self):
        """Build system prompt that can be cached to save costs"""
        return """You are a professional food safety journalist writing for CleanKitchens.org. Write factual, neutral news articles about restaurant health inspections.

CRITICAL SAFETY RULES:
- Use ONLY government data provided
- NO speculation, fake quotes, or fabricated information  
- Use neutral, factual language: "received citations for", "violations found", "failed inspection"
- NEVER say "was closed" or "shut down" - use "cited for violations" or "did not meet standards"
- Include educational angle with government resources
- Hyperlink government sources when referencing data
- NO libel risk - stick to official inspection results only

REQUIRED ARTICLE STRUCTURE:
1. Headline (60-80 characters, factual, include restaurant name and violation type)
   - END titles with inspection date in MM/DD/YY format (e.g., "Restaurant Name Cited for Violations 08/15/25")
2. Lead paragraph (what happened, when, where - just the facts)
3. Violation details (specific citations from inspection) 
4. Educational context (what these violations mean for food safety)
5. Government resources (link to health department, FDA guidelines)
6. Local context (area description, nearby landmarks if relevant)
7. Historical context (if pattern of violations exists)

GOVERNMENT SOURCES TO REFERENCE:
- Local health department: [include link to original inspection data]
- FDA Food Code: https://www.fda.gov/food/fda-food-code/food-code-2022
- CDC Food Safety: https://www.cdc.gov/foodsafety/
- State health department guidelines

FORMAT RESPONSE AS VALID JSON:
{
    "title": "headline here",
    "content": "full article content with <a> tags for links",
    "excerpt": "brief summary (150 chars)",
    "meta_description": "SEO description (160 chars)",
    "image_category": "violation type (e.g., 'temperature', 'cleanliness', 'equipment')"
}

Always maintain journalistic integrity while educating readers about food safety."""
    
    # =============================================================================
    # SCENARIO 1: BULK UPLOAD PROCESSOR
    # =============================================================================
    
    def process_bulk_upload(self, csv_file_path, batch_size=100):
        """
        Process large CSV file of historical inspections
        Groups by date: individual stories for failures, group stories for passes
        """
        print(f"🔄 Starting bulk upload processing from {csv_file_path}")
        print(f"Batch size: {batch_size}")
        
        # Read entire CSV to group by dates
        print("📖 Reading CSV file...")
        df = pd.read_csv(csv_file_path)
        
        # Convert to inspection data and group by date
        all_inspections = []
        for _, row in df.iterrows():
            inspection_data = self._row_to_inspection_data(row)
            if not self._is_duplicate(inspection_data):
                all_inspections.append(inspection_data)
        
        print(f"📊 Found {len(all_inspections)} unique inspections")
        
        # Group by inspection date and combine duplicates
        grouped_by_date = self._group_inspections_by_date(all_inspections)
        
        total_processed = 0
        
        for inspection_date, date_inspections in grouped_by_date.items():
            print(f"\n📅 Processing inspections for {inspection_date}")
            
            # Separate passes and failures
            passes = []
            failures = []
            
            for inspection in date_inspections:
                if self._is_passing_inspection(inspection):
                    passes.append(inspection)
                else:
                    failures.append(inspection)
            
            # Process individual failure stories
            for failure in failures:
                try:
                    print(f"❌ Processing failure: {failure.get('establishment_name', 'Unknown')}")
                    
                    # Save raw data
                    raw_id = self._save_raw_inspection(failure)
                    
                    # Enrich with local metadata
                    enriched_data = self._enrich_with_local_data(failure)
                    
                    # Generate individual failure article
                    article_data = self._generate_article_haiku(enriched_data)
                    
                    # Save article
                    article_id = self._save_article(article_data)
                    self._update_raw_inspection(raw_id, article_id)
                    
                    total_processed += 1
                    
                except Exception as e:
                    print(f"❌ Error processing failure: {e}")
                    continue
            
            # Process group pass stories (if 5+ passes on same date)
            if len(passes) >= 5:
                try:
                    print(f"✅ Processing {len(passes)} passes as group story")
                    
                    # Save all raw pass data
                    raw_ids = []
                    for pass_inspection in passes:
                        raw_id = self._save_raw_inspection(pass_inspection)
                        raw_ids.append(raw_id)
                    
                    # Generate group pass article
                    group_article = self._generate_group_pass_article_haiku(passes, inspection_date)
                    
                    # Save group article
                    article_id = self._save_article(group_article)
                    
                    # Link all raw inspections to group article
                    for raw_id in raw_ids:
                        self._update_raw_inspection(raw_id, article_id)
                    
                    total_processed += 1
                    
                except Exception as e:
                    print(f"❌ Error processing group passes: {e}")
                    continue
            
            # Rate limiting
            time.sleep(0.5)
            
            # Progress update
            if total_processed and total_processed % 50 == 0:
                print(f"💾 Checkpoint: {total_processed} articles generated, estimated cost: ${self.total_cost:.2f}")
        
        print(f"\n🎉 Bulk upload complete!")
        print(f"Total articles: {total_processed}")
        print(f"Total cost: ${self.total_cost:.2f}")
    
    # =============================================================================
    # SCENARIO 2: DAILY VIOLATION PROCESSOR
    # =============================================================================
    
    def process_daily_violations(self):
        """
        Check for new violations and process them
        """
        print("🔍 Checking for new daily violations...")
        
        # Download latest data from Chicago Health Department
        new_data = self._download_latest_violations()
        
        if not new_data:
            print("📭 No new violations found")
            return
        
        print(f"📥 Found {len(new_data)} new violations to process")
        
        for violation_data in new_data:
            try:
                # Check for duplicates
                if self._is_duplicate(violation_data):
                    continue
                
                # Save to temp/raw storage
                raw_id = self._save_raw_inspection(violation_data)
                
                # Enrich with local context
                enriched_data = self._enrich_with_local_data(violation_data)
                
                # Generate article
                article_data = self._generate_article_haiku(enriched_data)
                
                # Save article
                article_id = self._save_article(article_data)
                
                # Link raw to article
                self._update_raw_inspection(raw_id, article_id)
                
                # Pattern check (auto-tags are already attached in _parse_article_response)
                self._check_pattern_triggers(article_data)
                
                print(f"✅ New article: {article_data['title'][:50]}...")
                
            except Exception as e:
                print(f"❌ Error processing daily violation: {e}")
                continue
    
    # =============================================================================
    # SCENARIO 3: PATTERN STORY GENERATOR
    # =============================================================================
    
    def generate_pattern_stories(self):
        """
        Analyze patterns and generate investigative stories
        """
        print("🔍 Analyzing patterns for story generation...")
        
        patterns = self._detect_patterns()
        
        for pattern in patterns:
            try:
                if pattern['significance_score'] >= 80:  # High significance threshold
                    print(f"📝 Generating pattern story: {pattern['pattern_type']}")
                    
                    # Generate investigative article with Haiku
                    story_data = self._generate_pattern_story_haiku(pattern)
                    
                    # Save pattern story
                    self._save_pattern_story(story_data)
                    
                    print(f"✅ Pattern story created: {story_data['title'][:50]}...")
                    
            except Exception as e:
                print(f"❌ Error generating pattern story: {e}")
                continue
    
    # =============================================================================
    # CORE CONTENT GENERATION WITH HAIKU
    # =============================================================================
    
    def _generate_article_haiku(self, inspection_data):
        """Generate article using Claude Haiku with cached system prompt for cost savings"""
        
        # Build data-specific prompt (much shorter, reduces costs)
        data_prompt = self._build_data_prompt(inspection_data)
        
        try:
            response = self.anthropic_client.messages.create(
                model="claude-3-haiku-20240307",
                max_tokens=2000,
                temperature=0.3,  # Lower temperature for factual reporting
                system=self.cached_system_prompt,  # Cached system prompt
                messages=[{
                    "role": "user", 
                    "content": data_prompt
                }]
            )
            
            content = response.content[0].text
            
            # Parse structured response
            article_data = self._parse_article_response(content, inspection_data)
            
            # Track cost
            self.articles_generated += 1
            self.total_cost += self.cost_per_article
            
            return article_data
            
        except Exception as e:
            print(f"❌ Error generating article with Haiku: {e}")
            raise
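
    # Note on the "cached system prompt": passing `system` as a plain string (as above)
    # resends the full prompt on every request. If Anthropic prompt caching is wanted,
    # the system prompt can be wrapped in a content block carrying a cache_control
    # marker. The helper below is an illustrative sketch and is not called anywhere
    # in this file; verify the block format against the current Anthropic API docs.
    def _cached_system_blocks(self):
        """Illustrative only: wrap the system prompt in a cacheable content block."""
        return [{
            "type": "text",
            "text": self.cached_system_prompt,
            "cache_control": {"type": "ephemeral"},
        }]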
    
    def _build_data_prompt(self, data):
        """Build data-specific prompt for use with cached system prompt"""
        
        # Get local context for the area
        local_context = self._format_local_context(data)
        
        # Get violation explanations
        violations_text = self._safe_str(data.get('violations', ''))
        violation_explanations = violation_lookup.explain_violations_for_article(violations_text)
        violation_summary = violation_lookup.get_violation_summary(violations_text)
        
        return f"""INSPECTION DATA:
Restaurant: {data.get('establishment_name', 'Unknown')}
Address: {data.get('address', 'Unknown')}, {data.get('city', 'Unknown')}, {data.get('state', 'Unknown')}
Inspection Date: {data.get('inspection_date', 'Unknown')}
Inspection ID: {data.get('inspection_id', 'Unknown')}
Results: {data.get('results', 'Unknown')}
Violations: {data.get('violations', 'None reported')}
Risk Level: {data.get('risk_level', 'Unknown')}

VIOLATION EXPLANATIONS:
{violation_explanations}

VIOLATION SUMMARY:
- Total violations: {violation_summary['total_violations']}
- Critical violations: {violation_summary['priority_count']}
- Serious violations: {violation_summary['priority_foundation_count']}  
- Minor violations: {violation_summary['core_count']}
- Worst severity level: {violation_summary['worst_severity']}

LOCAL CONTEXT:
{local_context}

Write a factual news article about this inspection following the guidelines above."""
    
    def _build_article_prompt(self, data):
        """Build comprehensive prompt for article generation"""
        
        # Get local context for the area
        local_context = self._format_local_context(data)
        
        # Get violation explanations
        violations_text = self._safe_str(data.get('violations', ''))
        violation_explanations = violation_lookup.explain_violations_for_article(violations_text)
        violation_summary = violation_lookup.get_violation_summary(violations_text)
        
        prompt = f"""You are a professional food safety journalist writing for CleanKitchens.org. Write a factual, neutral news article about this restaurant health inspection.

CRITICAL SAFETY RULES:
- Use ONLY government data provided
- NO speculation, fake quotes, or fabricated information
- Use neutral, factual language: "received citations for", "violations found", "failed inspection"
- NEVER say "was closed" or "shut down" - use "cited for violations" or "did not meet standards"
- Include educational angle with government resources
- Hyperlink government sources when referencing data
- NO libel risk - stick to official inspection results only

INSPECTION DATA:
Restaurant: {data.get('establishment_name', 'Unknown')}
Address: {data.get('address', 'Unknown')}, {data.get('city', 'Unknown')}, {data.get('state', 'Unknown')}
Inspection Date: {data.get('inspection_date', 'Unknown')}
Inspection ID: {data.get('inspection_id', 'Unknown')}
Results: {data.get('results', 'Unknown')}
Violations: {data.get('violations', 'None reported')}
Risk Level: {data.get('risk_level', 'Unknown')}

VIOLATION EXPLANATIONS:
{violation_explanations}

VIOLATION SUMMARY:
- Total violations: {violation_summary['total_violations']}
- Critical violations: {violation_summary['priority_count']}
- Serious violations: {violation_summary['priority_foundation_count']}  
- Minor violations: {violation_summary['core_count']}
- Worst severity level: {violation_summary['worst_severity']}

LOCAL CONTEXT:
{local_context}

REQUIRED ARTICLE STRUCTURE:
1. Headline (60-80 characters, factual, include restaurant name and violation type)
2. Lead paragraph (what happened, when, where - just the facts)
3. Violation details (specific citations from inspection)
4. Educational context (what these violations mean for food safety)
5. Government resources (link to health department, FDA guidelines)
6. Local context (area description, nearby landmarks if relevant)
7. Historical context (if pattern of violations exists)

GOVERNMENT SOURCES TO REFERENCE:
- Local health department: [include link to original inspection data]
- FDA Food Code: https://www.fda.gov/food/fda-food-code/food-code-2022
- CDC Food Safety: https://www.cdc.gov/foodsafety/
- State health department guidelines

FORMAT RESPONSE AS VALID JSON:
{{
    "title": "headline here",
    "content": "full article content with <a> tags for links",
    "excerpt": "brief summary (150 chars)",
    "meta_description": "SEO description (160 chars)",
    "meta_keywords": "keyword1, keyword2, keyword3",
    "internal_links": ["url1", "url2"],
    "government_citations": ["gov_url1", "gov_url2"],
    "educational_angle": "key food safety lesson",
    "auto_tags": ["tag1", "tag2", "tag3"]
}}

Write the article now:"""

        return prompt
    
    def _generate_pattern_story_haiku(self, pattern_data):
        """Generate investigative pattern story using Haiku"""
        
        articles = pattern_data['articles']
        
        prompt = f"""You are an investigative food safety journalist writing for CleanKitchens.org. Write a data-driven investigative article about a pattern detected in restaurant inspections.

CRITICAL SAFETY RULES:
- Use ONLY verified government inspection data
- NO speculation about causes or blame
- Focus on pattern analysis and data trends
- Use neutral language: "data shows", "inspections reveal", "pattern indicates"
- Include educational resources about food safety
- NEVER make accusations - just report the factual pattern
- Encourage readers to "learn more" and "stay informed"

PATTERN DATA:
Pattern Type: {pattern_data['pattern_type']}
Total Articles: {pattern_data['article_count']}
Date Range: {pattern_data['date_range']}
Significance Score: {pattern_data['significance_score']}
Location: {pattern_data.get('location', 'Multiple locations')}

ARTICLES IN PATTERN:
{self._format_articles_for_prompt(articles)}

REQUIRED STRUCTURE:
1. Investigative headline (pattern-focused, factual)
2. Lead paragraph (what pattern was found, significance)
3. Data analysis (numbers, trends, timeframe)
4. Individual case examples (3-4 specific inspections)
5. Educational context (what this means for food safety)
6. Government resources and data sources
7. Call to action (encouraging food safety awareness)

GOVERNMENT SOURCES TO CITE:
- Health department inspection database
- FDA resources on food safety
- CDC guidelines
- Local health department reports

FORMAT AS JSON:
{{
    "title": "investigative headline",
    "content": "full investigative article with links",
    "subtitle": "compelling subtitle",
    "pattern_summary": "brief pattern description",
    "data_points": ["key statistic 1", "key statistic 2"],
    "government_citations": ["source1", "source2"],
    "article_ids": {json.dumps([a['id'] for a in articles])},
    "meta_description": "SEO description",
    "significance_level": "High/Medium/Low based on data"
}}

Write the investigative article now:"""

        try:
            response = self.anthropic_client.messages.create(
                model="claude-3-haiku-20240307",
                max_tokens=3000,
                temperature=0.2,  # Very low for investigative accuracy
                messages=[{"role": "user", "content": prompt}]
            )
            
            content = response.content[0].text
            story_data = json.loads(content)
            
            # Add pattern metadata
            story_data.update({
                'pattern_id': pattern_data['pattern_id'],
                'pattern_type': pattern_data['pattern_type'],
                'article_count': pattern_data['article_count'],
                'published_date': datetime.now().isoformat(),
                'auto_generated': True
            })
            
            self.total_cost += 0.002  # Pattern stories cost more
            
            return story_data
            
        except Exception as e:
            print(f"❌ Error generating pattern story: {e}")
            raise
    
    # =============================================================================
    # METADATA ENRICHMENT
    # =============================================================================
    
    def _enrich_with_local_data(self, inspection_data):
        """Enrich inspection data with local context metadata"""
        
        address = inspection_data.get('address', '')
        city = inspection_data.get('city', '')
        
        # Get coordinates if not present
        if not inspection_data.get('latitude'):
            coords = self._geocode_address(f"{address}, {city}")
            if coords:
                inspection_data['latitude'] = coords['lat']
                inspection_data['longitude'] = coords['lng']
        
        # Enrich with local context
        inspection_data.update({
            'nearby_landmarks': self._get_nearby_landmarks(inspection_data),
            'nearby_transit': self._get_nearby_transit(inspection_data),
            'nearby_schools': self._get_nearby_schools(inspection_data),
            'nearby_attractions': self._get_nearby_attractions(inspection_data),
            'area_demographics': self._get_area_demographics(inspection_data),
            'neighborhood': self._determine_neighborhood(inspection_data),
            'district': self._determine_district(inspection_data)
        })
        
        return inspection_data
    
    def _get_nearby_landmarks(self, data):
        """Get nearby landmarks for local context"""
        # In production, this would use Google Places API or similar
        # For now, return Chicago-specific landmarks based on area
        city = self._safe_str(data.get('city', '')).lower()
        neighborhood = self._safe_str(data.get('neighborhood', '')).lower()
        
        chicago_landmarks = {
            'loop': ['Millennium Park', 'Art Institute', 'Willis Tower'],
            'lincoln park': ['Lincoln Park Zoo', 'North Avenue Beach'],
            'wicker park': ['Wicker Park', 'Bucktown'],
            'downtown': ['Navy Pier', 'Chicago River', 'Magnificent Mile']
        }
        
        for area, landmarks in chicago_landmarks.items():
            if area in neighborhood:
                return landmarks[:2]  # Return top 2
        
        return ['Downtown Chicago'] if 'chicago' in city else []
    
    def _get_nearby_transit(self, data):
        """Get nearby public transit options"""
        # Simplified for Chicago - would use transit APIs in production
        neighborhood = self._safe_str(data.get('neighborhood', '')).lower()
        
        chicago_transit = {
            'loop': ['Red Line', 'Blue Line', 'Green Line'],
            'lincoln park': ['Red Line', 'Brown Line'],
            'wicker park': ['Blue Line'],
            'downtown': ['Red Line', 'Blue Line', 'Green Line']
        }
        
        for area, transit in chicago_transit.items():
            if area in neighborhood:
                return transit[:2]
        
        return []
    
    def _get_nearby_schools(self, data):
        """Get nearby schools for family context"""
        # Would use education APIs in production
        return []  # Simplified for now
    
    def _get_nearby_attractions(self, data):
        """Get nearby tourist attractions"""
        # Would use tourism APIs in production
        neighborhood = self._safe_str(data.get('neighborhood', '')).lower()
        
        if 'downtown' in neighborhood or 'loop' in neighborhood:
            return ['Navy Pier', 'Millennium Park']
        
        return []
    
    def _get_area_demographics(self, data):
        """Get area demographic description"""
        neighborhood = self._safe_str(data.get('neighborhood', '')).lower()
        
        if 'downtown' in neighborhood or 'loop' in neighborhood:
            return 'high-traffic business and tourist area'
        elif 'lincoln park' in neighborhood:
            return 'popular residential and dining district'
        
        return 'local neighborhood'
    
    def _generate_group_pass_article_haiku(self, passes, inspection_date):
        """Generate positive group article for restaurants that passed on same date"""
        
        restaurant_list = []
        neighborhoods = set()
        
        for inspection in passes:
            name = inspection.get('establishment_name', 'Unknown')
            address = inspection.get('address', 'Unknown')
            neighborhood = inspection.get('neighborhood', inspection.get('city', 'Unknown'))
            
            restaurant_list.append(f"- {name} ({address})")
            neighborhoods.add(neighborhood)
        
        area_description = ', '.join(list(neighborhoods)[:3])  # Top 3 neighborhoods
        
        prompt = f"""You are a positive food safety journalist writing for CleanKitchens.org. Write an uplifting news article about restaurants that passed their health inspections on the same day.

CRITICAL SAFETY RULES:
- Use ONLY government inspection data provided
- Focus on POSITIVE food safety news
- Use encouraging language: "successfully passed", "maintained standards", "demonstrated compliance"
- Include educational angle about what passing means
- Hyperlink government sources and food safety resources
- NO speculation - stick to factual inspection results

GROUP INSPECTION DATA:
Inspection Date: {inspection_date}
Number of Restaurants: {len(passes)}
Area: {area_description}

RESTAURANTS THAT PASSED:
{chr(10).join(restaurant_list)}

ARTICLE FOCUS:
- Celebrate successful food safety compliance
- Explain what it means to pass inspection
- Educational value about food safety standards
- Encourage consumer confidence in these establishments
- Link to government food safety resources

REQUIRED STRUCTURE:
1. Positive headline (celebrating successful inspections)
   - END titles with inspection date in MM/DD/YY format (e.g., "Chicago Restaurants Shine with Successful Health Inspections 08/15/25")
2. Lead paragraph (X restaurants passed inspections on DATE)
3. List of establishments with addresses
4. Educational context (what passing inspection means)
5. Food safety standards explanation
6. Government resources and guidelines
7. Encouragement for dining confidence

GOVERNMENT SOURCES TO REFERENCE:
- Local health department inspection standards
- FDA Food Code: https://www.fda.gov/food/fda-food-code/food-code-2022
- CDC Food Safety guidelines: https://www.cdc.gov/foodsafety/

FORMAT RESPONSE AS VALID JSON:
{{
    "title": "positive headline celebrating passes",
    "content": "full positive article content with <a> tags for links",
    "excerpt": "brief positive summary (150 chars)",
    "meta_description": "SEO description highlighting successful inspections (160 chars)",
    "meta_keywords": "food safety, passed inspection, clean restaurants",
    "internal_links": ["url1", "url2"],
    "government_citations": ["gov_url1", "gov_url2"],
    "educational_angle": "key food safety compliance lesson",
    "auto_tags": ["passed-inspection", "food-safety-success", "chicago"],
    "article_type": "group_pass",
    "restaurant_count": "NUMBER_OF_RESTAURANTS"
}}

Write the positive article now:"""

        try:
            response = self.anthropic_client.messages.create(
                model="claude-3-haiku-20240307",
                max_tokens=2000,
                temperature=0.3,
                messages=[{"role": "user", "content": prompt}]
            )
            
            content = response.content[0].text
            article_data = self._parse_article_response(content, {
                'inspection_date': inspection_date,
                'establishment_name': f"{len(passes)} Restaurants",
                'city': list(neighborhoods)[0] if neighborhoods else 'Chicago',
                'state': 'IL',
                'inspection_id': f"group_{inspection_date}_{len(passes)}",
                'results': 'Pass'
            })
            
            # Generate auto-tags for group article
            group_data = {
                'establishment_name': ', '.join([p.get('establishment_name', '') for p in passes[:5]]),  # First 5 names
                'violations': '',  # Group passes have no violations
                'results': 'Pass',
                'city': list(neighborhoods)[0] if neighborhoods else 'Chicago',
                'content': article_data.get('content', ''),
                'title': article_data.get('title', ''),
                'inspection_date': inspection_date,
                'establishment_type': 'restaurant'  # Default for group
            }
            auto_tags = self._auto_tag_article(group_data)
            auto_tags.append('group-pass')  # Add special tag for group articles
            
            # Add group-specific metadata
            article_data.update({
                'article_type': 'group_pass',
                'restaurant_count': len(passes),
                'establishments': [p.get('establishment_name') for p in passes],
                'group_inspection_ids': [p.get('inspection_id') for p in passes],
                'auto_tags': auto_tags,  # Add comprehensive auto-tags
                'violations': '',  # No violations for passes
                'results': 'Pass',
                'risk_level': '',
                'establishment_type': 'restaurant'
            })
            
            self.total_cost += self.cost_per_article
            
            return article_data
            
        except Exception as e:
            print(f"❌ Error generating group pass article: {e}")
            raise
    
    # =============================================================================
    # UTILITY FUNCTIONS
    # =============================================================================
    
    def _group_inspections_by_date(self, inspections):
        """Group inspections by date and combine duplicates"""
        from collections import defaultdict
        
        grouped = defaultdict(list)
        
        for inspection in inspections:
            date_key = inspection.get('inspection_date', 'unknown')
            
            # Check if this location already exists for this date
            existing = None
            for existing_inspection in grouped[date_key]:
                if (existing_inspection.get('establishment_name') == inspection.get('establishment_name') and
                    existing_inspection.get('address') == inspection.get('address')):
                    existing = existing_inspection
                    break
            
            if existing:
                # Combine duplicate entries
                self._combine_duplicate_inspections(existing, inspection)
            else:
                # Add new inspection
                grouped[date_key].append(inspection)
        
        return dict(grouped)
    
    def _combine_duplicate_inspections(self, existing, duplicate):
        """Combine duplicate inspection entries for same location/date"""
        
        # Combine violations
        existing_violations = self._safe_str(existing.get('violations', ''))
        duplicate_violations = self._safe_str(duplicate.get('violations', ''))
        
        if duplicate_violations and duplicate_violations not in existing_violations:
            # _safe_str has already normalized both values to plain strings
            combined_violations = f"{existing_violations}; {duplicate_violations}".strip('; ')
            existing['violations'] = combined_violations
        
        # Take worst result
        existing_result = self._safe_str(existing.get('results', '')).lower()
        duplicate_result = self._safe_str(duplicate.get('results', '')).lower()
        
        failure_indicators = ['fail', 'conditional', 'out of business', 'not ready']
        
        if any(indicator in duplicate_result for indicator in failure_indicators):
            existing['results'] = duplicate.get('results')
        
        # Combine inspection IDs for tracking
        existing_ids = existing.get('combined_inspection_ids', [existing.get('inspection_id')])
        if duplicate.get('inspection_id') not in existing_ids:
            existing_ids.append(duplicate.get('inspection_id'))
        existing['combined_inspection_ids'] = existing_ids
        
        print(f"🔗 Combined duplicate: {existing.get('establishment_name')} ({len(existing_ids)} entries)")
    
    def _is_passing_inspection(self, inspection):
        """Determine if inspection is a pass or failure"""
        result = self._safe_str(inspection.get('results', '')).lower()
        violations = self._safe_str(inspection.get('violations', '')).lower()
        
        # Clear failure indicators
        failure_indicators = [
            'fail', 'conditional', 'out of business', 'not ready',
            'critical', 'serious', 'major violation'
        ]
        
        # Check result
        if any(indicator in result for indicator in failure_indicators):
            return False
        
        # Check violations
        if violations and any(indicator in violations for indicator in failure_indicators):
            return False
        
        # If result contains pass-like terms
        pass_indicators = ['pass', 'approved', 'satisfactory']
        if any(indicator in result for indicator in pass_indicators):
            return True
        
        # If no violations text, assume pass
        if not violations or (isinstance(violations, str) and violations.strip() == '') or (isinstance(violations, float) and pd.isna(violations)):
            return True
        
        # Default to failure if unclear (conservative approach)
        return False
    
    def _row_to_inspection_data(self, row):
        """Convert CSV row to inspection data dict"""
        return {
            'inspection_id': str(row.get('inspection_id', '')),
            'establishment_name': row.get('dba_name', ''),
            'address': row.get('address', ''),
            'city': row.get('city', ''),
            'state': row.get('state', 'IL'),
            'zip_code': str(row.get('zip', '')),
            'inspection_date': row.get('inspection_date', ''),
            'inspection_type': row.get('inspection_type', ''),
            'results': row.get('results', ''),
            'violations': row.get('violations', ''),
            'license_number': str(row.get('license_', '')),
            'establishment_type': row.get('facility_type', ''),
            'risk_level': row.get('risk', ''),
            'latitude': row.get('latitude', None),
            'longitude': row.get('longitude', None)
        }
    
    def _is_duplicate(self, inspection_data):
        """Check if inspection already exists"""
        inspection_id = inspection_data.get('inspection_id')
        if not inspection_id:
            return False
        
        # Check raw storage (processed articles reference the same inspection_id)
        try:
            existing = self.raw_inspections.query.fetch_objects(
                filters=Filter.by_property("inspection_id").equal(inspection_id),
                limit=1
            )
            return len(existing.objects) > 0
        except Exception:
            return False
    
    def _save_raw_inspection(self, inspection_data):
        """Save raw inspection data"""
        data_hash = hashlib.md5(json.dumps(inspection_data, sort_keys=True).encode()).hexdigest()
        
        raw_data = {
            'inspection_id': inspection_data.get('inspection_id'),
            'raw_data': json.dumps(inspection_data),
            'data_hash': data_hash,
            'download_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
            'processed': False
        }
        
        return self.raw_inspections.data.insert(raw_data)
    
    def _save_article(self, article_data):
        """Save generated article to Weaviate with duplicate checking"""
        
        # Check if article with this slug already exists
        slug = article_data.get('slug')
        if slug:
            try:
                # Check if article exists using the Weaviate v4 filter API
                try:
                    existing = self.articles.query.fetch_objects(
                        filters=Filter.by_property("slug").equal(slug),
                        limit=1
                    )
                except TypeError:
                    # Fallback for older API
                    existing_list = self.articles.query.fetch_objects(limit=1000)
                    filtered = [obj for obj in existing_list.objects if obj.properties.get('slug') == slug]
                    existing = type('obj', (object,), {'objects': filtered})()
                
                if hasattr(existing, 'objects') and existing.objects:
                    print(f"⚠️  Article with slug '{slug}' already exists, skipping duplicate")
                    return existing.objects[0].uuid
                    
            except Exception as e:
                print(f"❌ Error checking for duplicate article: {e}")
        
        # Insert new article
        article_id = self.articles.data.insert(article_data)
        
        # Generate tag pages for this article's tags
        self._create_tag_pages(article_data.get('auto_tags', []))
        
        return article_id
    
    def _create_tag_pages(self, tags):
        """Create or update tag pages for given tags"""
        for tag in tags:
            if not tag or (isinstance(tag, str) and not tag.strip()) or (isinstance(tag, float) and pd.isna(tag)):
                continue
                
            try:
                # Check if tag page already exists using the Weaviate v4 filter API
                try:
                    existing = self.tag_pages.query.fetch_objects(
                        filters=Filter.by_property("tag_name").equal(tag),
                        limit=1
                    )
                except TypeError:
                    # Fallback for older API
                    existing = self.tag_pages.query.fetch_objects(limit=100)
                    existing = [obj for obj in existing.objects if obj.properties.get('tag_name') == tag]
                    existing = type('obj', (object,), {'objects': existing})()
                
                if hasattr(existing, 'objects') and existing.objects:
                    # Update existing tag page with new article count
                    tag_obj = existing.objects[0]
                    current_count = tag_obj.properties.get('article_count', 0)
                    
                    self.tag_pages.data.update(
                        uuid=tag_obj.uuid,
                        properties={
                            'article_count': current_count + 1,
                            'last_updated': datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
                        }
                    )
                else:
                    # Create new tag page
                    tag_page_data = {
                        'tag_name': tag,
                        'slug': self._generate_slug(f"tag-{tag}"),
                        'title': f"{tag} - Chicago Restaurant Health Inspections",
                        'description': f"All Chicago restaurant health inspection articles tagged with '{tag}'. See the latest violations, closures, and passes.",
                        'article_count': 1,
                        'created_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
                        'last_updated': datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
                    }
                    
                    self.tag_pages.data.insert(tag_page_data)
                    
            except Exception as e:
                print(f"❌ Error creating tag page for '{tag}': {e}")
                continue
    
    def _update_raw_inspection(self, raw_id, article_id):
        """Update raw inspection with article link"""
        self.raw_inspections.data.update(
            uuid=raw_id,
            properties={
                'processed': True,
                'processed_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
                'article_id': article_id
            }
        )
    
    def _parse_article_response(self, content, inspection_data):
        """Parse Haiku response into structured article data"""
        try:
            # Try to extract JSON from response
            start_idx = content.find('{')
            end_idx = content.rfind('}') + 1
            json_str = content[start_idx:end_idx]
            
            # Clean control characters that cause JSON parsing issues
            import re
            json_str = re.sub(r'[\x00-\x1f\x7f-\x9f]', '', json_str)
            
            parsed = json.loads(json_str)
            
            # Use inspection date as publish date for historical accuracy
            inspection_date = inspection_data.get('inspection_date')
            if inspection_date:
                # Convert inspection date to ISO format
                try:
                    if isinstance(inspection_date, str):
                        # Parse common date formats
                        from dateutil import parser
                        parsed_date = parser.parse(inspection_date)
                        publish_date = parsed_date.strftime('%Y-%m-%dT%H:%M:%S.%fZ')
                    else:
                        publish_date = inspection_date.strftime('%Y-%m-%dT%H:%M:%S.%fZ') if hasattr(inspection_date, 'strftime') else datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
                except Exception:
                    publish_date = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
            else:
                publish_date = datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%fZ')
            
            # Generate comprehensive auto-tags
            combined_data = {**inspection_data, **parsed}
            auto_tags = self._auto_tag_article(combined_data)
            
            # Add metadata from inspection + auto-tags
            parsed.update({
                'slug': self._generate_slug(parsed['title'], inspection_date),
                'published_date': publish_date,
                'last_updated': datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),  # When we generated the article
                'establishment_name': inspection_data.get('establishment_name'),
                'inspection_id': inspection_data.get('inspection_id'),
                'inspection_date': publish_date,  # Use same formatted date
                'city': inspection_data.get('city'),
                'state': inspection_data.get('state'),
                'ai_generated': True,
                'content_version': 'v2',
                'data_source': 'chicago_health_dept',
                'processing_date': datetime.now().strftime('%Y-%m-%dT%H:%M:%S.%fZ'),  # When we processed it
                'auto_tags': auto_tags,  # Add comprehensive auto-tags
                'violations': inspection_data.get('violations', ''),  # Preserve violation text for tagging
                'results': inspection_data.get('results', ''),  # Preserve results for tagging
                'risk_level': inspection_data.get('risk_level', ''),  # Preserve risk level
                'establishment_type': inspection_data.get('establishment_type', ''),  # Preserve type
                'image_url': self._select_violation_image(inspection_data.get('violations', ''))  # Add appropriate image
            })
            
            return parsed
            
        except Exception as e:
            print(f"❌ Error parsing article response: {e}")
            raise
    
    def _generate_slug(self, title, inspection_date=None):
        """Generate URL slug from title with optional date prefix"""
        import re
        
        # Clean title for slug
        safe_title = self._safe_str(title, 'untitled')
        slug = re.sub(r'[^a-zA-Z0-9\s-]', '', safe_title.lower())
        slug = re.sub(r'\s+', '-', slug.strip())
        
        # Add date prefix if inspection_date provided
        if inspection_date:
            try:
                from dateutil import parser
                if isinstance(inspection_date, str):
                    parsed_date = parser.parse(inspection_date)
                else:
                    parsed_date = inspection_date
                
                # Format as MMDDYY (e.g., 081525 for August 15, 2025)
                date_prefix = parsed_date.strftime('%m%d%y')
                slug = f"{date_prefix}-{slug}"
                
            except Exception as e:
                print(f"⚠️  Could not parse date for slug: {e}")
        
        return slug[:100]  # Limit length
    
    def _select_violation_image(self, violations_text):
        """Select appropriate image based on violation content"""
        import random
        
        if not violations_text:
            return f'/assets/images/violations/general_{random.randint(1,3)}.jpg'
        
        violations_lower = self._safe_str(violations_text).lower()
        
        # Map violation keywords to specific images
        if any(word in violations_lower for word in ['temperature', 'temp', 'cold', 'hot', 'tcs']):
            return f'/assets/images/violations/temperature_{random.randint(1,3)}.jpg'
        elif any(word in violations_lower for word in ['rodent', 'mouse', 'mice', 'rat', 'droppings']):
            return f'/assets/images/violations/rodent_{random.randint(1,3)}.jpg'
        elif any(word in violations_lower for word in ['roach', 'cockroach', 'insect', 'fly', 'flies']):
            return f'/assets/images/violations/pest_{random.randint(1,3)}.jpg'
        elif any(word in violations_lower for word in ['hand', 'wash', 'soap', 'sanitizer', 'hygiene']):
            return f'/assets/images/violations/handwashing_{random.randint(1,3)}.jpg'
        elif any(word in violations_lower for word in ['clean', 'sanitize', 'dirty', 'soil', 'grease']):
            return f'/assets/images/violations/sanitation_{random.randint(1,3)}.jpg'
        elif any(word in violations_lower for word in ['storage', 'store', 'label', 'date', 'fifo']):
            return f'/assets/images/violations/storage_{random.randint(1,3)}.jpg'
        elif any(word in violations_lower for word in ['cross', 'contamination', 'raw', 'cooked']):
            return f'/assets/images/violations/cross_contamination_{random.randint(1,3)}.jpg'
        elif any(word in violations_lower for word in ['license', 'permit', 'certificate', 'document']):
            return f'/assets/images/violations/documentation_{random.randint(1,3)}.jpg'
        elif any(word in violations_lower for word in ['floor', 'wall', 'ceiling', 'repair', 'maintain']):
            return f'/assets/images/violations/structural_{random.randint(1,3)}.jpg'
        elif any(word in violations_lower for word in ['plumb', 'water', 'sink', 'drain', 'leak']):
            return f'/assets/images/violations/plumbing_{random.randint(1,3)}.jpg'
        elif 'closure' in violations_lower or 'closed' in violations_lower:
            return f'/assets/images/violations/closure_{random.randint(1,3)}.jpg'
        else:
            # Default to general violation image
            return f'/assets/images/violations/general_{random.randint(1,3)}.jpg'
    
    # =============================================================================
    # PATTERN DETECTION & AUTO-TAGGING
    # =============================================================================
    
    def _auto_tag_article(self, article_data):
        """Auto-generate reader-friendly, relevant tags for article"""
        auto_tags = []
        
        # Extract data for tagging
        establishment_name = self._safe_str(article_data.get('establishment_name', '')).lower()
        violations = self._safe_str(article_data.get('violations', '')).lower()
        results = self._safe_str(article_data.get('results', '')).lower()
        city = self._safe_str(article_data.get('city', '')).lower()
        establishment_type = self._safe_str(article_data.get('establishment_type', '')).lower()
        content = self._safe_str(article_data.get('content', '')).lower()
        title = self._safe_str(article_data.get('title', '')).lower()
        
        # CORE INSPECTION RESULT TAGS
        if 'fail' in results:
            auto_tags.append('Failed Inspection')
        elif 'pass' in results:
            if 'condition' in results:
                auto_tags.append('Conditional Pass')
            else:
                auto_tags.append('Passed Inspection')
        
        # 1. LOCATION TAGS
        if 'chicago' in city:
            auto_tags.append('Chicago')
        
        # Add neighborhood tags based on common Chicago areas
        neighborhood_keywords = {
            'loop': ['loop', 'downtown'],
            'lincoln-park': ['lincoln park'],
            'wicker-park': ['wicker park'],
            'river-north': ['river north'],
            'gold-coast': ['gold coast'],
            'old-town': ['old town'],
            'bucktown': ['bucktown'],
            'logan-square': ['logan square'],
            'wrigleyville': ['wrigleyville'],
            'chinatown': ['chinatown'],
            'little-italy': ['little italy'],
            'pilsen': ['pilsen'],
            'ukrainian-village': ['ukrainian village']
        }
        
        for neighborhood, keywords in neighborhood_keywords.items():
            if any(keyword in establishment_name or keyword in content for keyword in keywords):
                auto_tags.append(neighborhood)
        
        # 2. ESTABLISHMENT TYPE TAGS
        establishment_type_mapping = {
            'restaurant': ['restaurant', 'diner', 'cafe', 'bistro', 'grill'],
            'fast-food': ['mcdonalds', 'burger king', 'kfc', 'taco bell', 'subway', 'wendys', 'pizza hut', 'dominos'],
            'pizza': ['pizza', 'pizzeria'],
            'coffee-shop': ['coffee', 'starbucks', 'dunkin'],
            'bar': ['bar', 'pub', 'tavern', 'lounge'],
            'bakery': ['bakery', 'pastry'],
            'grocery': ['grocery', 'market', 'store'],
            'catering': ['catering', 'cater'],
            'food-truck': ['truck', 'mobile']
        }
        
        for tag, keywords in establishment_type_mapping.items():
            if any(keyword in establishment_name or keyword in establishment_type for keyword in keywords):
                auto_tags.append(tag)
        
        # 3. CHAIN TAGS
        chain_keywords = {
            'mcdonalds': ['mcdonald', 'mcdonalds'],
            'starbucks': ['starbucks'],
            'subway': ['subway'],
            'dunkin': ['dunkin'],
            'taco-bell': ['taco bell'],
            'pizza-hut': ['pizza hut'],
            'kfc': ['kfc', 'kentucky fried'],
            'burger-king': ['burger king'],
            'wendys': ['wendys', "wendy's"],
            'dominos': ['dominos', "domino's"],
            'chipotle': ['chipotle'],
            'panera': ['panera']
        }
        
        for chain, keywords in chain_keywords.items():
            if any(keyword in establishment_name for keyword in keywords):
                auto_tags.append(chain)
        
        # 4. VIOLATION TYPE TAGS
        violation_keywords = {
            'temperature-control': ['temperature', 'temp', 'cold', 'hot', 'refrigerat', 'freezer', 'tcs foods'],
            'cleanliness': ['clean', 'sanitary', 'wash', 'dirty', 'soil', 'debris'],
            'rodent': ['rodent', 'rat', 'mice', 'mouse', 'droppings'],
            'roach': ['roach', 'cockroach', 'insect', 'pest'],
            'cross-contamination': ['cross contamination', 'contaminat', 'separate', 'raw', 'ready-to-eat'],
            'handwashing': ['hand wash', 'handwash', 'soap', 'towel', 'sink'],
            'mold': ['mold', 'mildew', 'fungus'],
            'plumbing': ['plumb', 'drain', 'pipe', 'leak', 'sewage', 'waste water'],
            'lighting': ['light', 'illuminat', 'bulb', 'fixture'],
            'structural': ['ceiling', 'wall', 'floor', 'repair', 'maintain', 'construct'],
            'equipment': ['equipment', 'machine', 'appliance', 'repair', 'broken'],
            'garbage': ['garbage', 'trash', 'waste', 'dumpster', 'refuse'],
            'food-storage': ['storage', 'store', 'shelf', 'container'],
            'employee-health': ['employee', 'worker', 'staff', 'health', 'illness']
        }
        
        for violation_tag, keywords in violation_keywords.items():
            if any(keyword in violations or keyword in content or keyword in title for keyword in keywords):
                auto_tags.append(violation_tag)
        
        # 5. SEVERITY TAGS
        if 'fail' in results:
            auto_tags.append('failed-inspection')
        elif 'pass' in results and 'condition' in results:
            auto_tags.append('conditional-pass')
        elif 'pass' in results:
            auto_tags.append('passed-inspection')
        
        if 'out of business' in results:
            auto_tags.append('closure')
        
        # Critical violation indicators
        critical_keywords = ['critical', 'priority', 'immediate', 'citation', 'serious']
        if any(keyword in violations or keyword in content for keyword in critical_keywords):
            auto_tags.append('critical-violation')
        
        # 6. SEASONAL/TEMPORAL TAGS
        inspection_date = article_data.get('inspection_date', '')
        if inspection_date:
            try:
                from dateutil import parser
                date_obj = parser.parse(str(inspection_date))
                
                # Add year tag
                auto_tags.append(f"year-{date_obj.year}")
                
                # Add season tags
                month = date_obj.month
                if month in [12, 1, 2]:
                    auto_tags.append('winter')
                elif month in [3, 4, 5]:
                    auto_tags.append('spring')
                elif month in [6, 7, 8]:
                    auto_tags.append('summer')
                elif month in [9, 10, 11]:
                    auto_tags.append('fall')
                
            except Exception:
                pass
        
        # 7. RISK LEVEL TAGS
        risk_level = self._safe_str(article_data.get('risk_level', '')).lower()
        if 'high' in risk_level or 'risk 1' in risk_level:
            auto_tags.append('high-risk')
        elif 'medium' in risk_level or 'risk 2' in risk_level:
            auto_tags.append('medium-risk')
        elif 'low' in risk_level or 'risk 3' in risk_level:
            auto_tags.append('low-risk')
        
        # ESTABLISHMENT TYPE TAGS (Simple and relevant)
        if 'restaurant' in establishment_name or 'restaurant' in establishment_type:
            auto_tags.append('Restaurant')
        elif 'coffee' in establishment_name or 'starbucks' in establishment_name or 'dunkin' in establishment_name:
            auto_tags.append('Coffee Shop')
        elif 'bar' in establishment_name or 'tavern' in establishment_name or 'pub' in establishment_name:
            auto_tags.append('Bar')
        elif 'pizza' in establishment_name:
            auto_tags.append('Pizza')
        elif 'grocery' in establishment_name or 'market' in establishment_name:
            auto_tags.append('Grocery Store')
        elif 'bakery' in establishment_name:
            auto_tags.append('Bakery')
        else:
            auto_tags.append('Food Service')
        
        # MAJOR VIOLATION TYPES (Only the important ones people care about)
        if any(word in violations for word in ['temperature', 'temp', 'tcs']):
            auto_tags.append('Temperature Violations')
        if any(word in violations for word in ['rodent', 'rat', 'mice']):
            auto_tags.append('Rodent Problem')
        if any(word in violations for word in ['roach', 'cockroach']):
            auto_tags.append('Pest Problem')
        if any(word in violations for word in ['clean', 'sanitary', 'dirty']):
            auto_tags.append('Cleanliness Issues')
        if any(word in violations for word in ['hand', 'wash', 'soap']):
            auto_tags.append('Handwashing Issues')
        
        # POPULAR CHAINS (Only major ones people search for)
        major_chains = {
            "McDonald's": ['mcdonald'],
            'Starbucks': ['starbucks'],
            'Subway': ['subway'],
            "Dunkin'": ['dunkin'],
            'Taco Bell': ['taco bell'],
            'Pizza Hut': ['pizza hut'],
            'KFC': ['kfc'],
            'Burger King': ['burger king'],
            'Chipotle': ['chipotle'],
            'Panera': ['panera']
        }
        
        for chain_name, keywords in major_chains.items():
            if any(keyword in establishment_name for keyword in keywords):
                auto_tags.append(chain_name)
        
        # Always add Chicago for all articles
        auto_tags.append('Chicago')
        
        # Remove duplicates and limit to 8 relevant tags max
        auto_tags = list(set([tag for tag in auto_tags if tag and isinstance(tag, str) and tag.strip()]))[:8]
        
        return auto_tags
    
    def _check_pattern_triggers(self, article_data):
        """Check if new article triggers any patterns"""
        # Implementation for pattern checking
        pass
    
    def _detect_patterns(self):
        """Detect patterns in existing articles"""
        # Implementation for pattern detection
        return []
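
    # Illustrative sketch of one possible pattern heuristic: establishments that appear
    # in several failed-inspection articles. It is not wired into generate_pattern_stories;
    # field names mirror those saved by _parse_article_response, and the significance
    # scoring is an arbitrary placeholder.
    def _detect_repeat_failure_patterns(self, min_articles=3, sample_limit=1000):
        """Sketch only: group failed-inspection articles by establishment name."""
        from collections import defaultdict

        result = self.articles.query.fetch_objects(limit=sample_limit)
        by_establishment = defaultdict(list)
        for obj in result.objects:
            props = obj.properties
            if 'fail' in str(props.get('results', '')).lower():
                by_establishment[props.get('establishment_name', 'Unknown')].append({
                    'id': str(obj.uuid),
                    'title': props.get('title', ''),
                    'establishment_name': props.get('establishment_name', ''),
                    'inspection_date': props.get('inspection_date', '')
                })

        patterns = []
        for name, grouped_articles in by_establishment.items():
            if len(grouped_articles) >= min_articles:
                dates = sorted(a['inspection_date'] for a in grouped_articles if a['inspection_date'])
                patterns.append({
                    'pattern_id': f"repeat-failure-{str(name).lower().replace(' ', '-')}",
                    'pattern_type': 'repeat_failures',
                    'articles': grouped_articles,
                    'article_count': len(grouped_articles),
                    'date_range': f"{dates[0]} to {dates[-1]}" if dates else 'unknown',
                    'location': name,
                    'significance_score': min(100, 60 + 10 * len(grouped_articles))  # placeholder scoring
                })
        return patterns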
    
    # =============================================================================
    # DATA SOURCE INTEGRATION
    # =============================================================================
    
    def _download_latest_violations(self):
        """Download latest violations from Chicago Health Department"""
        # Implementation for daily data download
        return []
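
    # Illustrative sketch of pulling recent rows from the Chicago Data Portal's Socrata
    # API. The dataset resource ID (4ijn-s7e5) and field names are assumptions based on
    # the public Food Inspections dataset and should be verified before relying on them.
    def _download_latest_violations_socrata(self, days_back=1, limit=1000):
        """Sketch only: fetch inspections from the last `days_back` days."""
        since = (datetime.now() - timedelta(days=days_back)).strftime('%Y-%m-%dT00:00:00')
        url = "https://data.cityofchicago.org/resource/4ijn-s7e5.json"
        params = {
            "$where": f"inspection_date >= '{since}'",
            "$limit": limit
        }
        try:
            response = requests.get(url, params=params, timeout=30)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"❌ Error downloading latest violations: {e}")
            return []

        # Map Socrata field names onto the internal inspection dict shape
        return [{
            'inspection_id': str(row.get('inspection_id', '')),
            'establishment_name': row.get('dba_name', ''),
            'address': row.get('address', ''),
            'city': row.get('city', ''),
            'state': row.get('state', 'IL'),
            'zip_code': str(row.get('zip', '')),
            'inspection_date': row.get('inspection_date', ''),
            'inspection_type': row.get('inspection_type', ''),
            'results': row.get('results', ''),
            'violations': row.get('violations', ''),
            'license_number': str(row.get('license_', '')),
            'establishment_type': row.get('facility_type', ''),
            'risk_level': row.get('risk', ''),
            'latitude': row.get('latitude'),
            'longitude': row.get('longitude')
        } for row in response.json()]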
    
    def _geocode_address(self, address):
        """Get coordinates for address"""
        # Implementation for geocoding
        return None
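
    # Illustrative geocoding sketch using OpenStreetMap's public Nominatim endpoint.
    # Nominatim requires a descriptive User-Agent and enforces strict rate limits, so a
    # production setup would more likely use a keyed geocoding service instead.
    def _geocode_address_nominatim(self, address):
        """Sketch only: return {'lat': float, 'lng': float} or None."""
        try:
            response = requests.get(
                "https://nominatim.openstreetmap.org/search",
                params={"q": address, "format": "json", "limit": 1},
                headers={"User-Agent": "CleanKitchensProcessor/1.0"},
                timeout=15
            )
            response.raise_for_status()
            results = response.json()
            if results:
                return {'lat': float(results[0]['lat']), 'lng': float(results[0]['lon'])}
        except (requests.RequestException, KeyError, ValueError) as e:
            print(f"⚠️  Geocoding failed for '{address}': {e}")
        return None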
    
    def _determine_neighborhood(self, inspection_data):
        """Determine neighborhood from coordinates/address"""
        # Simplified Chicago neighborhood detection
        return 'Downtown'
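
    # Illustrative sketch of neighborhood lookup via point-in-polygon against a Chicago
    # community-areas GeoJSON file. The file path and the 'community' property name are
    # assumptions, and shapely is an optional dependency not imported at module level.
    def _determine_neighborhood_from_coords(self, inspection_data,
                                            geojson_path='data/chicago_community_areas.geojson'):
        """Sketch only: match latitude/longitude to a named community area."""
        lat = inspection_data.get('latitude')
        lng = inspection_data.get('longitude')
        if lat is None or lng is None:
            return 'Downtown'  # same default as _determine_neighborhood

        try:
            from shapely.geometry import shape, Point  # optional dependency
            with open(geojson_path) as f:
                areas = json.load(f)
            point = Point(float(lng), float(lat))  # GeoJSON uses (lon, lat) order
            for feature in areas.get('features', []):
                if shape(feature['geometry']).contains(point):
                    return feature['properties'].get('community', 'Downtown')
        except Exception as e:
            print(f"⚠️  Neighborhood lookup failed: {e}")
        return 'Downtown'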
    
    def _determine_district(self, inspection_data):
        """Determine district/ward from coordinates"""
        return 'District 1'
    
    def _format_local_context(self, data):
        """Format local context for prompt"""
        context = []
        
        if data.get('nearby_landmarks'):
            context.append(f"Near landmarks: {', '.join(data['nearby_landmarks'])}")
        
        if data.get('nearby_transit'):
            context.append(f"Transit access: {', '.join(data['nearby_transit'])}")
        
        if data.get('area_demographics'):
            context.append(f"Area: {data['area_demographics']}")
        
        return '\n'.join(context) if context else 'No local context available'
    
    def _format_articles_for_prompt(self, articles):
        """Format articles for pattern story prompt"""
        formatted = []
        for article in articles[:5]:  # Limit to top 5
            formatted.append(f"- {article.get('title', 'Unknown')}: {article.get('establishment_name', 'Unknown')} ({article.get('inspection_date', 'Unknown')})")
        return '\n'.join(formatted)
    
    def _save_pattern_story(self, story_data):
        """Save pattern story to Weaviate"""
        return self.pattern_stories.data.insert(story_data)

# =============================================================================
# MAIN EXECUTION
# =============================================================================

def main():
    processor = CleanKitchensProcessor()
    
    print("CleanKitchens Comprehensive Processor")
    print("=====================================")
    print("1. Bulk Upload Processing")
    print("2. Daily Violation Check")
    print("3. Pattern Story Generation")
    print("4. All Processing (Daily + Patterns)")
    
    choice = input("\nSelect processing mode (1-4): ")
    
    if choice == "1":
        csv_file = input("Enter path to CSV file: ")
        batch_size = int(input("Enter batch size (default 100): ") or 100)
        processor.process_bulk_upload(csv_file, batch_size)
    
    elif choice == "2":
        processor.process_daily_violations()
    
    elif choice == "3":
        processor.generate_pattern_stories()
    
    elif choice == "4":
        processor.process_daily_violations()
        processor.generate_pattern_stories()
    
    else:
        print("Invalid choice")

if __name__ == "__main__":
    main()