#!/usr/bin/env python3
"""
Update Weaviate schema for comprehensive metadata and pattern detection
"""

import weaviate
import weaviate.classes.config as wvc

def _properties(*fields):
    """Build Weaviate ``Property`` objects from ``(name, data_type)`` pairs.

    Args:
        *fields: ``(name, data_type)`` tuples, where ``data_type`` is a
            ``wvc.DataType`` member.

    Returns:
        A list of ``wvc.Property`` objects in the same order as *fields*.
    """
    return [
        wvc.Property(name=name, data_type=data_type)
        for name, data_type in fields
    ]


def _recreate_collection(client, name, description, properties,
                         creating_message=None):
    """Drop collection *name* if it exists, then create it from scratch.

    Args:
        client: Connected Weaviate client.
        name: Collection name.
        description: Human-readable collection description.
        properties: List of ``wvc.Property`` definitions for the collection.
        creating_message: Progress line printed just before creation;
            defaults to ``"Creating <name> collection..."``.
    """
    # Delete existing collection if it exists to recreate with new schema.
    # The deletion is always announced because it is a destructive step.
    if client.collections.exists(name):
        print(f"Deleting existing {name} collection...")
        client.collections.delete(name)

    print(creating_message or f"Creating {name} collection...")
    # No vectorizer is configured for now - can add later if needed
    client.collections.create(
        name=name,
        description=description,
        properties=properties,
    )
    print(f"✓ {name} collection created successfully")


def _articles_properties():
    """Return the property definitions for the Articles collection."""
    text = wvc.DataType.TEXT
    text_array = wvc.DataType.TEXT_ARRAY
    date = wvc.DataType.DATE
    number = wvc.DataType.NUMBER
    integer = wvc.DataType.INT
    boolean = wvc.DataType.BOOL

    return _properties(
        # Core article data
        ("title", text),
        ("content", text),
        ("excerpt", text),
        ("slug", text),
        ("published_date", date),
        ("meta_description", text),
        ("meta_keywords", text),

        # Restaurant/Establishment data
        ("establishment_name", text),
        ("establishment_type", text),       # restaurant, grocery, etc
        ("cuisine_type", text),             # italian, chinese, etc
        ("chain_name", text),               # mcdonalds, subway, etc
        ("license_number", text),

        # Location data
        ("address", text),
        ("city", text),
        ("state", text),
        ("zip_code", text),
        ("latitude", number),
        ("longitude", number),
        ("neighborhood", text),
        ("district", text),
        ("ward", text),

        # Inspection data
        ("inspection_id", text),
        ("inspection_date", date),
        ("inspection_type", text),
        ("results", text),                  # pass, fail, etc
        ("violations", text),               # full violation text
        ("violation_count", integer),
        ("critical_violations", integer),
        ("risk_level", text),               # high, medium, low
        ("is_closure", boolean),
        ("follow_up_required", boolean),

        # Local context metadata (for pattern detection and story enrichment)
        ("nearby_landmarks", text_array),   # ["Millennium Park", "Art Institute"]
        ("nearby_transit", text_array),     # ["Red Line", "Blue Line"]
        ("nearby_schools", text_array),     # ["Lincoln Elementary"]
        ("nearby_attractions", text_array),  # ["Navy Pier", "Shedd Aquarium"]
        ("area_demographics", text),        # "high-traffic tourist area"
        ("local_events", text_array),       # ongoing events near location

        # Auto-generated tags and categorization
        ("auto_tags", text_array),          # ["temperature", "mcdonalds", "loop"]
        ("violation_categories", text_array),  # ["temperature", "cleanliness"]
        ("severity_tags", text_array),      # ["critical", "repeat"]
        ("location_tags", text_array),      # ["downtown", "loop", "chicago"]
        ("chain_tags", text_array),         # ["fast-food", "burger-chain"]

        # Pattern detection metadata
        ("pattern_ids", text_array),        # patterns this article belongs to
        ("similar_articles", text_array),   # IDs of similar articles
        ("trend_indicators", text_array),   # ["seasonal", "chain-wide"]

        # Internal linking and cross-references
        ("internal_links", text_array),     # URLs linked to in article
        ("related_articles", text_array),   # related article IDs
        ("government_citations", text_array),  # gov links cited
        ("news_citations", text_array),     # news articles cited

        # Image and media
        ("image_url", text),
        ("image_alt", text),
        ("image_category", text),           # type of image used

        # Processing metadata
        ("data_source", text),              # "chicago_health_dept"
        ("processing_date", date),
        ("last_updated", date),
        ("content_version", text),          # v1, v2 for tracking
        ("ai_generated", boolean),
        ("human_reviewed", boolean),

        # Raw data preservation
        ("raw_inspection_data", text),      # original JSON
        ("data_hash", text),                # for duplicate detection
    )


def _raw_inspections_properties():
    """Return the property definitions for the RawInspections collection."""
    text = wvc.DataType.TEXT
    date = wvc.DataType.DATE
    boolean = wvc.DataType.BOOL

    return _properties(
        ("inspection_id", text),
        ("raw_data", text),
        ("data_hash", text),
        ("source_url", text),
        ("download_date", date),
        ("processed", boolean),
        ("processed_date", date),
        ("article_id", text),               # link to generated article
    )


def _tag_pages_properties():
    """Return the property definitions for the TagPages collection."""
    text = wvc.DataType.TEXT
    date = wvc.DataType.DATE

    return _properties(
        ("tag_slug", text),
        ("tag_name", text),
        ("tag_type", text),                 # location, chain, violation, etc
        ("overview_content", text),
        ("article_count", wvc.DataType.INT),
        ("created_date", date),
        ("last_updated", date),
        ("auto_generated", wvc.DataType.BOOL),
    )


def _pattern_stories_properties():
    """Return the property definitions for the PatternStories collection."""
    text = wvc.DataType.TEXT
    date = wvc.DataType.DATE

    return _properties(
        ("pattern_id", text),
        ("title", text),
        ("content", text),
        ("slug", text),
        ("pattern_type", text),             # chain, geographic, temporal
        ("article_ids", wvc.DataType.TEXT_ARRAY),  # articles in pattern
        ("significance_score", wvc.DataType.NUMBER),
        ("date_range_start", date),
        ("date_range_end", date),
        ("published_date", date),
        ("auto_generated", wvc.DataType.BOOL),
        ("location", text),
        ("image_url", text),
        ("meta_description", text),
    )


def update_articles_schema():
    """Recreate the Articles, RawInspections, TagPages and PatternStories collections.

    Connects with ``weaviate.connect_to_local()`` and, for each collection in
    turn, deletes it if it already exists and creates it again with the
    current property list. Progress is reported on stdout.

    WARNING: this is destructive. Existing collections are deleted before
    being recreated; per the Weaviate documentation, deleting a collection
    also removes the objects stored in it.

    The client connection is closed on exit, whether or not an error occurs.
    Any exception raised by the Weaviate client propagates to the caller.
    """
    client = weaviate.connect_to_local()

    try:
        # Comprehensive Articles collection
        _recreate_collection(
            client,
            "Articles",
            "Restaurant health inspection articles with comprehensive metadata",
            _articles_properties(),
            creating_message="Creating new Articles collection with full metadata...",
        )

        # RawInspections collection for storing unprocessed data
        _recreate_collection(
            client,
            "RawInspections",
            "Raw inspection data before processing",
            _raw_inspections_properties(),
        )

        # TagPages collection for auto-generated tag pages
        _recreate_collection(
            client,
            "TagPages",
            "Auto-generated tag pages",
            _tag_pages_properties(),
        )

        # PatternStories collection for investigative articles
        _recreate_collection(
            client,
            "PatternStories",
            "Auto-generated pattern investigation articles",
            _pattern_stories_properties(),
        )

        print("\n" + "="*60)
        print("WEAVIATE SCHEMA UPDATE COMPLETE")
        print("Collections created:")
        print("  - Articles (comprehensive metadata)")
        print("  - RawInspections (unprocessed data)")
        print("  - TagPages (auto-generated tag pages)")
        print("  - PatternStories (investigation articles)")
        print("="*60)

    finally:
        # Always release the connection, even if a step above failed.
        client.close()

# Run the schema update only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    update_articles_schema()