
    wfh#                        d dl mZ d dlZd dlZd dlmZ d dlmZmZ d dl	m
Z
 d dlmZ erd dlZd dlmZ d dlmZ  ej&                  e      Z G d	 d
e
      Zy)    )annotationsN)defaultdict)TYPE_CHECKINGAny)MSEEvaluator)append_to_last_row)Tensor)SparseEncoderc                       e Zd ZdZ	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ	 	 	 d		 	 	 	 	 	 	 	 	 d
 fdZ	 	 	 	 	 	 ddZ	 	 d	 	 	 	 	 	 	 	 	 ddZd fdZ xZ	S )SparseMSEEvaluatora[  
    This evaluator extends :class:`~sentence_transformers.evaluation.MSEEvaluator` but is specifically designed for sparse encoder models.

    Note that this evaluator doesn't take benefit of the sparse tensor torch representation yet, so memory issues may occur.

    Computes the mean squared error (x100) between the computed sentence embedding
    and some target sentence embedding.

    The MSE is computed between ``||teacher.encode(source_sentences) - student.encode(target_sentences)||``.

    For multilingual knowledge distillation (https://arxiv.org/abs/2004.09813), source_sentences are in English
    and target_sentences are in a different language like German, Chinese, Spanish...

    Args:
        source_sentences (List[str]): Source sentences to embed with the teacher model.
        target_sentences (List[str]): Target sentences to embed with the student model.
        teacher_model (SparseEncoder, optional): The teacher model to compute the source sentence embeddings.
        show_progress_bar (bool, optional): Show progress bar when computing embeddings. Defaults to False.
        batch_size (int, optional): Batch size to compute sentence embeddings. Defaults to 32.
        name (str, optional): Name of the evaluator. Defaults to "".
        write_csv (bool, optional): Write results to CSV file. Defaults to True.
        max_active_dims (Optional[int], optional): The maximum number of active dimensions to use.
            `None` uses the model's current `max_active_dims`. Defaults to None.

    Example:
        ::

            import logging

            from datasets import load_dataset

            from sentence_transformers import SparseEncoder
            from sentence_transformers.sparse_encoder.evaluation import SparseMSEEvaluator

            logging.basicConfig(format="%(message)s", level=logging.INFO)

            # Load a model
            student_model = SparseEncoder("prithivida/Splade_PP_en_v1")
            teacher_model = SparseEncoder("naver/splade-cocondenser-ensembledistil")

            # Load any dataset with some texts
            dataset = load_dataset("sentence-transformers/stsb", split="validation")
            sentences = dataset["sentence1"] + dataset["sentence2"]

            # Given queries, a corpus and a mapping with relevant documents, the SparseMSEEvaluator computes different MSE metrics.
            mse_evaluator = SparseMSEEvaluator(
                source_sentences=sentences,
                target_sentences=sentences,
                teacher_model=teacher_model,
                name="stsb-dev",
            )
            results = mse_evaluator(student_model)
            '''
            MSE evaluation (lower = better) on the stsb-dev dataset:
            MSE (*100):     0.034905
            Model Sparsity: Active Dimensions: 54.6, Sparsity Ratio: 0.9982
            '''
            # Print the results
            print(f"Primary metric: {mse_evaluator.primary_metric}")
            # => Primary metric: stsb-dev_negative_mse
            print(f"Primary metric value: {results[mse_evaluator.primary_metric]:.4f}")
            # => Primary metric value: -0.0349
    c	           	         || _         t        t              | _        t        	|   |||||||       | j                  j                  ddg       t        j                  d       y )N)source_sentencestarget_sentencesteacher_modelshow_progress_bar
batch_sizename	write_csvactive_dimssparsity_ratiozhThe SparseMSEEvaluator is not handling the mse compute with sparse tensors yet. Memory issues may occur.)
max_active_dimsr   listsparsity_statssuper__init__csv_headersextendloggerwarning)
selfr   r   r   r   r   r   r   r   	__class__s
            /home/chris/cleankitchens-env/lib/python3.12/site-packages/sentence_transformers/sparse_encoder/evaluation/SparseMSEEvaluator.pyr   zSparseMSEEvaluator.__init__U   sl      /)$/--'/! 	 	
 	0@ ABv	
    c                   t        t              | _        t        |   ||||      }| j                  j                         D ])  \  }}t        |      t        |      z  | j                  |<   + |j                  | j                  | j                  | j                               | j                  ||||       t        j                  d| j                  d   dd| j                  d   d       |\| j                  rPt        t         j"                  j%                  || j&                        | j                  d   | j                  d   g       |S )N)modeloutput_pathepochstepsz#Model Sparsity: Active Dimensions: r   z.1fz, Sparsity Ratio: r   z.4f)r   r   r   r   __call__itemssumlenupdateprefix_name_to_metricsr    store_metrics_in_model_card_datar   infor   r   ospathjoincsv_file)	r    r%   r&   r'   r(   metricskeyvaluer!   s	           r"   r)   zSparseMSEEvaluator.__call__p   sE    *$/'"Ku\a"b--335 	?JC'*5zCJ'>D$	? 	t2243F3F		RS--eWeUK1$2E2Em2TUX1YYklpll  AQ  mR  SV  lW  X	
 "t~~[$--8$$]3T5H5HIY5Z[
 r#   c           	         |j                   |f| j                  | j                  dd| j                  d|}|j	                  |      }|j                         D ]#  \  }}| j                  |   j                  |       % |S )NFT)r   r   convert_to_sparse_tensorsave_to_cpur   )encoder   r   r   sparsityr*   r   append)r    r%   	sentenceskwargs
embeddingsstatr6   r7   s           r"   embed_inputszSparseMSEEvaluator.embed_inputs   s     "U\\
"44%* 00
 

 ~~j)**, 	3JC$++E2	3r#   c                B    |j                   j                  | |||       y )N)r'   step)model_card_dataset_evaluation_metrics)r    r%   r5   r'   rD   s        r"   r/   z3SparseMSEEvaluator.store_metrics_in_model_card_data   s"     	44T7%VZ4[r#   c                Z    t         |          }| j                  | j                  |d<   |S )Nr   )r   get_config_dictr   )r    config_dictr!   s     r"   rH   z"SparseMSEEvaluator.get_config_dict   s3    g-/+-1-A-AK)*r#   )NF     TN)r   	list[str]r   rL   r   boolr   intr   strr   rM   r   z
int | None)NrP   )
r%   r
   r&   z
str | Noner'   rN   r(   rN   returnzdict[str, float])r%   r
   r>   zstr | list[str] | np.ndarrayrQ   r	   )r   r   )
r%   r
   r5   dict[str, Any]r'   rN   rD   rN   rQ   None)rQ   rR   )
__name__
__module____qualname____doc__r   r)   rB   r/   rH   __classcell__)r!   s   @r"   r   r      s   >H "'&*
#
 $

  
 
 
 
 $
< #'   	
  
2 0
 
0 \\  \ 	\
 \ 
\ r#   r   )
__future__r   loggingr1   collectionsr   typingr   r    sentence_transformers.evaluationr   sentence_transformers.utilr   numpynptorchr	   $sentence_transformers.sparse_encoderr
   	getLoggerrT   r   r    r#   r"   <module>re      sF    "  	 # % 9 9B			8	$V Vr#   