
    wfh                        d dl mZ d dlZd dlZd dlmZ d dlmZmZm	Z	 d dl
mZ d dlmZ erd dlZd dlmZ d dlmZ d d	lmZ  ej,                  e      Z G d
 de      Zy)    )annotationsN)defaultdict)TYPE_CHECKINGAnyLiteral)EmbeddingSimilarityEvaluator)append_to_last_row)Tensor)SimilarityFunction)SparseEncoderc                       e Zd ZdZ	 	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d	 fdZd
 fdZ	 d	 	 	 	 	 	 	 	 	 d fdZ	 	 	 	 	 	 ddZ	 d	 	 	 	 	 	 	 	 	 ddZd fdZ	 xZ
S )"SparseEmbeddingSimilarityEvaluatora  
    This evaluator extends :class:`~sentence_transformers.evaluation.EmbeddingSimilarityEvaluator` but is specifically designed for sparse encoder models.

    Evaluate a model based on the similarity of the embeddings by calculating the Spearman and Pearson rank correlation
    in comparison to the gold standard labels.
    The metrics are the cosine similarity as well as euclidean and Manhattan distance
    The returned score is the Spearman correlation with a specified metric.

    Args:
        sentences1 (List[str]): List with the first sentence in a pair.
        sentences2 (List[str]): List with the second sentence in a pair.
        scores (List[float]): Similarity score between sentences1[i] and sentences2[i].
        batch_size (int, optional): The batch size for processing the sentences. Defaults to 16.
        main_similarity (Optional[Union[str, SimilarityFunction]], optional): The main similarity function to use.
            Can be a string (e.g. "cosine", "dot") or a SimilarityFunction object. Defaults to None.
        similarity_fn_names (List[str], optional): List of similarity function names to use. If None, the
            ``similarity_fn_name`` attribute of the model is used. Defaults to None.
        name (str, optional): The name of the evaluator. Defaults to "".
        show_progress_bar (bool, optional): Whether to show a progress bar during evaluation. Defaults to False.
        write_csv (bool, optional): Whether to write the evaluation results to a CSV file. Defaults to True.
        max_active_dims (Optional[int], optional): The maximum number of active dimensions to use.
            `None` uses the model's current `max_active_dims`. Defaults to None.

    Example:
        ::

            import logging

            from datasets import load_dataset

            from sentence_transformers import SparseEncoder, SimilarityFunction
            from sentence_transformers.sparse_encoder.evaluation import SparseEmbeddingSimilarityEvaluator

            logging.basicConfig(format="%(message)s", level=logging.INFO)

            # Load a model
            model = SparseEncoder("naver/splade-cocondenser-ensembledistil")

            # Load the STSB dataset (https://huggingface.co/datasets/sentence-transformers/stsb)
            eval_dataset = load_dataset("sentence-transformers/stsb", split="validation")

            # Initialize the evaluator
            dev_evaluator = SparseEmbeddingSimilarityEvaluator(
                sentences1=eval_dataset["sentence1"],
                sentences2=eval_dataset["sentence2"],
                scores=eval_dataset["score"],
                main_similarity=SimilarityFunction.COSINE, # even though the model is trained with dot, we need to set it to cosine for evaluation as the score in the dataset is cosine similarity
                name="sts_dev",
            )
            results = dev_evaluator(model)
            '''
            EmbeddingSimilarityEvaluator: Evaluating the model on the sts_dev dataset:
            Cosine-Similarity:      Pearson: 0.8429 Spearman: 0.8366
            Model Sparsity: Active Dimensions: 78.3, Sparsity Ratio: 0.9974
            '''
            # Print the results
            print(f"Primary metric: {dev_evaluator.primary_metric}")
            # => Primary metric: sts_dev_spearman_cosine
            print(f"Primary metric value: {results[dev_evaluator.primary_metric]:.4f}")
            # => Primary metric value: 0.8366

    c                l    |
| _         t        t              | _        t        |   |||||||||	d 
      S )N)

sentences1
sentences2scores
batch_sizemain_similaritysimilarity_fn_namesnameshow_progress_bar	write_csv	precision)max_active_dimsr   listsparsity_statssuper__init__)selfr   r   r   r   r   r   r   r   r   r   	__class__s              /home/chris/cleankitchens-env/lib/python3.12/site-packages/sentence_transformers/sparse_encoder/evaluation/SparseEmbeddingSimilarityEvaluator.pyr   z+SparseEmbeddingSimilarityEvaluator.__init__V   sO      /)$/w!!!+ 3/   
 	
    c                ^    t         |   |       | j                  j                  ddg       y )Nactive_dimssparsity_ratio)r   _append_csv_headerscsv_headersextend)r   r   r    s     r!   r&   z6SparseEmbeddingSimilarityEvaluator._append_csv_headersr   s+    #$780@ ABr"   c                   t        t              | _        t        |   ||||      }| j                  j                         D ])  \  }}t        |      t        |      z  | j                  |<   + |j                  | j                  | j                  | j                               | j                  ||||       t        j                  d| j                  d   dd| j                  d   d       |\| j                  rPt        t         j"                  j%                  || j&                        | j                  d   | j                  d   g       |S )N)modeloutput_pathepochstepsz#Model Sparsity: Active Dimensions: r$   z.1fz, Sparsity Ratio: r%   z.4f)r   r   r   r   __call__itemssumlenupdateprefix_name_to_metricsr    store_metrics_in_model_card_dataloggerinfor   r	   ospathjoincsv_file)	r   r*   r+   r,   r-   metricskeyvaluer    s	           r!   r.   z+SparseEmbeddingSimilarityEvaluator.__call__v   sE    *$/'"Ku\a"b--335 	?JC'*5zCJ'>D$	? 	t2243F3F		RS--eWeUK1$2E2Em2TUX1YYklpll  AQ  mR  SV  lW  X	
 "t~~[$--8$$]3T5H5HIY5Z[
 r"   c           	         |j                   |f| j                  | j                  dd| j                  d|}|j	                  |      }|j                         D ]#  \  }}| j                  |   j                  |       % |S )NT)r   r   convert_to_sparse_tensorsave_to_cpur   )encoder   r   r   sparsityr/   r   append)r   r*   	sentenceskwargs
embeddingsstatr<   r=   s           r!   embed_inputsz/SparseEmbeddingSimilarityEvaluator.embed_inputs   s     "U\\
"44%) 00
 

 ~~j)**, 	3JC$++E2	3r"   c                B    |j                   j                  | |||       y )N)r,   step)model_card_dataset_evaluation_metrics)r   r*   r;   r,   rJ   s        r!   r4   zCSparseEmbeddingSimilarityEvaluator.store_metrics_in_model_card_data   s"     	44T7%VZ4[r"   c                Z    t         |          }| j                  | j                  |d<   |S )Nr   )r   get_config_dictr   )r   config_dictr    s     r!   rN   z2SparseEmbeddingSimilarityEvaluator.get_config_dict   s3    g-/+-1-A-AK)*r"   )   NN FTN)r   	list[str]r   rR   r   zlist[float]r   intr   zstr | SimilarityFunction | Noner   z?list[Literal['cosine', 'euclidean', 'manhattan', 'dot']] | Noner   strr   boolr   rU   r   z
int | None)r   rR   returnNone)NrX   )
r*   r   r+   z
str | Noner,   rS   r-   rS   rV   zdict[str, float])r*   r   rD   zstr | list[str] | np.ndarrayrV   r
   )r   r   )
r*   r   r;   dict[str, Any]r,   rS   rJ   rS   rV   rW   )rV   rY   )__name__
__module____qualname____doc__r   r&   r.   rH   r4   rN   __classcell__)r    s   @r!   r   r      s'   =H ;?_c"'&*

 
 	

 
 9
 ]
 
  
 
 $
8C
 ce"1;KN\_	* 0
 
* Z[\"\-;\DG\SV\	\
 r"   r   )
__future__r   loggingr7   collectionsr   typingr   r   r    sentence_transformers.evaluationr   sentence_transformers.utilr	   numpynptorchr
   *sentence_transformers.similarity_functionsr   2sentence_transformers.sparse_encoder.SparseEncoderr   	getLoggerrZ   r5   r    r"   r!   <module>rl      sO    "  	 # . . I 9MP 
		8	$R)E Rr"   