
    wfh?                        d dl mZ d dlZd dlZd dlmZmZmZ d dlZ	d dl
m
Z
 d dlmZ d dlmZ d dlmZ erd dlmZ  ej&                  e      Zed	   Zd
ddddddddddddd	Zdddddddddd d!d"d#d	Z G d$ d%e      Zy)&    )annotationsN)TYPE_CHECKINGCallableLiteral)tqdm)CrossEncoderRerankingEvaluator)SentenceEvaluator)is_datasets_available)CrossEncoder)climatefeverdbpediafeverfiqa2018hotpotqamsmarconfcorpusnqquoraretrievalscidocsarguanascifact
touche2020z+sentence-transformers/NanoClimateFEVER-bm25z&sentence-transformers/NanoDBPedia-bm25z$sentence-transformers/NanoFEVER-bm25z'sentence-transformers/NanoFiQA2018-bm25z'sentence-transformers/NanoHotpotQA-bm25z&sentence-transformers/NanoMSMARCO-bm25z'sentence-transformers/NanoNFCorpus-bm25z!sentence-transformers/NanoNQ-bm25z-sentence-transformers/NanoQuoraRetrieval-bm25z&sentence-transformers/NanoSCIDOCS-bm25z&sentence-transformers/NanoArguAna-bm25z&sentence-transformers/NanoSciFact-bm25z)sentence-transformers/NanoTouche2020-bm25ClimateFEVERDBPediaFEVERFiQA2018HotpotQAMSMARCONFCorpusNQQuoraRetrievalSCIDOCSArguAnaSciFact
Touche2020c            	           e Zd ZdZdddddddej
                  df		 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fd	Z	 d	 	 	 	 	 	 	 	 	 dd
ZddZddZ	d Z
d Z xZS )CrossEncoderNanoBEIREvaluatora  
    This class evaluates a CrossEncoder model on the NanoBEIR collection of Information Retrieval datasets.

    The collection is a set of datasets based on the BEIR collection, but with a significantly smaller size, so it can
    be used for quickly evaluating the retrieval performance of a model before committing to a full evaluation.
    The datasets are available on Hugging Face in the `NanoBEIR with BM25 collection <https://huggingface.co/collections/sentence-transformers/nanobeir-with-bm25-rankings-67bdcbc629f007c15bf358d8>`_.
    This evaluator will return the same metrics as the CrossEncoderRerankingEvaluator (i.e., MRR@k, nDCG@k, MAP), for each dataset and on average.

    Rather than reranking all documents for each query, the evaluator will only rerank the ``rerank_k`` documents from
    a BM25 ranking. When your logging is set to INFO, the evaluator will print the MAP, MRR@k, and nDCG@k for each dataset
    and the average over all datasets.

    Note that the maximum score is 1.0 by default, because all positive documents are included in the ranking. This
    can be toggled off by setting ``always_rerank_positives=False``, at which point the maximum score will be bound by
    the number of positive documents that BM25 ranks in the top ``rerank_k`` documents.
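
    For instance, a minimal sketch of the more realistic configuration (the dataset choice here is illustrative)::

        evaluator = CrossEncoderNanoBEIREvaluator(
            dataset_names=["msmarco"],
            # Only rerank what BM25 actually retrieved; the score ceiling may now be below 1.0
            always_rerank_positives=False,
        )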

    .. note::
        This evaluator outputs its results using keys in the format ``NanoBEIR_R{rerank_k}_{aggregate_key}_{metric}``,
        where ``metric`` is one of ``map``, ``mrr@{at_k}``, or ``ndcg@{at_k}``, and ``rerank_k``, ``aggregate_key`` and
        ``at_k`` are the parameters of the evaluator. The primary metric is ``ndcg@{at_k}``. By default, the name of
        the primary metric is ``NanoBEIR_R100_mean_ndcg@10``.

        For the results of each dataset, the keys are in the format ``Nano{dataset_name}_R{rerank_k}_{metric}``,
        for example ``NanoMSMARCO_R100_mrr@10``.

        These can be used as ``metric_for_best_model`` alongside ``load_best_model_at_end=True`` in the
        :class:`~sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments` to automatically load the
        best model based on a specific metric of interest.
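
        For example, a minimal sketch (the output directory is illustrative)::

            from sentence_transformers.cross_encoder.training_args import CrossEncoderTrainingArguments

            args = CrossEncoderTrainingArguments(
                output_dir="models/reranker",
                load_best_model_at_end=True,
                metric_for_best_model="NanoBEIR_R100_mean_ndcg@10",
            )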

    Args:
        dataset_names (List[str]): The names of the datasets to evaluate on. If not specified, all datasets except ``arguana`` and ``touche2020`` are used.
        rerank_k (int): The number of documents to rerank from the BM25 ranking. Defaults to 100.
        at_k (int, optional): Only consider the top k most similar documents to each query for the evaluation. Defaults to 10.
        always_rerank_positives (bool): If True, always evaluate with all positives included. If False, only include
            the positives that are already in the documents list. Always set to True if your ``samples`` contain ``negative``
            instead of ``documents``. When using ``documents``, setting this to True will result in a more useful evaluation
            signal, but setting it to False will result in a more realistic evaluation. Defaults to True.
        batch_size (int): Batch size used when scoring the query-document pairs. Defaults to 64.
        show_progress_bar (bool): Show a progress bar while evaluating. Defaults to False.
        write_csv (bool): Write results to CSV file. Defaults to True.
        aggregate_fn (Callable[[list[float]], float]): The function to aggregate the scores. Defaults to np.mean.
        aggregate_key (str): The key to use for the aggregated score. Defaults to "mean".

    Example:
        ::

            from sentence_transformers.cross_encoder import CrossEncoder
            from sentence_transformers.cross_encoder.evaluation import CrossEncoderNanoBEIREvaluator
            import logging

            logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(message)s")

            # Load a model
            model = CrossEncoder("cross-encoder/ms-marco-MiniLM-L6-v2")

            # Load & run the evaluator
            dataset_names = ["msmarco", "nfcorpus", "nq"]
            evaluator = CrossEncoderNanoBEIREvaluator(dataset_names)
            results = evaluator(model)
            '''
            NanoBEIR Evaluation of the model on ['msmarco', 'nfcorpus', 'nq'] dataset:
            Evaluating NanoMSMARCO
            CrossEncoderRerankingEvaluator: Evaluating the model on the NanoMSMARCO dataset:
                     Base  -> Reranked
            MAP:     48.96 -> 60.35
            MRR@10:  47.75 -> 59.63
            NDCG@10: 54.04 -> 66.86

            Evaluating NanoNFCorpus
            CrossEncoderRerankingEvaluator: Evaluating the model on the NanoNFCorpus dataset:
            Queries: 50   Positives: Min 1.0, Mean 50.4, Max 463.0        Negatives: Min 54.0, Mean 92.8, Max 100.0
                     Base  -> Reranked
            MAP:     26.10 -> 34.61
            MRR@10:  49.98 -> 58.85
            NDCG@10: 32.50 -> 39.30

            Evaluating NanoNQ
            CrossEncoderRerankingEvaluator: Evaluating the model on the NanoNQ dataset:
            Queries: 50   Positives: Min 1.0, Mean 1.1, Max 2.0   Negatives: Min 98.0, Mean 99.0, Max 100.0
                     Base  -> Reranked
            MAP:     41.96 -> 70.98
            MRR@10:  42.67 -> 73.55
            NDCG@10: 50.06 -> 75.99

            CrossEncoderNanoBEIREvaluator: Aggregated Results:
                     Base  -> Reranked
            MAP:     39.01 -> 55.31
            MRR@10:  46.80 -> 64.01
            NDCG@10: 45.54 -> 60.72
            '''
            print(evaluator.primary_metric)
            # NanoBEIR_R100_mean_ndcg@10
            print(results[evaluator.primary_metric])
            # 0.60716840988382
    """

    def __init__(
        self,
        dataset_names: list[DatasetNameType] | None = None,
        rerank_k: int = 100,
        at_k: int = 10,
        always_rerank_positives: bool = True,
        batch_size: int = 64,
        show_progress_bar: bool = False,
        write_csv: bool = True,
        aggregate_fn: Callable[[list[float]], float] = np.mean,
        aggregate_key: str = "mean",
    ):
        super().__init__()
        if dataset_names is None:
            # By default, evaluate on all datasets except arguana and touche2020
            dataset_names = [key for key in dataset_name_to_id if key not in ("arguana", "touche2020")]
        self.dataset_names = dataset_names
        self.rerank_k = rerank_k
        self.at_k = at_k
        self.always_rerank_positives = always_rerank_positives
        self.show_progress_bar = show_progress_bar
        self.batch_size = batch_size
        self.write_csv = write_csv
        self.aggregate_fn = aggregate_fn
        self.aggregate_key = aggregate_key
        self.name = f"NanoBEIR_R{rerank_k}_{self.aggregate_key}"

        self._validate_dataset_names()

        reranking_kwargs = {
            "at_k": self.at_k,
            "always_rerank_positives": self.always_rerank_positives,
            "show_progress_bar": self.show_progress_bar,
            "batch_size": self.batch_size,
            "write_csv": self.write_csv,
        }

        self.evaluators = [
            self._load_dataset(name, **reranking_kwargs)
            for name in tqdm(self.dataset_names, desc="Loading NanoBEIR datasets", leave=False)
        ]
        self.csv_file = f"NanoBEIR_evaluation_{aggregate_key}_results.csv"
        self.csv_headers = ["epoch", "steps", "MAP", f"MRR@{self.at_k}", f"NDCG@{self.at_k}"]
        self.primary_metric = f"ndcg@{self.at_k}"

    def __call__(
        self, model: CrossEncoder, output_path: str | None = None, epoch: int = -1, steps: int = -1, *args, **kwargs
    ) -> dict[str, float]:
        per_metric_results = {}
        per_dataset_results = {}
        if epoch != -1:
            if steps == -1:
                out_txt = f" after epoch {epoch}"
            else:
                out_txt = f" in epoch {epoch} after {steps} steps"
        else:
            out_txt = ""
        logger.info(f"NanoBEIR Evaluation of the model on {self.dataset_names} dataset{out_txt}:")

        for evaluator in tqdm(self.evaluators, desc="Evaluating datasets", disable=not self.show_progress_bar):
            logger.info(f"Evaluating {evaluator.name}")
            evaluation = evaluator(model, output_path, epoch, steps)
            for key in evaluation:
                # Keys from the sub-evaluators look like "NanoMSMARCO_R100_map"
                dataset, _rerank_k, metric = key.split("_", maxsplit=2)
                if metric not in per_metric_results:
                    per_metric_results[metric] = []
                per_dataset_results[f"{dataset}_R{self.rerank_k}_{metric}"] = evaluation[key]
                per_metric_results[metric].append(evaluation[key])
            logger.info("")

        agg_results = {}
        for metric in per_metric_results:
            agg_results[metric] = self.aggregate_fn(per_metric_results[metric])

        if output_path is not None and self.write_csv:
            csv_path = os.path.join(output_path, self.csv_file)
            if not os.path.isfile(csv_path):
                fOut = open(csv_path, mode="w", encoding="utf-8")
                fOut.write(",".join(self.csv_headers))
                fOut.write("\n")
            else:
                fOut = open(csv_path, mode="a", encoding="utf-8")
            output_data = [
                epoch,
                steps,
                agg_results["map"],
                agg_results[f"mrr@{self.at_k}"],
                agg_results[f"ndcg@{self.at_k}"],
            ]
            fOut.write(",".join(map(str, output_data)))
            fOut.write("\n")
            fOut.close()

        logger.info("CrossEncoderNanoBEIREvaluator: Aggregated Results:")
        logger.info(f"{' ' * len(str(self.at_k))}       Base  -> Reranked")
        logger.info(
            f"MAP:{' ' * len(str(self.at_k))}   {agg_results['base_map'] * 100:.2f} -> {agg_results['map'] * 100:.2f}"
        )
        logger.info(
            f"MRR@{self.at_k}:  {agg_results[f'base_mrr@{self.at_k}'] * 100:.2f}"
            f" -> {agg_results[f'mrr@{self.at_k}'] * 100:.2f}"
        )
        logger.info(
            f"NDCG@{self.at_k}: {agg_results[f'base_ndcg@{self.at_k}'] * 100:.2f}"
            f" -> {agg_results[f'ndcg@{self.at_k}'] * 100:.2f}"
        )

        # Store the aggregated metrics (with the improvement over the BM25 baseline) in the model card
        model_card_metrics = {
            "map": f"{agg_results['map']:.4f} ({agg_results['map'] - agg_results['base_map']:+.4f})",
            f"mrr@{self.at_k}": f"{agg_results[f'mrr@{self.at_k}']:.4f} ({agg_results[f'mrr@{self.at_k}'] - agg_results[f'base_mrr@{self.at_k}']:+.4f})",
            f"ndcg@{self.at_k}": f"{agg_results[f'ndcg@{self.at_k}']:.4f} ({agg_results[f'ndcg@{self.at_k}'] - agg_results[f'base_ndcg@{self.at_k}']:+.4f})",
        }
        model_card_metrics = self.prefix_name_to_metrics(model_card_metrics, self.name)
        self.store_metrics_in_model_card_data(model, model_card_metrics, epoch, steps)

        agg_results = self.prefix_name_to_metrics(agg_results, self.name)
        per_dataset_results.update(agg_results)
        return per_dataset_results

    def _get_human_readable_name(self, dataset_name: DatasetNameType) -> str:
        human_readable_name = f"Nano{dataset_name_to_human_readable[dataset_name.lower()]}_R{self.rerank_k}"
        return human_readable_name

    def _load_dataset(self, dataset_name: DatasetNameType, **ir_evaluator_kwargs) -> CrossEncoderRerankingEvaluator:
        if not is_datasets_available():
            raise ValueError(
                "datasets is not available. Please install it to use the CrossEncoderNanoBEIREvaluator via `pip install datasets`."
            )
        from datasets import load_dataset

        dataset_path = dataset_name_to_id[dataset_name.lower()]
        corpus = load_dataset(dataset_path, "corpus", split="train")
        corpus_mapping = dict(zip(corpus["_id"], corpus["text"]))
        queries = load_dataset(dataset_path, "queries", split="train")
        query_mapping = dict(zip(queries["_id"], queries["text"]))
        dataset = load_dataset(dataset_path, "relevance", split="train")

        def mapper(
            sample: dict[str, str], corpus_mapping: dict[str, str], query_mapping: dict[str, str], rerank_k: int
        ):
            query = query_mapping[sample["query-id"]]
            positives = [corpus_mapping[positive_id] for positive_id in sample["positive-corpus-ids"]]
            # Only keep the top rerank_k documents from the precomputed BM25 ranking
            documents = [corpus_mapping[document_id] for document_id in sample["bm25-ranked-ids"][:rerank_k]]
            return {
                "query": query,
                "positive": positives,
                "documents": documents,
            }

        dataset = dataset.map(
            mapper,
            fn_kwargs={"corpus_mapping": corpus_mapping, "query_mapping": query_mapping, "rerank_k": self.rerank_k},
        )

        human_readable_name = self._get_human_readable_name(dataset_name)
        return CrossEncoderRerankingEvaluator(
            samples=list(dataset),
            name=human_readable_name,
            **ir_evaluator_kwargs,
        )

    def _validate_dataset_names(self):
        if len(self.dataset_names) == 0:
            raise ValueError("dataset_names cannot be empty. Use None to evaluate on all datasets.")
        if missing_datasets := [
            dataset_name for dataset_name in self.dataset_names if dataset_name.lower() not in dataset_name_to_id
        ]:
            raise ValueError(
                f"Dataset(s) {missing_datasets} not found in the NanoBEIR collection. "
                f"Valid dataset names are: {list(dataset_name_to_id.keys())}"
            )

    def get_config_dict(self) -> dict[str, Any]:
        return {
            "dataset_names": self.dataset_names,
            "rerank_k": self.rerank_k,
            "at_k": self.at_k,
            "always_rerank_positives": self.always_rerank_positives,
        }