
    wfh
                    p    d dl mZ d dlmZ d dlmZmZ d dlmZm	Z	 d dl
Z
d dlmZ e G d de             Zy)	    )annotations)
Collection)	dataclassfield)AnyCallableN)SentenceTransformerDataCollatorc                  b    e Zd ZU dZded<    ed       Zded<    eedd	      Zd
ed<   ddZ	y)CrossEncoderDataCollatora  Collator for a CrossEncoder model.
    This encodes the text columns to {column}_input_ids and {column}_attention_mask columns.
    This works with the two text dataset that is used as the example in the training overview:
    https://www.sbert.net/docs/sentence_transformer/training_overview.html

    It is important that the columns are in the expected order. For example, if your dataset has columns
    "answer", "question" in that order, then the MultipleNegativesRankingLoss will consider
    "answer" as the anchor and "question" as the positive, and it will (unexpectedly) optimize for
    "given the answer, what is the question?".
    r   tokenize_fnc                 
    g dS )N)labellabelsscorescores r       o/home/chris/cleankitchens-env/lib/python3.12/site-packages/sentence_transformers/cross_encoder/data_collator.py<lambda>z!CrossEncoderDataCollator.<lambda>   s    Ci r   )default_factoryz	list[str]valid_label_columnsF)r   initreprzset[tuple[str]]_warned_columnsc                   t        |d   j                               }i }d|v r|j                  d       |d   d   |d<   | j                  D ]  }||v st	        |d   |   t
              r*|D cg c]  }t        j                  ||          c}|d<   n)t        j                  |D cg c]  }||   	 c}      |d<   |j                  |        n |D ]t  }|j                  d      rK|d t        d        |v r:t        j                  |D cg c]  }||   	 c}t        j                        ||<   _|D cg c]  }||   	 c}||<   v |S c c}w c c}w c c}w c c}w )Nr   dataset_namer   _prompt_length)dtype)listkeysremover   
isinstancer   torchtensorendswithlenint)selffeaturescolumn_namesbatchlabel_columnrowcolumn_names          r   __call__z!CrossEncoderDataCollator.__call__   sv   HQK,,./ \)/$,QK$?E.! !44 		L|+hqk,7DQY%Z#ell3|3D&E%ZE'N &+\\PX2Y3|3D2Y%ZE'N##L1		 ( 	HK##$45+F^M]I^H^:_co:o%*\\x2X3{3C2X`e`i`i%jk">F!Gs#k"2!GE+	H  &[ 3Z 3Y "Hs   -E&E

E
5EN)r)   zlist[dict[str, Any]]returnzdict[str, torch.Tensor])
__name__
__module____qualname____doc____annotations__r   r   setr   r/   r   r   r   r   r      s9    	 %*;i%jj',SuSX'YO_Yr   r   )
__future__r   collections.abcr   dataclassesr   r   typingr   r   r#   #sentence_transformers.data_collatorr	   r   r   r   r   <module>r<      s3    " & (    O .> . .r   