
    #iǆ                       d dl mZ d dlZd dlZd dlmZmZ d dlmZm	Z	 d dl
mZ d dlmZmZ d dlmZ d dlZd dlZd dlmZmZmZ d d	lmZ 	 d dlZd d
lmZ  e       rd dlmZ  ej@                  e!      Z"dZ#dZ$dhZ% G d de      Z& G d de      Z' G d d      Z( G d de(e      Z) G d de)      Z*d$dZ+	 	 	 	 	 	 	 	 d%dZ, G d de)      Z- G d de(ee      Z. G d d e.      Z/ G d! d"e.      Z0g d#Z1y# e$ r dZY w xY w)&    )annotationsN)ABCabstractmethod)defaultdictdeque)Iterator)
accumulatecycle)Any)BatchSamplerConcatDatasetSubsetRandomSampler)ExplicitEnum)is_datasets_available)Datasetl            l            dataset_namec                       e Zd ZdZdZdZdZdZy)BatchSamplersa   
    Stores the acceptable string identifiers for batch samplers.

    The batch sampler is responsible for determining how samples are grouped into batches during training.
    Valid options are:

    - ``BatchSamplers.BATCH_SAMPLER``: **[default]** Uses :class:`~sentence_transformers.base.sampler.DefaultBatchSampler`, the default
      PyTorch batch sampler.
    - ``BatchSamplers.NO_DUPLICATES``: Uses :class:`~sentence_transformers.sampler.NoDuplicatesBatchSampler`,
      ensuring no duplicate samples in a batch.
    - ``BatchSamplers.NO_DUPLICATES_HASHED``: Uses :class:`~sentence_transformers.sampler.NoDuplicatesBatchSampler`
      with ``precompute_hashes=True``, a variant that precomputes hashes for faster duplicate checks at a small memory cost.
      Requires the ``xxhash`` library to be installed.

      Both are recommended for losses that use in-batch negatives, such as:

        - :class:`~sentence_transformers.sentence_transformer.losses.MultipleNegativesRankingLoss`
        - :class:`~sentence_transformers.sentence_transformer.losses.CachedMultipleNegativesRankingLoss`
        - :class:`~sentence_transformers.sentence_transformer.losses.MultipleNegativesSymmetricRankingLoss`
        - :class:`~sentence_transformers.sentence_transformer.losses.CachedMultipleNegativesSymmetricRankingLoss`
        - :class:`~sentence_transformers.sentence_transformer.losses.MegaBatchMarginLoss`
        - :class:`~sentence_transformers.sentence_transformer.losses.GISTEmbedLoss`
        - :class:`~sentence_transformers.sentence_transformer.losses.CachedGISTEmbedLoss`
    - ``BatchSamplers.GROUP_BY_LABEL``: Uses :class:`~sentence_transformers.sampler.GroupByLabelBatchSampler`,
      which constructs each batch by drawing at least 2 samples from each of at least 2 distinct labels.
      This guarantees every batch contains multiple classes, which is required for in-batch triplet mining.
      Recommended for:

        - :class:`~sentence_transformers.sentence_transformer.losses.BatchAllTripletLoss`
        - :class:`~sentence_transformers.sentence_transformer.losses.BatchHardSoftMarginTripletLoss`
        - :class:`~sentence_transformers.sentence_transformer.losses.BatchHardTripletLoss`
        - :class:`~sentence_transformers.sentence_transformer.losses.BatchSemiHardTripletLoss`

    If you want to use a custom batch sampler, then you can subclass
    :class:`~sentence_transformers.base.sampler.DefaultBatchSampler` and pass the class (not an instance) to the
    ``batch_sampler`` argument in :class:`~sentence_transformers.sentence_transformer.training_args.SentenceTransformerTrainingArguments`
    (or :class:`~sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments`, etc.).
    Alternatively, you can pass a function that accepts ``dataset``, ``batch_size``, ``drop_last``,
    ``valid_label_columns``, ``generator``, and ``seed`` and returns a
    :class:`~sentence_transformers.base.sampler.DefaultBatchSampler` instance.

    Usage:
        ::

            from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
            from sentence_transformers.sentence_transformer.training_args import BatchSamplers
            from sentence_transformers.sentence_transformer.losses import MultipleNegativesRankingLoss
            from datasets import Dataset

            model = SentenceTransformer("microsoft/mpnet-base")
            train_dataset = Dataset.from_dict({
                "anchor": ["It's nice weather outside today.", "He drove to work."],
                "positive": ["It's so sunny.", "He took the car to the office."],
            })
            loss = MultipleNegativesRankingLoss(model)
            args = SentenceTransformerTrainingArguments(
                output_dir="checkpoints",
                batch_sampler=BatchSamplers.NO_DUPLICATES,
            )
            trainer = SentenceTransformerTrainer(
                model=model,
                args=args,
                train_dataset=train_dataset,
                loss=loss,
            )
            trainer.train()
    batch_samplerno_duplicatesno_duplicates_hashedgroup_by_labelN)__name__
__module____qualname____doc__BATCH_SAMPLERNO_DUPLICATESNO_DUPLICATES_HASHEDGROUP_BY_LABEL     s/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/sentence_transformers/base/sampler.pyr   r   !   s     BH $M#M1%Nr"   r   c                      e Zd ZdZdZdZy)MultiDatasetBatchSamplersa  
    Stores the acceptable string identifiers for multi-dataset batch samplers.

    The multi-dataset batch sampler is responsible for determining in what order batches are sampled from multiple
    datasets during training. Valid options are:

    - ``MultiDatasetBatchSamplers.ROUND_ROBIN``: Uses :class:`~sentence_transformers.base.sampler.RoundRobinBatchSampler`,
      which uses round-robin sampling from each dataset until one is exhausted.
      With this strategy, it's likely that not all samples from each dataset are used, but each dataset is sampled
      from equally.
    - ``MultiDatasetBatchSamplers.PROPORTIONAL``: **[default]** Uses :class:`~sentence_transformers.base.sampler.ProportionalBatchSampler`,
      which samples from each dataset in proportion to its size.
      With this strategy, all samples from each dataset are used and larger datasets are sampled from more frequently.

    If you want to use a custom multi-dataset batch sampler, then you can subclass
    :class:`~sentence_transformers.base.sampler.MultiDatasetDefaultBatchSampler` and pass the class (not an instance) to the
    ``multi_dataset_batch_sampler`` argument in :class:`~sentence_transformers.sentence_transformer.training_args.SentenceTransformerTrainingArguments`.
    (or :class:`~sentence_transformers.cross_encoder.training_args.CrossEncoderTrainingArguments`, etc.). Alternatively,
    you can pass a function that accepts ``dataset`` (a :class:`~torch.utils.data.ConcatDataset`), ``batch_samplers``
    (i.e. a list of batch sampler for each of the datasets in the :class:`~torch.utils.data.ConcatDataset`), ``generator``,
    and ``seed`` and returns a :class:`~sentence_transformers.base.sampler.MultiDatasetDefaultBatchSampler` instance.

    Usage:
        ::

            from sentence_transformers import SentenceTransformer, SentenceTransformerTrainer, SentenceTransformerTrainingArguments
            from sentence_transformers.sentence_transformer.training_args import MultiDatasetBatchSamplers
            from sentence_transformers.sentence_transformer.losses import CoSENTLoss
            from datasets import Dataset, DatasetDict

            model = SentenceTransformer("microsoft/mpnet-base")
            train_general = Dataset.from_dict({
                "sentence_A": ["It's nice weather outside today.", "He drove to work."],
                "sentence_B": ["It's so sunny.", "He took the car to the bank."],
                "score": [0.9, 0.4],
            })
            train_medical = Dataset.from_dict({
                "sentence_A": ["The patient has a fever.", "The doctor prescribed medication.", "The patient is sweating."],
                "sentence_B": ["The patient feels hot.", "The medication was given to the patient.", "The patient is perspiring."],
                "score": [0.8, 0.6, 0.7],
            })
            train_legal = Dataset.from_dict({
                "sentence_A": ["This contract is legally binding.", "The parties agree to the terms and conditions."],
                "sentence_B": ["Both parties acknowledge their obligations.", "By signing this agreement, the parties enter into a legal relationship."],
                "score": [0.7, 0.8],
            })
            train_dataset = DatasetDict({
                "general": train_general,
                "medical": train_medical,
                "legal": train_legal,
            })

            loss = CoSENTLoss(model)
            args = SentenceTransformerTrainingArguments(
                output_dir="checkpoints",
                multi_dataset_batch_sampler=MultiDatasetBatchSamplers.PROPORTIONAL,
            )
            trainer = SentenceTransformerTrainer(
                model=model,
                args=args,
                train_dataset=train_dataset,
                loss=loss,
            )
            trainer.train()
    round_robinproportionalN)r   r   r   r   ROUND_ROBINPROPORTIONALr!   r"   r#   r%   r%   l   s    @D  K!Lr"   r%   c                  ,     e Zd ZdZd fdZddZ xZS )SetEpochMixinz
    Required for a BatchSampler as the Trainer will call set_epoch on the BatchSampler at the beginning of each epoch.
    The BatchSampler can then set the generator seed accordingly.
    c                2    t        |   |i | d| _        y Nr   )super__init__epoch)selfargskwargs	__class__s      r#   r/   zSetEpochMixin.__init__   s    $)&)
r"   c                    || _         y N)r0   )r1   r0   s     r#   	set_epochzSetEpochMixin.set_epoch   s	    
r"   returnNone)r0   intr9   r:   )r   r   r   r   r/   r7   __classcell__r4   s   @r#   r+   r+      s    
r"   r+   c                  F     e Zd ZdZ	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZ xZS )DefaultBatchSampleraF  
    This sampler is the default batch sampler used in the SentenceTransformer library.
    It is equivalent to the PyTorch BatchSampler.

    Args:
        dataset (Dataset): The dataset to sample from.
        batch_size (int): Number of samples per batch.
        drop_last (bool): If True, drop the last incomplete batch if the dataset size
            is not divisible by the batch size.
        valid_label_columns (List[str], optional): List of column names to check for labels.
            The first column name from ``valid_label_columns`` found in the dataset will
            be used as the label column.
        generator (torch.Generator, optional): Optional random number generator for shuffling
            the indices.
        seed (int): Seed for the random number generator to ensure reproducibility. Defaults to 0.
    c                T    t         |   |||       || _        || _        || _        y )N
batch_size	drop_last)r.   r/   valid_label_columns	generatorseed)r1   datasetrB   rC   rD   rE   rF   r4   s          r#   r/   zDefaultBatchSampler.__init__   s/     	Z9M#6 "	r"   NNr   rG   r   rB   r;   rC   boolrD   list[str] | NonerE   torch.Generator | NonerF   r;   r9   r:   )r   r   r   r   r/   r<   r=   s   @r#   r?   r?      sa    , 15,0  	
 . *  
 r"   r?   c                  h     e Zd ZdZ	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZedd       Zd	dZd
dZ xZ	S )GroupByLabelBatchSamplera  
    Batch sampler that groups samples by label for in-batch triplet mining.

    Samples are shuffled within each label, then interleaved in round-robin
    fashion to produce a stream where labels are well-mixed. This stream is
    chunked into batches of exactly ``batch_size``. Every batch is guaranteed
    to contain multiple distinct labels, each with at least 2 samples.

    Labels take turns emitting 2 samples each. The stream stops when fewer
    than 2 labels remain, so the dominant label's tail ends up in the
    remainder. Produces excellent per-batch balance.

    Recommended for:
        - :class:`~sentence_transformers.sentence_transformer.losses.BatchAllTripletLoss`
        - :class:`~sentence_transformers.sentence_transformer.losses.BatchHardSoftMarginTripletLoss`
        - :class:`~sentence_transformers.sentence_transformer.losses.BatchHardTripletLoss`
        - :class:`~sentence_transformers.sentence_transformer.losses.BatchSemiHardTripletLoss`

    Args:
        dataset (Dataset): The dataset to sample from.
        batch_size (int): Number of samples per batch. Must be an even number >= 4.
        drop_last (bool): If True, drop the last incomplete batch if the dataset size
            is not divisible by the batch size.
        valid_label_columns (List[str], optional): List of column names to check for labels.
            The first column name from ``valid_label_columns`` found in the dataset will
            be used as the label column.
        generator (torch.Generator, optional): Optional random number generator for shuffling
            the indices.
        seed (int): Seed for the random number generator to ensure reproducibility. Defaults to 0.
    c           
        t         |   ||||||       || _        | j                  dk  s| j                  dz  dk(  rt	        d| j                   d      | j                  || j                        }t        t              }t        |      D ]  \  }	}
||
   j                  |	        |j                         D 
ci c])  \  }
}t        |      dk\  s|
|d t        |      dz  dz   + c}}
| _        t        | j                        dk  r"t	        dt        | j                         d      t        d	 | j                  j                         D        d
      }|d   dt!        fd|D              z  | _        y c c}}
w )NrB   rC   rD   rE   rF            z0batch_size must be an even number >= 4, but got .z^GroupByLabelBatchSampler requires at least 2 distinct labels with >= 2 samples each, but only z label(s) qualified.c              3  8   K   | ]  }t        |      d z    yw)rR   Nlen).0idxs     r#   	<genexpr>z4GroupByLabelBatchSampler.__init__.<locals>.<genexpr>)  s     F#CAFs   T)reversec              3  6   K   | ]  }t        |        y wr6   )min)rX   pcaps     r#   rZ   z4GroupByLabelBatchSampler.__init__.<locals>.<genexpr>+  s     %Aac!Sk%As   )r.   r/   rG   rB   
ValueError_determine_labels_to_userD   r   list	enumerateappenditemsrW   groupssortedvaluessum_stream_length)r1   rG   rB   rC   rD   rE   rF   labelsrf   
sample_idxlabelindicespairsr_   r4   s                @r#   r/   z!GroupByLabelBatchSampler.__init__  s    	! 3 	 	
 ??Q$//A"5":OPTP_P_O``abcc..w8P8PQ'24'8!*6!2 	-J5M  ,	- KQ,,.
8Fw\_`g\hlm\mE72S\Q.233
 t{{a,--AC  F1C1C1EFPTUAh#%A5%A"AA
s   >E=E=c                z    |xs g D ]  }|| j                   v s| |   c S  t        d| d| j                    d      )Nz None of the valid_label_columns z3 are in the dataset, which only has these columns: rT   )column_namesr`   )rG   rD   column_names      r#   ra   z1GroupByLabelBatchSampler._determine_labels_to_use-  sa    .4" 	,Kg222{++	, ./B.C D--4-A-A,B!E
 	
r"   c              #    K   | j                   r>| j                  2| j                   j                  | j                  | j                  z          i }| j                  j                         D ]F  \  }t        j                  t              | j                         }t        fd|D              ||<   H t        |      }g }t        |      dk\  rt        j                  t        |      | j                         D cg c]  }||   	 }}|D ]  }|j                  ||   j                                |j                  ||   j                                t        |      | j                  k\  s`|d | j                    || j                  d  } |D cg c]
  }||   s	| }}t        |      dk\  r| j                  st        |      dk\  r| y y y c c}w c c}w w)NrE   c              3  (   K   | ]	  }|     y wr6   r!   )rX   irn   s     r#   rZ   z4GroupByLabelBatchSampler.__iter__.<locals>.<genexpr>?  s     !;'!*!;s   rR   rQ   )rE   rF   manual_seedr0   rf   re   torchrandpermrW   r   rb   rd   popleftrB   rC   )r1   queuesrm   permremaining_labelsbatchrv   rn   s          @r#   __iter__z!GroupByLabelBatchSampler.__iter__7  s    >>dii3NN&&tyy4::'=> )+"kk//1 	<NE7>>#g,$..ID!!;d!;;F5M	<  <"#q(-2^^C@P<Q]a]k]k-l () #    * 5VE]2245VE]2245u:0 1$//22!$//"34E5 4DU%ve}UU "#q( ~~#e*/K #2~   Vs1   C7G:GA#G*&G
GGG0+Gc                    | j                   | j                  z  }| j                  s!| j                   | j                  z  dk\  r|dz  }|S )NrQ   rS   )rj   rB   rC   )r1   ns     r#   __len__z GroupByLabelBatchSampler.__len__U  sA    4??2~~$"5"5"G1"LFAr"   rH   rI   )rG   r   rD   rK   r9   z	list[Any]r9   zIterator[list[int]]r9   r;   )
r   r   r   r   r/   staticmethodra   r   r   r<   r=   s   @r#   rN   rN      s    H 15,0)B)B )B 	)B
 .)B *)B )B 
)BV 
 
<r"   rN   c                T    t        j                  |       }|t        k\  r	|t        z  }|S r6   )xxhashxxh64_intdigest_XXHASH_INT64_MAX_XXHASH_UINT64_MAX)valuehasheds     r#   _xxhash_int64r   \  s+    ##E*F""$$Mr"   c           	        |D cg c]	  }||vs| }}|rt        | |d            n+t        t        t        | j                               g             }|sdt	        |      D cg c]  }g  c}iS g }t	        |      D ]G  }g }	|D ]-  }| |   |   }
|	j                  t        t        |
                   / |j                  |	       I d|iS c c}w c c}w )Nr   __hashes)rW   nextiterrh   rangerd   r   str)r~   columnsexclude_columnscolumnactive_columnsrB   _hashesrow_idx
row_hashesr   s              r#   _hash_batchr   d  s    
 ,3TfO6SfTNT2@U>!,-.c$tTYT`T`TbOcegJhFiJz):;AR;<< F$ " "
$ 	9F&M'*E mCJ78		9
 	j!"  U <s   	CC$	Cc                  l     e Zd Z	 	 	 	 	 	 d	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d fdZddZddZd	dZ xZS )
NoDuplicatesBatchSamplerc
                   t         |   ||||||       t        |j                        t        | j                  xs g       z  x}
r|j                  t        |
            }|| _        || _        || _	        |	| _
        d| _        | j                  rWt        t        d      | j                  9t        j                         xs d}t!        dt#        d|dz
              }|| _	        yyy)aH
  
        This sampler creates batches such that each batch contains samples where the values are unique,
        even across columns. This is useful when losses consider other samples in a batch to be in-batch
        negatives, and you want to ensure that the negatives are not duplicates of the anchor/positive sample.

        Recommended for:
            - :class:`~sentence_transformers.sentence_transformer.losses.MultipleNegativesRankingLoss`
            - :class:`~sentence_transformers.sentence_transformer.losses.CachedMultipleNegativesRankingLoss`
            - :class:`~sentence_transformers.sentence_transformer.losses.MultipleNegativesSymmetricRankingLoss`
            - :class:`~sentence_transformers.sentence_transformer.losses.CachedMultipleNegativesSymmetricRankingLoss`
            - :class:`~sentence_transformers.sentence_transformer.losses.MegaBatchMarginLoss`
            - :class:`~sentence_transformers.sentence_transformer.losses.GISTEmbedLoss`
            - :class:`~sentence_transformers.sentence_transformer.losses.CachedGISTEmbedLoss`

        Args:
            dataset (Dataset): The dataset to sample from.
            batch_size (int): Number of samples per batch.
            drop_last (bool): If True, drop the last incomplete batch if the dataset size
                is not divisible by the batch size.
            valid_label_columns (List[str], optional): List of column names to check for labels.
                The first column name from ``valid_label_columns`` found in the dataset will
                be used as the label column.
            generator (torch.Generator, optional): Optional random number generator for shuffling
                the indices.
            seed (int): Seed for the random number generator to ensure reproducibility. Defaults to 0.
            precompute_hashes (bool, optional): If True, precompute xxhash 64-bit values for dataset
                fields using ``datasets.map`` to speed up duplicate checks. Requires ``xxhash`` to
                be installed and uses additional memory: in theory roughly
                ``len(dataset) * num_columns * 8`` bytes for the dense int64 hash matrix,
                although actual memory usage may therefore differ in practice. Defaults to False.
            precompute_num_proc (int, optional): Number of processes for hashing with ``datasets.map``.
                If set to ``None``, defaults to ``min(8, cpu_count - 1)`` when ``precompute_hashes``
                is True.
            precompute_batch_size (int, optional): Batch size for ``datasets.map`` hashing.
                Defaults to 1000.
        rP   NzlNoDuplicatesBatchSampler with precompute_hashes=True requires `xxhash`. Install `xxhash` to use this option.rS      )r.   r/   setrq   rD   remove_columnsrb   rG   precompute_hashesprecompute_num_procprecompute_batch_size_row_hashesr   ImportErroros	cpu_countmaxr]   )r1   rG   rB   rC   rD   rE   rF   r   r   r   label_columnsr   default_workersr4   s                r#   r/   z!NoDuplicatesBatchSampler.__init__z  s    ` 	! 3 	 	
   4 45D<T<T<ZXZ8[[[=[,,T--@AG!2#6 %:".2!!~!;  ''/LLN/a	"%aQ	A)>"?+:(	 0 "r"   c           
        | j                   r| j                  y t        | j                  j                        }| j                  j                  t        d| j                  | j                  ||t        dd      }dd l
}	 |j                  j                  d      }t        ||j                        r|j                         }t        ||j                   |j"                  f      st%        d      t'        |      }|dk(  r-t)        j*                  dt(        j,                  	      | _        	 ~y |j.                  j1                  d
      }t3        |d   |d   z
        }|dk  s+t)        j4                  t)        j6                  |      |k(        st%        d      |j8                  j1                  d
      j;                  t(        j,                  d
      }|j<                  ||z  k7  rt%        d      |j?                  ||f      | _        ~y # ~w xY w)NT)r   r   zHashing dataset values)batchedrB   num_procr   	fn_kwargsdescr   r   z)Expected a list column for hashed values.)r   r   dtypeF)zero_copy_onlyrS   z!Hashed rows have varying lengths.)copyz$Unexpected hashed value buffer size.) r   r   rb   rG   rq   mapr   r   r   _EXCLUDE_DATASET_COLUMNSpyarrowdatar   
isinstanceChunkedArraycombine_chunks	ListArrayLargeListArrayr`   rW   npzerosint64offsetsto_numpyr;   alldiffrh   astypesizereshape)	r1   r   hash_dspar   	row_countr   row_sizerh   s	            r#   _build_hashesz&NoDuplicatesBatchSampler._build_hashes  s    %%)9)9)Et||001 ,,""11--"")>VW) # 
 		\\((4F&"//2..0fr||R5F5F&GH !LMMFIA~#%88F"((#C   nn--U-CG71:
23H !|266"'''*:h*F#G !DEE]]++5+AHHX]H^F{{i(22 !GHH%~~y(.CDD s   ?B(H *CH H
c              #     K    j                   r> j                  2 j                   j                   j                   j                  z           j                  r# j                           j                  dfd}nd fd}dd}t         j                        }|dk(  ry|t        j                  t        j                        j                  k  rt        j                  nt        j                  }t        j                  | j                   |      j!                         }|dz   t        j                  t        j                        j                  k  rt        j                  nt        j                  }t        j"                  d|dz   |      }d	|d	<   d}|d	k7  rt%               }	g }
|}d	}d
}|d	k7  rt'        ||         }t'        ||         } ||      } |||	      r|}|}7|
j)                  |       |d	k(  r|}n|||<   t        |
       j*                  k(  rd}|
 n|	j-                  |       |}|d	k7  r|s j.                  s|
 |d	k7  ryyw)a5  
        Iterate over the remaining non-yielded indices. For each index, check if the sample values are already in the
        batch. If not, add the sample values to the batch keep going until the batch is full. If the batch is full, yield
        the batch indices and continue with the next batch.
        Nc                    |    S r6   r!   )indexr   s    r#   get_sample_valuesz<NoDuplicatesBatchSampler.__iter__.<locals>.get_sample_values   s    !%((r"   c                    j                   |    j                         D ch c]  \  }}|t        vst        |       c}}S c c}}w r6   )rG   re   r   r   )r   keyr   r1   s      r#   r   z<NoDuplicatesBatchSampler.__iter__.<locals>.get_sample_values  sC    15e1D1J1J1L#-3PS[sPsCJ  s
   AAc                p    t        | t              r| j                         S t        fd| D              S )Nc              3  &   K   | ]  }|v  
 y wr6   r!   )rX   r   batch_valuess     r#   rZ   zJNoDuplicatesBatchSampler.__iter__.<locals>._has_overlap.<locals>.<genexpr>  s     Hu,Hs   )r   r   
isdisjointany)sample_valuesr   s    `r#   _has_overlapz7NoDuplicatesBatchSampler.__iter__.<locals>._has_overlap
  s2    --(33LAAAH-HHHr"   r   )rE   r   rS   r   FT)r   r;   r9   set[str] | np.ndarray)r   r   r   zset[str | np.int64]r9   rJ   )rE   rF   rw   r0   r   r   r   rW   rG   r   iinfoint32r   rx   r   ry   numpyaranger   r;   rd   rB   updaterC   )r1   r   r   num_rowsindex_dtyperemaining_indicesposition_dtypenext_positionshead_positionr   batch_indicescurrent_positionprevious_position
full_batchnext_positionr   r   r   s   `                @r#   r   z!NoDuplicatesBatchSampler.__iter__  s     >>dii3NN&&tyy4::'=>!! %)%5%5J)

	I t||$q= &."((1C1G1G%GekkU[[!NN8t~~U`aggi &.\RXXbhh5G5K5K%KQSQYQY1hl.Irr!03L')M, "J"b( #N3C$D E-.>?@ 1% 8|<(8%'4$$$U+$*$1M8EN#45}%8!%J''##M2#0 - #b(0 ~~''C r!s   H7I<IIc                    | j                   r"t        | j                        | j                  z  S t        | j                        | j                  z   dz
  | j                  z  S )a_  Return the approximate number of batches.

        .. note::

            This is an upper-bound estimate. The actual number of batches
            yielded by :meth:`__iter__` may be smaller when the dataset
            contains many duplicate values, because those samples are
            deferred or skipped rather than placed into a batch.
        rS   )rC   rW   rG   rB   r1   s    r#   r   z NoDuplicatesBatchSampler.__len__C  sI     >>t||$77%7!;OOr"   )NNr   FNi  )rG   r   rB   r;   rC   rJ   rD   rK   rE   rL   rF   r;   r   rJ   r   z
int | Noner   r;   r9   r:   r8   r   r   )r   r   r   r/   r   r   r   r<   r=   s   @r#   r   r   y  s     15,0"'*.%)I;I; I; 	I;
 .I; *I; I;  I; (I;  #I; 
I;V,\N(`Pr"   r   c                  `     e Zd ZdZ	 	 d	 	 	 	 	 	 	 	 	 d fdZedd       Zedd       Z xZS )	MultiDatasetDefaultBatchSampleraT  
    Abstract base batch sampler that yields batches from multiple batch samplers.
    This class must be subclassed to implement specific sampling strategies, and
    cannot be used directly.

    Args:
        dataset (ConcatDataset): A concatenation of multiple datasets.
        batch_samplers (List[BatchSampler]): A list of batch samplers, one for each dataset in the ConcatDataset.
        generator (torch.Generator, optional): A generator for reproducible sampling. Defaults to None.
        seed (int): Seed for the random number generator to ensure reproducibility. Defaults to 0.
    c                    t        |j                        t        |      k7  rt        d      t        |   ||d   j
                  |d   j                         || _        || _        || _	        || _
        y )NzTThe number of batch samplers must match the number of datasets in the ConcatDataset.r   rA   )rW   datasetsr`   r.   r/   rB   rC   rG   batch_samplersrE   rF   )r1   rG   r   rE   rF   r4   s        r#   r/   z(MultiDatasetDefaultBatchSampler.__init__`  sr     w C$77stt^A->-I-IUcdeUfUpUpq,"	r"   c                     y)z?Yield batches from the underlying datasets in a specific order.Nr!   r   s    r#   r   z(MultiDatasetDefaultBatchSampler.__iter__o       	r"   c                     y)z,Return the number of batches in the sampler.Nr!   r   s    r#   r   z'MultiDatasetDefaultBatchSampler.__len__t  r   r"   r-   )
rG   r   r   zlist[BatchSampler]rE   rL   rF   r;   r9   r:   r   r   )	r   r   r   r   r/   r   r   r   r<   r=   s   @r#   r   r   S  sm    
  -1 + *	
  
    r"   r   c                       e Zd ZdZddZddZy)RoundRobinBatchSamplera  
    Batch sampler that yields batches in a round-robin fashion from multiple batch samplers, until one is exhausted.
    With this sampler, it's unlikely that all samples from each dataset are used, but we do ensure that each dataset
    is sampled from equally.

    Args:
        dataset (ConcatDataset): A concatenation of multiple datasets.
        batch_samplers (List[BatchSampler]): A list of batch samplers, one for each dataset in the ConcatDataset.
        generator (torch.Generator, optional): A generator for reproducible sampling. Defaults to None.
        seed (int): Seed for the random number generator to ensure reproducibility. Defaults to 0.
    c              #  <  K   | j                   r>| j                  2| j                   j                  | j                  | j                  z          | j                  j
                  D cg c]  }t        |       }}dgt        t        |            z   }| j                  D cg c]  }t        |       }}t        t        t        |                  D ])  }||   }	 t        ||         D cg c]  }||z   	 c} + y c c}w c c}w c c}w # t        $ r Y  y w xY wwr-   )rE   rF   rw   r0   rG   r   rW   rb   r	   r   r   r
   r   r   StopIteration)	r1   rG   num_samplessample_offsetssamplerr   dataset_idxsample_offsetrY   s	            r#   r   zRoundRobinBatchSampler.__iter__  s     >>dii3NN&&tyy4::'=>37<<3H3HIs7|IItJ{$;<<7;7J7JKG$w-KK s>':!;< 	K*;7M6:>+;V6WXss]*XX		 J L Y  sT   A#D%C=7(DD1'DD)D5D:DD	DDDDc                f    t        d | j                  D              t        | j                        z  S )Nc              3  2   K   | ]  }t        |        y wr6   rV   )rX   r   s     r#   rZ   z1RoundRobinBatchSampler.__len__.<locals>.<genexpr>  s     CG3w<Cs   )r]   r   rW   r   s    r#   r   zRoundRobinBatchSampler.__len__  s)    Ct/B/BCCc$J]J]F^^^r"   Nr   r   r   r   r   r   r   r   r!   r"   r#   r   r   z  s    
 _r"   r   c                       e Zd ZdZddZddZy)ProportionalBatchSamplera|  
    Batch sampler that samples from each dataset in proportion to its size, until all are exhausted simultaneously.
    With this sampler, all samples from each dataset are used and larger datasets are sampled from more frequently.

    Args:
        dataset (ConcatDataset): A concatenation of multiple datasets.
        batch_samplers (List[BatchSampler]): A list of batch samplers, one for each dataset in the ConcatDataset.
        generator (torch.Generator, optional): A generator for reproducible sampling. Defaults to None.
        seed (int): Seed for the random number generator to ensure reproducibility. Defaults to 0.
    c              #    K   | j                   r>| j                  2| j                   j                  | j                  | j                  z          | j                  j
                  D cg c]  }t        |       }}dgt        t        |            z   }| j                  D cg c]  }t        |       }}t        |      D cg c]  \  }}t        |      D ]  }|  }	}}}t        |	| j                         }
| j                  D cg c]  }t        |       }}|
D ])  }||   }	 t        ||         D cg c]  }||z   	 c} + y c c}w c c}w c c}}}w c c}w c c}w # t        $ r Y Sw xY ww)Nr   rt   )rE   rF   rw   r0   rG   r   rW   rb   r	   r   rc   r   r   r   r   r   )r1   rG   r   r   r   num_batchesrY   lengthr   dataset_indicesdataset_idx_samplerr   r   r   s                 r#   r   z!ProportionalBatchSampler.__iter__  sS    >>dii3NN&&tyy4::'=>37<<3H3HIs7|IItJ{$;<<373F3FGs7|GG2;K2H``;3RWX^R_`Q3`3``1/T^^\7;7J7JKG$w-KK. 	K*;7M6:>+;V6WXss]*XX	 J H` L Y  sl   A#E4%E
7(E4E1E4E)E4EE4&E%6E E%E4 E%%	E1.E40E11E4c                d    t        | j                  D cg c]  }t        |       c}      S c c}w r6   )ri   r   rW   )r1   r   s     r#   r   z ProportionalBatchSampler.__len__  s%    0C0CDWCLDEEDs   -Nr   r   r  r!   r"   r#   r  r    s    	&Fr"   r  )r?   rN   r   r   r   r  r   r%   )r   r   r9   r;   )r~   zdict[str, list[Any]]r   z	list[str]r   zset[str]r9   zdict[str, list[list[int]]])2
__future__r   loggingr   abcr   r   collectionsr   r   collections.abcr   	itertoolsr	   r
   typingr   r   r   rx   torch.utils.datar   r   r   transformers.utilsr   r   r   sentence_transformers.utilr   r   r   	getLoggerr   loggerr   r   r   r   r%   r+   r?   rN   r   r   r   r   r   r  __all__r!   r"   r#   <module>r     s;   "  	 # * $ '    M M + = 			8	$  *+ H&L H&VD" D"N - Bw2 wt  *3 FN  *WP2 WPt$m\3 $N_< _B F>  FF	Y  Fs   C6 6D ?D 