
    i#                        d dl Z d dlmZ d dlmZmZmZmZmZm	Z	m
Z
mZmZ ddlmZ ee
e   ef   Z ed      Zeee   geee      f   Zddded	ed
ede	eegef      def
dZdddeded
ede	eegef      def
dZ	 ddede	eegef      defdZddefdee   ded	ed
ededeee      fdZddefdee   dedeee      fdZefde
e   dedeee      fdZy)    N)partial)	AnyCallableIterableIteratorListOptionalSequenceTypeVarUnion   )	minibatchItemT)
get_lengthsizebufferdiscard_oversizer   returnc                 :    |d|ini }t        t        f| ||d|S )a  Create a batcher that uses the `batch_by_padded_size` strategy.

    The padded size is defined as the maximum length of sequences within the
    batch multiplied by the number of sequences in the batch.

    size (int or Sequence[int]): The largest padded size to batch sequences into.
        Can be a single integer, or a sequence, allowing for variable batch sizes.
    buffer (int): The number of sequences to accumulate before sorting by length.
        A larger buffer will result in more even sizing, but if the buffer is
        very large, the iteration order will be less random, which can result
        in suboptimal training.
    discard_oversize (bool): Whether to discard sequences that are by themselves
        longer than the largest padded batch size.
    get_length (Callable or None): Function to get the length of a sequence item.
        The `len` function is used by default.
    r   )r   r   r   )r   minibatch_by_padded_size)r   r   r   r   	optionalss        h/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/spacy/training/batchers.py"configure_minibatch_by_padded_sizer      s=    0 /9.Dz*"I )	
      	tolerancec                 :    |d|ini }t        t        f| ||d|S )a  Create a batcher that uses the "minibatch by words" strategy.

    size (int or Sequence[int]): The target number of words per batch.
        Can be a single integer, or a sequence, allowing for variable batch sizes.
    tolerance (float): What percentage of the size to allow batches to exceed.
    discard_oversize (bool): Whether to discard sequences that by themselves
        exceed the tolerated size.
    get_length (Callable or None): Function to get the length of a sequence
        item. The `len` function is used by default.
    r   )r   r   r   )r   minibatch_by_words)r   r   r   r   r   s        r   configure_minibatch_by_wordsr   8   s=    " /9.Dz*"I)	
  r   c                 6    |d|ini }t        t        fd| i|S )zCreate a batcher that creates batches of the specified size.

    size (int or Sequence[int]): The target number of items per batch.
        Can be a single integer, or a sequence, allowing for variable batch sizes.
    r   r   )r   r   )r   r   r   s      r   configure_minibatchr    S   s*     /9.Dz*"I9545955r      Fseqsc              #   d  K   t        |t              rt        j                  |      }nt	        |      }t        | |      D ]f  }t        |      }t        |      }t        |||      D ]>  }|D 	cg c]  }	||	   	 }
}	t        d |
D              t        |
      z  }|r||k\  r;|
 @ h yc c}	w w)aq  Minibatch a sequence by the size of padded batches that would result,
    with sequences binned by length within a window.

    The padded size is defined as the maximum length of sequences within the
    batch multiplied by the number of sequences in the batch.

    size (int or Sequence[int]): The largest padded size to batch sequences into.
    buffer (int): The number of sequences to accumulate before sorting by length.
        A larger buffer will result in more even sizing, but if the buffer is
        very large, the iteration order will be less random, which can result
        in suboptimal training.
    discard_oversize (bool): Whether to discard sequences that are by themselves
        longer than the largest padded batch size.
    get_length (Callable or None): Function to get the length of a sequence item.
        The `len` function is used by default.
    )r   c              3   2   K   | ]  }t        |        y wNlen).0seqs     r   	<genexpr>z+minibatch_by_padded_size.<locals>.<genexpr>   s     ;3c#h;   N)
isinstanceint	itertoolsrepeatiterr   listnext_batch_by_lengthmaxr'   )r"   r   r   r   r   size_outer_batchtarget_sizeindicesisubbatchpadded_sizes               r   r   r   _   s     . $  &T
 F3 	;'5k'[*M 	G0781A8H8;(;;c(mKKK;$>		 9s   A,B0.B+:6B0g?c              #     K   t        |t              rt        j                  |      }nt	        |      }t        |      }||z  }g }g }	d}
d}| D ]  } ||      }|||z   kD  r	|r|g |dk(  r|
|z   |k  r|j                  |       |
|z  }
@|
|z   |z   ||z   k  r|	j                  |       ||z  }e|r| t        |      }||z  }|	}|}
g }	d}|
|z   |k  r|j                  |       |
|z  }
|
|z   ||z   k  r|	j                  |       ||z  }|r| t        |      }||z  }|g}|}
 |j                  |	       |r| yyw)a  Create minibatches of roughly a given number of words. If any examples
    are longer than the specified batch length, they will appear in a batch by
    themselves, or be discarded if discard_oversize=True.

    seqs (Iterable[Sequence]): The sequences to minibatch.
    size (int or Sequence[int]): The target number of words per batch.
        Can be a single integer, or a sequence, allowing for variable batch sizes.
    tolerance (float): What percentage of the size to allow batches to exceed.
    discard_oversize (bool): Whether to discard sequences that by themselves
        exceed the tolerated size.
    get_length (Callable or None): Function to get the length of a sequence
        item. The `len` function is used by default.
    r   N)r,   r-   r.   r/   r0   r2   appendextend)r"   r   r   r   r   r5   r7   tol_sizebatchoverflow
batch_sizeoverflow_sizer)   n_wordss                 r   r   r      s    ( $  &T
u+KY&HEHJM (%S/ [8++#eaZ'%9k$ILL'!J=(72h8NOOOC W$M u+K"Y.HE&JHMW$4S!g%
w&K(,BC$( K"5k&2$
Q(%R 
LL s   A!E$C$E	max_wordsc                    t        |       D cg c]  \  }} ||      |f }}}|j                          g }g }|D ]S  \  }}|s|j                  |       |t        |      dz   z  |k  r|j                  |       @|j                  |       |g}U |r|j                  |       t	        d |D              t        |       k(  sJ |D cg c]  }t        |       }}|j                          |S c c}}w c c}w )zGiven a list of sequences, return a batched list of indices into the
    list, where the batches are grouped by length, in descending order.

    Batches may be at most max_words in size, defined as max sequence length * size.
       c              3   2   K   | ]  }t        |        y wr%   r&   )r(   bs     r   r*   z#_batch_by_length.<locals>.<genexpr>   s     '!s1v'r+   )	enumeratesortr=   r'   sumsortedreverse)	r"   rE   r   r9   r)   lengths_indicesbatchesr@   lengths	            r   r3   r3      s     ;DD/J3
3+JOJGE$ 	LLOs5zA~&)3LLONN5!CE u'w''3t9444*12ve}2G2OON% K  3s   C/	C5r%   )r.   	functoolsr   typingr   r   r   r   r   r	   r
   r   r   utilr   r-   Sizingr   BatcherTboolr   floatr   r    r'   r   r   r3    r   r   <module>rZ      s    
 
 
 	x}c!	"Xe_%xU'<<= 48
  	
 5'3,/0 N 48
  	
 5'3,/0 8 BF	6
	6&x'=>	6	6 "$
5/$
$ $ 	$
 $ d5k$T I
5/I
I d5kIZ 58
3-$'	$s)_r   