
    i                        d Z ddlZddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZm Z m!Z!m"Z" ddl#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z,m-Z-m.Z.m/Z/m0Z0m1Z1m2Z2 ddl3m4Z4m5Z5m6Z6  e6jn                  e8      Z9dZ:dZ;dZ<dZ=dZ>e,dz  Z,ee e!e"dZ?e:e=dZ@ e5e,       G d de/             ZAeAZBy)z
Tokenization classes for fast tokenizers (provided by HuggingFace's tokenizers library). For slow (python) tokenizers
see tokenization_utils.py
    N)defaultdict)Iterable)copyfile)Any)is_offline_mode)
AddedToken
processors)Encoding)	Tokenizer)Decoder)BPEUnigram)
BpeTrainerUnigramTrainerWordLevelTrainerWordPieceTrainercached_file   )SpmConverter)convert_gguf_tokenizer)load_gguf_checkpoint)INIT_TOKENIZER_DOCSTRINGBatchEncodingPreTokenizedInputPreTrainedTokenizerBase	TextInputTruncationStrategygenerate_merges)PaddingStrategyadd_end_docstringsloggingztokenizer.jsonzspecial_tokens_map.jsonztokenizer_config.jsonztokenizer.modelzadded_tokens.jsonu  
        tokenizer_object ([`tokenizers.Tokenizer`]):
            A [`tokenizers.Tokenizer`] object from 🤗 tokenizers to instantiate from. See [Using tokenizers from 🤗
            tokenizers](../fast_tokenizers) for more information.
        tokenizer_file ([`str`]):
            A path to a local JSON file representing a previously serialized [`tokenizers.Tokenizer`] object from 🤗
            tokenizers.
)r   r   	WordLevel	WordPiece)tokenizer_file
vocab_filec            )       
    e Zd ZdZeZdZdZedLd       Z	 fdZ
edefd       Zedefd       ZdMd	ed
edz  dee   fdZd Zed        Zed        Zej*                  d        Zej*                  d        Zd Zedefd       Zdeeef   fdZedeeef   fd       Zedeeef   fd       Zedeeef   fd       ZeZeZ deeef   fdZ!defdZ"defdZ#ede$fd       Z%ede&fd       Z'	 	 	 	 	 	 	 dNde(dedz  dedz  d ed!ed"ed#ed$edeeee)f   e*e(   f   fd%Z+d&edefd'Z,d(ededz  fd)Z-dLd*e*eez     defd+Z.dLd,edefd-Z/dLd.ee*e   z  d/edee*e   z  fd0Z0dOd1ed,edz  d2ede*e   fd3Z1d4e2d5e3d6ed7ed8edz  d9edz  fd:Z4dde2jj                  e3jl                  dd;ddddddddddddfd1e7e8z  e*e7   z  e*e8   z  d<e7e8z  e*e7   z  e*e8   z  dz  d2ed4e2d5e3d6edz  d7ed=ed8edz  d9edz  d>edz  dedz  dedz  d ed!ed"ed#ed$ed?edz  de9f(d@Z:dAe*e   defdBZ;	 	 dPdCee*e   z  d/edDedz  defdEZ<	 	 dQd	ee=j|                  z  dFeedGf   dHedz  d
edz  deedGf   f
dIZ?	 	 	 dRdJZ@e	 	 	 	 	 	 	 dSdK       ZA xZBS )TTokenizersBackendaQ  
    Base class for all fast tokenizers (wrapping HuggingFace tokenizers library).

    Inherits from [`~tokenization_utils_base.PreTrainedTokenizerBase`].

    Handles all the shared methods for tokenization and special tokens, as well as methods for
    downloading/caching/loading pretrained tokenizers, as well as adding tokens to the vocabulary.

    This class also contains the added tokens in a unified way on top of all tokenizers so we don't have to handle the
    specific vocabulary augmentation methods of the various underlying dictionary structures (BPE, sentencepiece...).
    NFc                 ^  ( t        |      }|j                  dd      }|Qt        j                  j	                  |      r2| t
        u sd| j                  vs|rt        j                  |      |d<   |S |t        j                  j	                  |      rt        |d      5 }t        j                  |      }ddd       j                  di       j                  d      }|d	vr[t        |      }t        |d         }	i |	d
<   |dk(  rg |	d<   |	|d<   g |d<   t        j                  t        j                  |            }
nt        j                  |      }
|
j                  |d<   |
j                   |d<   |
j"                  |d<   |
j"                  |
j"                  |d<   |
j                   |
j                   |d<   |j                  d      }|rk|j                  dd      dk(  r|d   }nt%        |t&              s|g}|D ]8  }|j                  d      dk(  sd|v sddl}|j+                  |d         |d<    n |j                  di       j                  d
d      }| j,                  ,t%        |t&              r%t'        t/        t0        |            }n
| j,                  j2                  dk(  rDt%        |t&              r|rt%        |d   t&        t0        f      r|D cg c]  }t1        |       }}n| j,                  j2                  dk(  rt5        |      D ci c]  \  }}||
 }}}nu| j,                  j2                  dk(  s| j,                  j2                  dk(  rCt%        |t&              r3t5        |      D ci c]  \  }}t%        |t&              r|d   n|| }}}||d
<   t7        | dd      }d|j                  di       v r`|r^|j2                  dk(  rO|d   d   }|D cg c]7  }t%        |t8              rt1        |j;                  d            n
t1        |      9 }}||d<   |S |j                  d      }|j                  d      }|j                  d
      }|j                  d      }t%        |t8              rY|j=                  d       rHt        j                  j	                  |      r)d!d"lm }  ||#      jC                  |      \  |d
<   |d<   |S t%        |t8              rnt        j                  j	                  |      rN|j=                  d$      r<	 d!d%lm"}  ||      } |jF                  | j,                  fi |}	 d!d&lm$} |j                  | j2                        }|tK        |d'      r |jL                  d?i |}tK        | d+      r | jT                  d?i |}d|vr| t
        u sd| j                  vr|j                  d
d      }|j                  dd      }|j                  d,      xs i }||r|jW                         D ci c]  \  }}||
 }}}|jW                         D ]S  \  }}tY        |      }t9        |      }|j                  |      } | s0| |k7  s6||vs;|j                  |       ||<   |||<   U t[        j\                  |j^                  ||-      }!|!|!|d<   |j^                  j`                  }"|"jb                  dk\  r |je                  d.|"jf                  xs d/       |"jh                  dk\  r |je                  d0|"jj                  xs d1       |"jl                  dk\  r |je                  d2|"jn                  xs d3       |S |9t%        |t8              r)t        j                  j	                  |      r
||d
<   |d
   }|9t%        |t8              r)t        j                  j	                  |      r
||d<   |d   }|| j,                  | j,                  j2                  dk(  rwt%        |t               rgd:tt        tv           d;t&        t8           f(fd<(g d=}%ty               }&|%D ]"  }'|'|v s|&j{                   (||'   g             $ t}        ||&>      }||d<   |S # 1 sw Y   xY wc c}w c c}}w c c}}w c c}w # tN        $ r1}tP        jS                  d(| j2                   d)| d*       Y d}~<d}~ww xY wc c}}w # tN        $ rY}tP        jS                  d4| d5| d6       d!d7lm8}#  |#||j                  d8      9      }$|$js                         |d<   Y d}~|S d}~ww xY w)@zs
        Build a `tokenizers.Tokenizer` backend from the available serialization files (tokenizer.json, sentencepiece
        models, tekken.json, vocab/merges).
        r%   N__init__tokenizer_objectutf-8encodingmodeltype)Nr   vocabr   mergesadded_tokenspost_processortokenizer_paddingtokenizer_truncation_json_truncation_json_padding
normalizerSequencenormalizersPrecompiledprecompiled_charsmapr   _spm_precompiled_charsmapr   r#   r$    r&   merges_fileztekken.jsonr   )MistralConverter)r&   .model)SentencePieceExtractor)SLOW_TO_FAST_CONVERTERSconvert_from_spmz,Could not reorder vocab using converter for z due to z/. Falling back to raw SentencePiece extraction.convert_from_spm_modeladded_tokens_decoder)protor1   r2   	bos_token<s>	eos_token</s>	unk_tokenz<unk>z+Could not extract SentencePiece model from z$ using sentencepiece library due to z%. Falling back to TikToken extractor.)TikTokenConverterextra_special_tokens)r&   rO   valuesreturnc                     g }| D ]M  }|t        |t        t        f      r|j                   |             4|j	                  t        |             O |S N)
isinstancelisttupleextendappendstr)rP   	collectedval_iter_special_tokenss      {/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/tokenization_utils_tokenizers.pyr\   zHTokenizersBackend.convert_to_native_format.<locals>._iter_special_tokens)  s[    ')	! 3C{ !#e}5!(()=c)BC!((S23 !     )		pad_tokenrM   rI   rK   	sep_token	cls_token
mask_tokenadditional_special_tokensrO   )skip_tokens )?dictpopospathisfiler(   __dict__TokenizerFast	from_fileopenjsonloadgetfrom_strdumpsr4   padding
truncationrT   rU   base64	b64decoder/   maprV   __name__	enumerategetattrrY   splitendswithconvert_slow_tokenizerrA   extract_vocab_merges_from_modelrC   extractrD   hasattrrE   	ExceptionloggerwarningrF   itemsintr   build_tokenizer_from_spm_protorH   trainer_specbos_id
setdefault	bos_pieceeos_id	eos_pieceunk_id	unk_piecerN   	convertedr   r   setupdater   ))clstrust_remote_codekwargslocal_kwargsfast_tokenizer_filetokenizer_handletokenizer_json
model_typeminimal_tokenizer_jsonminimal_modeltok_from_filenormalizer_configr9   rv   r1   itemitokenr2   merger&   r@   rA   rC   	extractorrD   converter_classerG   token_idid_to_token	new_tokencurrent_tokenr+   
proto_specrN   	converterspecial_tokens_keysrd   keyr\   s)                                           @r]   convert_to_native_formatz*TokenizersBackend.convert_to_native_formate   s	    F|*../?F  +23))Zs||-KO`/</F/FGZ/[L+, ,@S1T )G< =@P!%+;!<= (++GR8<<VDJ!22)-n)=& $^G%< =)+g&&.0M(+2?&w/9;&~6 - 6 6tzzBX7Y Z - 7 78K L-:-I-IL)*0=0E0EL,-3@3K3KL/0 ''33@3K3K/0$$00=0E0E_- !/ 2 2< @ $((6*D(9-(H%#$5t<):(;%"3 J!~~f->CY]gCg%DJDTDT&'=>E%@A  #&&w377FEyy eT* UE!23E##y0eT*uE!HtUZm9\5:;TU4[;E;##{22;E2BChaCC##u,		0B0Bk0QeT*_hin_opS[STV[E4)@U1XeQNpEp$)L! gt4J>--gr::
zObObfkOk'0:kqrbgZs5K%C 01QVW\Q]]rr)/X&!%%l3
"&&}5  )!!(+ j#&:+>+>}+MRTRYRYR`R`akRl@<L%=--j9 :L!<#9   j#&277>>*+E*J]J]^fJgFIJ 3:>	0y00KlK	O&=&A&A#,,&OO&2wPb7c'G'G'G'W,'W
 3 89#=3#=#=#M#ML
 &\9,,
#,,0N(,,Wd;E)--h=F ,8+;+;<R+S+YWY((-ANSkkm&\?5(x&\&\3G3M3M3O B/Hi'*8}H(+II,7OOH,EM,)1KPYafPf3899]3Ki 08AH 5B (4'R'R'oo#%($
 (3;K%78 &/__%A%A
%,,1(33KAUAUA^Y^_%,,1(33KAUAUA_Y_`%,,1(33KAUAUA`Y`a   =Z
C8RWW^^J=W$.L! )E>jc:rww~~k?Z%0L"!(+F >cii3		8J8Je8SXbchjnXo	!Xc] 	!tCy 	!
# %(EK* R,&&&';\#=N<O'PQR %UDF%+L"[= =x <C q sF ! NNFs||nT\]^\_  `O  P & ']6  
IA*Mqrsqt u: : F-)@P@PQg@h	 4=3F3F3H/0
Is   "a$a1a65"a<<b+c
 ?Ab  B
c
 
cAc
 c
  c
 %C'c
 $a.	c&b<6c
 <c	c
 
	d,Ad''d,c           	         |j                  dd       }|j                  dd       }|j                  dd        |j                  dd       }|j                  dd       }|j                  dd       }|j                  di       }|j                  dd	      }	|j                  d
      }
|j                  d      }|j                  d      }d }|t        j                  |      }n|6t        j
                  j                  |      rt        j                  |      }nn|{t        |j                  dd      |fi |}t        |      }|d   d   }|d   }|d   }t        ||      \  }}|j                  |       t        |      dkD  r|j                  |       n| j                  ||Ot        |t               r|n#t#        |      D ci c]  \  }\  }}|| c}}}}t        t%        ||dd             }nt        |t               rt        t%        |g dd             }nit        |t&              rY|rWt        |d   t(        t&        f      r>t        t+        ||j                  dd                  }n| j                  t-        d      |2|0| j                  $|j/                  dd       |j/                  dd       ||| _        | j                  t-        d      |j                  dd       xs | j                  j0                  xs |}|q | j                  j2                  d8i | |j/                  d|d          |j/                  d |d!          |j/                  d"|d"          |j/                  d#|d$          n| j                  j5                          |j                  d%d       xs | j                  j6                  xs |}| | j                  j8                  d8i | |j/                  d&|d&          |j/                  d'|d(          |j/                  d)|d!          |j/                  d|d*          |j/                  d+|d+          d,|vrd-|d,<   d.|v xs d/|v }|j                  d.d	      | _        |j                  d/d	      | _        |j                  d0d       x}r|| j                  _        |xs | j                  j>                  d u | _         tC        &|   d8i | |
|
| _#        |	| _$        | jJ                  | j                  _&        | jN                  D ch c]  }tQ        tS        |             }}tU        |jW                         d1 2      D cg c]  \  }}tQ        tS        |            |vr| }}}t'        | jX                  j[                               |D cg c]  }t]        |       c}z   } | j^                  ja                         D ])  }!|!t]        |!      | vs|!|vs|jc                  |!       + | jd                  D ]&  }t]        |      | vs||vs|jc                  |       ( t        |      dkD  rg }"| j^                  ja                         D #cg c]  }#|#st]        |#       }$}#|D ]a  }t        |t\              rtg        |d3      }n0t        |tf              r |jh                  st]        |      |$v rd|_4        |"jc                  |       c |"r| jk                  |"       	 | j                  jm                         }%|%d4kD  rtq        | j                  d5d       l|j                  dd         | jr                  | j                  | jt                  j                  dd       f| jt                  |j                  d6d       d7|| _        | j@                  xs | j                  j>                  d u | _         | j@                  r| jw                          y y c c}}}w c c}w c c}}w c c}w c c}#w # tn        $ r d}%Y w xY w)9Nr7   r8   r>   r+   	gguf_filer%   rG   add_prefix_spaceFr&   r1   r2   name_or_path configr   	tokenizertokenizer_configr   T)r1   r2   fuse_unkdropoutr   )r1   r   a9  Couldn't instantiate the backend tokenizer from one of: 
(1) a `tokenizers` library serialization file, 
(2) a slow tokenizer instance to convert or 
(3) an equivalent slow tokenizer class to instantiate and convert. 
You need to have sentencepiece or tiktoken installed to convert a slow tokenizer to a fast one.rI   rJ   rK   rL   z3The backend tokenizer is not correctly initialized.r6   
max_lengthtruncation_side	directionstridetruncation_strategystrategyr5   r_   pad_token_type_idpad_type_idpadding_sidelengthpad_to_multiple_ofbackend
tokenizersadd_bos_tokenadd_eos_tokenr4   c                     | d   S Nr   re   )xs    r]   <lambda>z,TokenizersBackend.__init__.<locals>.<lambda>  s    STUVSW r^   r   )speciali pre_tokenizerfix_mistral_regex)init_kwargsr   re   )<rg   rq   copydeepcopyrh   ri   rj   rl   rm   r   r   r   r   len
_tokenizerrT   rf   rz   r   rU   rV   r   
ValueErrorr   ru   enable_truncationno_truncationrt   enable_padding_add_bos_token_add_eos_tokenr4   _should_update_post_processorsuperr*   r&   r   split_special_tokensencode_special_tokensrG   hashreprsortedr   added_tokens_encoderkeysrY   _special_tokens_maprP   rX   _extra_special_tokensr   r   
add_tokensget_vocab_sizeNotImplementedErrorr{   _patch_mistral_regexr   update_post_processor)'selfargsr   r7   r8   r+   r   r   rG   r   r&   r1   r2   fast_tokenizer	gguf_path
gguf_paramarchitecturetokenizer_dictr   additional_kwargsr   w_
vocab_dict_truncation_paddingexplicit_bos_eos_in_kwargsr4   r   added_tokens_decoder_hashindextokens_to_addencoderspecial_token_valuetokenstall_named_tokens
vocab_size	__class__s'                                         r]   r*   zTokenizersBackend.__init__H  s    "::&8$?

?D9 	

.5!::&8$?JJ{D1	$jj)94@%zz*@"E!::&8%@ZZ-


7#H%'!]]+;<N ,@S1T*445HIN"#FJJ~r$BIXQWXI-i8J%h/=L'4N)*<=0F|Uc0d-N-MM*+$%)/0__$):!&0&=UZcdiZjCkCkYQPVQRTUAqDCk
!.sF]ako/p!qE4(!.srTXbf/g!hE4(Uz%(UTXM7Z!.wU6::V^`aKb/c!d__$r  &+;+CH_k51k62%,DO??"RSSjj!7>p$//B\B\p`p"-DOO--<<lK,EF/[1IJhH(=>3[5LMOO))+::148dDOO<S<SdWd*DOO**6X6k8K+@A18M3JKnh{.CDlHX,>?2H=Q4RS F" ,F9%4%>%[/U[B["$jj%@$jj%@#ZZ(8$??>?-;DOO*-G-q4??KiKimqKq*"6"!(DO 0040I0I-DHD]D]$^5T$u+%6$^!$^ !'';'A'A'C X
uDK (AA 
 

 t005578Ta;b5CJ;bb $(#;#;#B#B#D 	:"*&'w6;NVc;c$$%89		: // 	,E5z(U--G$$U+	, }!F040H0H0O0O0QW1UVAWW& %eS)&ud;Ez2 ==SZ;K-K(,e$% '	779J
 74??OT#R#^JJ{D)7d77  $$^T: !,,"(**-@$"G	
 DO ..X$//2P2PTX2X 	* --&&( .o Dl~ %_

 <c"  X  # 	J	s6   __!"_& _,_1&_1/_6 6``rQ   c                      y)NTre   r   s    r]   is_fastzTokenizersBackend.is_fast  s    r^   c                     d| j                   v r`| j                   d   j                  d      rBt        | d      r5| j                  r)t        j
                  j                  | j                        S yy)z
        `bool`: Whether or not the slow tokenizer can be saved. For a sentencepiece based slow tokenizer, this
        can only be `True` if the original `"sentencepiece.model"` was not deleted.
        r&   rB   FT)vocab_files_namesr}   r   r&   rh   ri   rj   r  s    r]   can_save_slow_tokenizerz)TokenizersBackend.can_save_slow_tokenizer  sX     4111d6L6L\6Z6c6cdl6mt\*tww~~doo66r^   save_directoryfilename_prefixc                    t         j                  j                  |      st        j	                  d| d       y t         j                  j                  ||r|dz   ndt        d   z         }t         j                  j                  | j                        t         j                  j                  |      k7  rt        | j                  |       |fS )NzVocabulary path (z) should be a directory-r   r&   )
rh   ri   isdirr   errorjoinVOCAB_FILES_NAMESabspathr&   r   )r   r	  r
  out_vocab_files       r]   save_vocabularyz!TokenizersBackend.save_vocabulary  s    ww}}^,LL,^,<<STUo_s22QbcoQpp
 77??4??+rww~/NNT__n5  r^   c                 @   | j                   }| j                  }|| j                  rd| _        | j                  }| j                  }|| j
                  rd| _        | j                  r|dz   nd d| j
                  rd|z   dz   nd }| | j                  rd|z   dz   nd d	| j
                  rd|z   dz   nd }g }| j                  r|j                  ||f       | j
                  r|j                  ||f       t        j                  |||
      | j                  _
        y)ze
        Updates the underlying post processor with the current `bos_token` and `eos_token`.
        NFz:0 r   z$A:0r?   z:0z:1z $B:1)singlepairspecial_tokens)rI   bos_token_idr   rK   eos_token_idr   rX   r	   TemplateProcessingr   r4   )r   bosr  eosr  r  r  r  s           r]   r   z'TokenizersBackend.update_post_processor
  s4    nn((;4--!&Dnn((;4--!&D%)%7%7S5[R@[_[m[mcCiRVFVsuDvw0B0B39t+K5gkgygyRUX[R[^bRb  @B  QC  D!!3"56!!3"56)3)F)F^*
&r^   c                     t        | dd      S )Nr   Fr{   r  s    r]   r   zTokenizersBackend.add_eos_token$      t-u55r^   c                     t        | dd      S )Nr   Fr  r  s    r]   r   zTokenizersBackend.add_bos_token(  r  r^   c                 R    t         j                  | d|       | j                          y )Nr   object__setattr__r   r   values     r]   r   zTokenizersBackend.add_eos_token,  !    4!159""$r^   c                 R    t         j                  | d|       | j                          y )Nr   r"  r%  s     r]   r   zTokenizersBackend.add_bos_token1  r'  r^   c                 @   g }| j                   j                         D ]U  }|t        |t              r|j	                  |       (t        |t
              s9|j	                  t        |dd             W | j                  D ]R  }t        |t              r|j	                  |       %t        |t
              s6|j	                  t        |dd             T |r| j                  |d       t        | dd      s| j                  j                  | j                          yy)a[  
        Post-initialization hook that runs after the tokenizer is fully set up.
        This is called by from_pretrained() after loading the tokenizer, which allows
        us to add any special tokens that may have been passed as AddedToken objects.

        Child classes should call super()._post_init() if they override this method.
        NTF)r   
normalized)r  r   )r   rP   rT   r   rX   rY   r   r   r{   r   r4   r   )r   r   token_valuer   s       r]   
_post_initzTokenizersBackend._post_init6  s    33::< 	^K"+z2$$[1K-$$ZTV[%\]	^ // 	XE%,$$U+E3'$$ZtPU%VW		X OOM$O?48$?4??CaCaCi&&( Djr^   c                 :    | j                   j                  d      S )zP
        `int`: Size of the base vocabulary (without the added tokens).
        Fwith_added_tokensr   r   r  s    r]   r  zTokenizersBackend.vocab_sizeV  s    
 ---FFr^   c                 :    | j                   j                  d      S )NTr.  )r   	get_vocabr  s    r]   r2  zTokenizersBackend.get_vocab]  s    ((4(@@r^   c                 "    | j                         S rS   )r2  r  s    r]   r1   zTokenizersBackend.vocab`  s    ~~r^   c                     t        | j                  j                         d       D ci c]  \  }}|j                  | c}}S c c}}w )z
        Returns the sorted mapping from string to index. The added tokens encoder is cached for performance
        optimisation in `self._added_tokens_encoder` for the slow tokenizers.
        c                     | d   S r   re   r   s    r]   r   z8TokenizersBackend.added_tokens_encoder.<locals>.<lambda>j      dhijdk r^   r   r   rG   r   contentr   vks      r]   r   z&TokenizersBackend.added_tokens_encoderd  s;     *00I0I0O0O0QWk)lmA		1mmm   Ac                 6    | j                   j                         S )z
        Returns the added tokens in the vocabulary as a dictionary of index to AddedToken.

        Returns:
            `dict[str, int]`: The added tokens.
        )r   get_added_tokens_decoderr  s    r]   rG   z&TokenizersBackend.added_tokens_decoderl  s     7799r^   c                     t        | j                  j                         d       D ci c]  \  }}|j                  | c}}S c c}}w )z
        Returns the added tokens in the vocabulary as a dictionary of token to index.

        Returns:
            `dict[str, int]`: The added tokens.
        c                     | d   S r   re   r6  s    r]   r   z3TokenizersBackend.get_added_vocab.<locals>.<lambda>  r7  r^   r   r8  r:  s      r]   get_added_vocabz!TokenizersBackend.get_added_vocab{  s;     *00I0I0O0O0QWk)lmA		1mmmr=  c                      y)zN
        Returns True, to avoid expensive `assert tokenizer` gotchas.
        Tre   r  s    r]   __bool__zTokenizersBackend.__bool__  s     r^   c                 :    | j                   j                  d      S )zD
        Size of the full vocabulary with the added tokens.
        Tr.  r0  r  s    r]   __len__zTokenizersBackend.__len__  s     ---EEr^   c                     | j                   S )zc
        `tokenizers.implementations.BaseTokenizer`: The Rust tokenizer used as a backend.
        )r   r  s    r]   backend_tokenizerz#TokenizersBackend.backend_tokenizer  s    
 r^   c                 .    | j                   j                  S )zU
        `tokenizers.decoders.Decoder`: The Rust decoder for this tokenizer.
        )r   decoderr  s    r]   rJ  zTokenizersBackend.decoder  s    
 &&&r^   Tr.   return_token_type_idsreturn_attention_maskreturn_overflowing_tokensreturn_special_tokens_maskreturn_offsets_mappingreturn_lengthverbosec	                 J   |d| j                   v }|d| j                   v }|r|j                  |g|j                  z   }	n|g}	t        t              }
|	D ]  }|
d   j	                  |j
                         |r|
d   j	                  |j                         |r|
d   j	                  |j                         |r|
d   j	                  |j                         |r|
d   j	                  |j                         |s|
d   j	                  t        |j
                                |
|	fS )a  
        Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
        of encodings, take care of building a batch from overflowing tokens.

        Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
        lists (overflows) of lists (tokens).

        Output shape: (overflows, sequence length)
        token_type_idsattention_mask	input_idsspecial_tokens_maskoffset_mappingr   )model_input_namesoverflowingr   rU   rX   idstype_idsrT  rV  offsetsr   )r   r.   rK  rL  rM  rN  rO  rP  rQ  	encodingsencoding_dictr   s               r]   _convert_encodingz#TokenizersBackend._convert_encoding  s$   ( !($48N8N$N! ($48N8N$N!$)=)=)I!
X%9%99I!
I#D) 	;A+&--aee4$./66qzzB$./66q7G7GH)34;;A<Q<QR%./66qyyAh'..s155z:	; i''r^   r   c                 X    | j                   j                  |      }|| j                  S |S rS   )r   token_to_idunk_token_id)r   r   r   s      r]   #_convert_token_to_id_with_added_vocz5TokenizersBackend._convert_token_to_id_with_added_voc  s,    ++E2=$$$r^   r   c                 J    | j                   j                  t        |            S rS   )r   r   r   )r   r   s     r]   _convert_id_to_tokenz&TokenizersBackend._convert_id_to_token  s    **3u:66r^   
new_tokensc                 r    |r| j                   j                  |      S | j                   j                  |      S rS   )r   add_special_tokensr   )r   rf  r  s      r]   _add_tokenszTokenizersBackend._add_tokens  s/    ??55jAA))*55r^   r  c                 8    | j                   j                  |      S )aG  
        Returns the number of added tokens when encoding a sequence with special tokens.

        <Tip>

        This encodes a dummy input and checks the number of added tokens, and is therefore not efficient. Do not put
        this inside your training loop.

        </Tip>

        Args:
            pair (`bool`, *optional*, defaults to `False`):
                Whether the number of added tokens should be computed in the case of a sequence pair or a single
                sequence.

        Returns:
            `int`: Number of special tokens added to sequences.
        )r   num_special_tokens_to_add)r   r  s     r]   rk  z+TokenizersBackend.num_special_tokens_to_add  s    & 88>>r^   rZ  skip_special_tokensc                 $   t        |t              r| j                  j                  |      S g }|rt	        | j
                        n	t	               }|D ]<  }t        |      }||v r|j                  | j                  j                  |             > |S )a  
        Converts a single index or a sequence of indices in a token or a sequence of tokens, using the vocabulary and
        added tokens.

        Args:
            ids (`int` or `list[int]`):
                The token id (or token ids) to convert to tokens.
            skip_special_tokens (`bool`, *optional*, defaults to `False`):
                Whether or not to remove special tokens in the decoding.

        Returns:
            `str` or `list[str]`: The decoded token(s).
        )rT   r   r   r   r   all_special_idsrX   )r   rZ  rl  r   ids_to_skipr   s         r]   convert_ids_to_tokensz'TokenizersBackend.convert_ids_to_tokens  s     c3??..s333Fc$../CE 	>EJE#MM$//55e<=		>
 r^   textrh  c                 J     | j                   d|||d|j                         S )N)rq  	text_pairrh  re   )_encode_plusr   )r   rq  r  rh  r   s        r]   tokenizezTokenizersBackend.tokenize  s,     t  lddOaleklssuur^   padding_strategyr   r   r   r   r   c                    | j                   j                  }| j                   j                  }|t        j                  k(  r||| j                   j                          na|||j                  | j                  d}	|d}
n |	D ci c]  }||j                  |d       }
}|
|	k7  r | j                   j                  di |	 |t        j                  k(  r|| j                   j                          yy|t        j                  k(  r|nd}|||n| j                  | j                  | j                   | j"                  |d}	||	k7  r | j                   j$                  di |	 yyc c}w )a  
        Define the truncation and the padding strategies for fast tokenizers (provided by HuggingFace tokenizers
        library) and restore the tokenizer settings afterwards.

        The provided tokenizer has no padding / truncation strategy before the managed section. If your tokenizer set a
        padding / truncation strategy before, then it will be reset to no padding / truncation when exiting the managed
        section.

        Args:
            padding_strategy ([`~utils.PaddingStrategy`]):
                The kind of padding that will be applied to the input
            truncation_strategy ([`~tokenization_utils_base.TruncationStrategy`]):
                The kind of truncation that will be applied to the input
            max_length (`int`):
                The maximum size of a sequence.
            stride (`int`):
                The stride to use when handling overflow.
            pad_to_multiple_of (`int`, *optional*):
                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
            padding_side (`str`, *optional*):
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
        N)r   r   r   r   )r   r   pad_idr_   r   r   re   )r   ru   rt   r   DO_NOT_TRUNCATEr   r&  r   rq   r   r    
DO_NOT_PAD
no_padding
MAX_LENGTHr   pad_token_idr_   r   r   )r   rv  r   r   r   r   r   r   r   targetcurrentr<  r   s                r]   set_truncation_and_paddingz,TokenizersBackend.set_truncation_and_padding  sZ   B oo00??**"4"D"DD&--/ ) /55!11	F "@FG11kooa66GG& 111;F;999#**, $ $47Q7Q#QZW[F -9-E\4K\K\++!^^#55&8F 6!...88 "% Hs   Er   rs  is_split_into_wordsreturn_tensorsr   c                    d } ||      st        d      | ||      st        d      |r6t        |t        t        f      xr |xr t        |d   t        t        f      }nt        |t        t        f      }|rrt        |t              rt        d      |;t        |      t        |      k7  r$t        dt        |       dt        |       d      |t        t        ||            n|}n
|r||fgn|g}t        |t        t        f      st        dt        |       d	      | j                  |||||	|

       || j                  }| j                  j                  |k7  r|| j                  _        | j                  j                  |||      }|D cg c]  }| j                  ||||||||       }}i }|d   d   D ]'  }|D cg c]  \  }}||   D ]  }|  } }}}| ||<   ) |D cg c]  \  }}|D ]  }|  }!}}}|r2g }"t        |      D ]  \  }#\  }$}|"|#gt        |$d         z  z  }" |"|d<   |d   D ]  }%| j!                  |%||        t#        ||!|      }&|sb|`|s^t#        |&j%                         D 'ci c].  \  }}'|t        |'      dkD  rt        |'d   t              r|'d   n|'0 c}'}|&j&                        }&|&S c c}w c c}}}w c c}}}w c c}'}w )Nc                    t        | t              ryt        | t        t        f      rt	        |       dk(  ryt        | d   t              ryt        | d   t        t        f      rtt	        | d         dk(  st        | d   d   t              ryt        | d   d   t        t        f      r/t	        | d   d         dk(  xs t        | d   d   d   t              S yyy)NTr   F)rT   rY   rU   rV   r   )r   s    r]   _is_valid_text_inputz<TokenizersBackend._encode_plus.<locals>._is_valid_text_inputq  s    !S!Ae}-q6Q;!c*!tUm41Q4yA~AaDGS)A##AaDGdE];"1Q47|q0OJqtAwqz34OO$ r^   ztext input must be of type `str` (single example), `list[str]` (batch or single pretokenized example) or `list[list[str]]` (batch of pretokenized examples) or `list[tuple[list[str], list[str]]]` (batch of pretokenized sequence pairs).r   zdwhen tokenizing batches of text, `text_pair` must be a list or tuple with the same length as `text`.zbatch length of `text`: z- does not match batch length of `text_pair`: .z:batch_text_or_text_pairs has to be a list or a tuple (got ))rv  r   r   r   r   r   )rh  is_pretokenized)r.   rK  rL  rM  rN  rO  rP  rQ  rU  overflow_to_sample_mapping)tensor_type)r   rT   rU   rV   rY   	TypeErrorr   zipr0   r  r   r   r   encode_batchr_  rz   &_eventual_warn_about_too_long_sequencer   r   r]  )(r   rq  rs  rh  rv  r   r   r   r  r   r   r  rK  rL  rM  rN  rO  rP  rQ  r   r   r  
is_batchedbatch_text_or_text_pairsr]  r.   tokens_and_encodingssanitized_tokensr   r   r   r   stacksanitized_encodingsr  r   toksrU  batched_outputr&  s(                                           r]   rt  zTokenizersBackend._encode_plusY  s   0	( $D)W 
  )=i)HW  #D4-8hThjQUVWQX[_afZgFhJ#D4-8J)S)  $Tc)n)D .s4yk :I'q*  FOEZtCi,@'A`d$ ?Hy(9':dV$ 2UDMBLTRjMkLllmn  	''- 3!1% 	( 	
  '#'#<#< ??004HH4HDOO1 OO00$1/ 1 
	$ & 
  ""!&;&;*C+E'=+ # 	 
  
 '*1- 	*C&:NN74DINqQNQNEN$)S!	* 1ESSWQdSqSqSS %)+& )*> ? K9D!*qcC[8I4J.JJ*K=W9:)+6 	XI77	:wW	X ''79LZhi n4=V* '5&:&:&<"U c%j1nE!Hd9S%(Y^^ ((N W 
" OS"s    KK
3K3K"
r   c                     | j                   j                  %| j                   j                  j                  |      S dj                  |      S )Nr?   )rH  rJ  decoder  )r   r   s     r]   convert_tokens_to_stringz*TokenizersBackend.convert_tokens_to_string  sJ     %%--9 ""**11&9	
 &!	
r^   	token_idsclean_up_tokenization_spacesc                     |j                  dd        t        |t              r|g}t        |t              r|d   }| j                  j                  ||      }||n| j                  }|r| j                  |      }|S )Nuse_source_tokenizerrU  )rl  )rg   rT   r   rf   r   r  r  clean_up_tokenization)r   r  rl  r  r   rq  s         r]   _decodezTokenizersBackend._decode  s     	

)40i%"Ii&!+.I%%iEX%Y ,7 )22 	%
 (--d3Dr^   
file_names.legacy_formatc                     t        |      }t        j                  j                  ||r|dz   ndt        z         }| j
                  j                  |       ||fz   }|S )Nr  r   )rY   rh   ri   r  TOKENIZER_FILErH  save)r   r	  r  r  r
  r%   s         r]   _save_pretrainedz"TokenizersBackend._save_pretrained  s]     ^,o_s22Q__
 	##N3>"33
r^   c           	      `
   t        j                  | j                  j                               }|j	                  d      }|j	                  d      }	d}
|d   d   dk(  ri |d   d<   g |d   d<   np|d   d   d	k(  r=|d   d
   ]|d   d
   }|d   d   |   d   }
|	|
|v r||
   }
d|d   d
<   |
dgg|d   d<   n(|d   d   dv r	i |d   d<   nt        d|d   d    d      |"d|d   v r|d   d   |v r||d   d      |d   d<   t        j                  t        j                  |            }g }|D ]b  }|j	                  dd      }|j	                  dd      }|d   d   d	k7  r|s5||d   |v r||d      |d<   |j                  t        d(i |       d ||j                  |       |d   d   dk(  rd|vr|d   d   |d   d   |d<   |d   d   dk(  rd|vr|d   d   |d   d   |d<   |d   d   d	k(  r|
|
|d<   |d   V|d   d   dk(  s*|d   d   dk(  r@d|d   v r9t        d |d   d   D              r!t        j                  j                         |d<   t         |d   d      } |d(||d|}|j#                  |||       |	&t        j                  |j                               }d|	v r|	d   D ]  }|	d   |   d   }||D cg c]  }|j%                  ||       }}||	d   |   d<   |D ]   }|j'                  |      }|t        d        |D cg c]  }|j'                  |       c}|	d   |   d!<    d"D ]?  }||	v s|	|   \  }}|	||v r||   }|j'                  |      }|t        d       ||g|	|<   A |	|d<   t        j                  t        j                  |            }| j(                  j+                         }t,        j.                  D ]  }t1        | |      t1        | |      }|	||v r||   }| j2                  j%                  |d      }t5        |t              r=t        ||j6                  |j8                  |j:                  |j<                  d#$      ||<   |||<    | j>                  r| j>                  j+                         ng }||j                  |       tA        |      dkD  r||d%<   ||d&<   	  | jB                  d(i |S c c}w c c}w # tD        $ rE}d'tG        |      v r2|j	                  d&d        | jB                  d(i |}||_        |cY d}~S  d}~ww xY w))uf  
        Trains a tokenizer on a new corpus with the same defaults (in terms of special tokens or tokenization pipeline)
        as the current one.

        Args:
            text_iterator (generator of `list[str]`):
                The training corpus. Should be a generator of batches of texts, for instance a list of lists of texts
                if you have everything in memory.
            vocab_size (`int`):
                The size of the vocabulary you want for your tokenizer.
            length (`int`, *optional*):
                The total number of sequences in the iterator. This is used to provide meaningful progress tracking
            new_special_tokens (list of `str` or `AddedToken`, *optional*):
                A list of new special tokens to add to the tokenizer you are training.
            special_tokens_map (`dict[str, str]`, *optional*):
                If you want to rename some of the special tokens this tokenizer uses, pass along a mapping old special
                token name to new special token name in this argument.
            kwargs (`dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the trainer from the 🤗 Tokenizers library.

        Returns:
            [`PreTrainedTokenizerFast`]: A new tokenizer of the same type as the original one, trained on
            `text_iterator`.

        r3   r4   Nr/   r0   r   r1   r2   r   r   r   g        )r#   r$   z;This method does not support this type of tokenizer (found z-) only BPE, Unigram, WordLevel and WordPiece.rM   r   idr9  continuing_subword_prefixend_of_word_suffixr   	ByteLevelr:   pretokenizersc              3   ,   K   | ]  }|d    dk(    yw)r0   r  Nre   ).0pretokenizers     r]   	<genexpr>z<TokenizersBackend.train_new_from_iterator.<locals>.<genexpr>  s"      $ !(K7s   initial_alphabet)r  r  )r   trainerr  r   zQAttempted to set a token in the post processor that does not exist in the mappingrZ  )r   sepT)single_wordlstriprstripr*  r   rO   r+   z7multiple values for keyword argument 'tokenizer_object're   )$ro   loadsr   to_strrg   r   rl   rr   rs   rX   r   rW   anypre_tokenizers_fastr  alphabetMODEL_TO_TRAINER_MAPPINGtrain_from_iteratorrq   ra  r   r   r   SPECIAL_TOKENS_ATTRIBUTESr{   r   rT   r  r  r  r*  rO   r   r  r  rY   )r   text_iteratorr  r   new_special_tokensspecial_tokens_mapr   r   r3   r4   rM   r   r   r  added_tokenr   r   trainer_classr  trained_tokenizer_jsonr   r   r   r   special_tokenspecial_token_fullrO   r   new_tokenizers                                r]   train_new_from_iteratorz)TokenizersBackend.train_new_from_iterator%  s   D DOO$:$:$<=%)).9'++,<=	'"6*e3/1N7#G,02N7#H-G$V,	9g&x0<'0:*73G<VDQG	%1iCU6U 29 =I45w'15>4D3Ew'0G$V,0JJ/1N7#G,Mn]dNeflNmMn o> >  *~g66w'48JJ3EnU\F]^iFj3kN7#K0!**4::n+EF	 ' 	=K!ooi6Gd+Ag&v.);G!-+i2HL^2^);K	<R)SI&!!*";{";<	= )!!"45 7#F+u4+69w'(CDP2@2IJe2fF./7#F+u4$F2w'(<=I+9'+BCW+XF'('"6*i7I<Q"+F;/*6/7;F!/26:jH#~o'FF (6(G(X 
 .A-J-J-S-S-U)*01H1PQ_:n_X^_%%mFG%T%%)ZZ	0@0@0B%C">1)*:; vC+,<=cB8LF)5TZ![5"4"8"8"F![![FLN#34S9(C!' #,#8#8#?#+", s#  ouCuejIDYDYZ_D`CuN#34S9%@v "0 
F N2-m<HE1)5%CU:U 25 9(44U;H'(o  6;H4EN=1
F 8F"#34%..tzz:P/QRI!!&&(,FF 	2EtU#/ 'e 4%1mGY6Y$6}$EM%)%=%=%A%A%%N"0*=$.%$6$B$B188188#5#@#@ $%F5M %2F5M%	2* DHC\C\t88==?bd) ''(:;#$q(-AF)* &/!"	!4>>+F++{ "\ Dvj  	HCPQFR 

-t4 . 8 8+4($$ 	s0   SSS 	T-(9T(!T-'T((T-c
           
         ddl ddlm ddlm} ddlm} dt        dt        ffd}t               rd	}||s|s ||      r ||d
|||dd|      }d}|t        |d      5 }t        j                  |      }ddd       j                  d      }|j                  d      }|r-|j                  |      |j                  d      k  r
|r/|-|dvr)|S |r%|j                  |      |j                  d      kD  r|S d	}|s|s@ ||      r7|rd|v rt        |d|d          |	5t!        |dd      s(t        |dd       t"        j%                  d| d       |S |	d	u st!        |dd      rt        |dd	       ddl}|j(                  j+                  |j-                  d      d      }|j.                  }t1        ||j(                  j2                        r||j.                  d<   |S t1        ||j(                  j4                        r|j(                  j7                  dd      }|j(                  j3                  ||g      |_        |S # 1 sw Y   xY w)af  
        Patches mistral related tokenizers with incorrect regex if detected
            1) Local file with an associated config saved next to it
                >> Model type one of the mistral models (on older versions)
            2) Remote models on the hub from official mistral models
                >> Tags including `base_model:.*mistralai`
        r   N)
model_info)versionr   model_idrQ   c                      |       }|j                   ,j                  ddj                  |j                               ryy)Nzbase_model:.*mistralair   TF)tagssearchr  )r  r/   r  res     r]   is_base_mistralz?TokenizersBackend._patch_mistral_regex.<locals>.is_base_mistral  s9    x(Ezz%995rwwuzz7JKr^   Tzconfig.jsonF)	cache_dirr   local_files_only%_raise_exceptions_for_missing_entries'_raise_exceptions_for_connection_errors_commit_hashr,   r-   transformers_versionr   z4.57.2)mistralmistral3voxtral	ministralpixtralz4.57.3r   z$The tokenizer you are loading from 'a  ' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.z[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]*[\p{Ll}\p{Lm}\p{Lo}\p{M}]+|[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}]+[\p{Ll}\p{Lm}\p{Lo}\p{M}]*|\p{N}| ?[^\s\p{L}\p{N}]+[\r\n/]*|\s*[\r\n]+|\s+(?!\S)|\s+isolated)patternbehavior)r   	use_regex)r  huggingface_hubr  	packagingr  transformers.utils.hubr   rY   boolr   rn   ro   rp   rq   parsesetattrr{   r   r   r   pre_tokenizersSplitRegexr   rT   r:   	Metaspacer  )r   r   pretrained_model_name_or_pathr   r  r  r  is_localr   r   r   r  r   r  _config_filemistral_config_detectedf_configr  transformers_model_typer   split_pretokenizercurrent_pretokenizerr  r  s                          @@r]   r   z&TokenizersBackend._patch_mistral_regex  s   * 	.%6	c 	d 	 H(4X/:W*X&-#!16;8=)	L ',#',9 +Q"iilG+'.{{3I'J$*1++l*C'
 (GMM:N,OSZS`S`aiSj,j 3?3   )()gmm<P.QT[TaTabjTk.k$$*.'&xOLi<j#6+#EI':KH[<\] %,WYH[]b5cI':EBNN>?\>] ^e eH ? '$.')EXZ_2`I':DA%)3)B)B)H)H * 0 0 s! ",	 *I *& ,5+B+B(!"6
8Q8Q8Z8Z[5G	//2"  &&:J<U<U<_<_`3=3L3L3V3V16% 4W 40
 3=2K2K2T2T 2 43	/ O+ +s   *I		I)FrS   )NNFFFFT)NF)FN)NN)NNN)NNFNFNN)Cry   
__module____qualname____doc__r  r  r/   r   classmethodr   r*   propertyr  r  r  rY   rV   r  r   r   r   setterr,  r   r  rf   r2  r1   r   r   rG   _added_tokens_encoder_added_tokens_decoderrB  rD  rF  rl   rH  DecoderFastrJ  EncodingFastr   rU   r_  rc  re  ri  rk  rp  ru  r    r   r  rz  ry  r   r   r   rt  r  r  rh   PathLiker  r  r   __classcell__)r  s   @r]   r(   r(   S   s   
 *EJ` `Da)F      !c !C$J !Z_`cZd !
4 6 6 6 6 % % % %)@ GC G GA4S> A  tCH~     nd38n n n :d3
?&; : : 10nc3h n$ F F =   ' ' ' .2-1*/+0',#-(-(  $d{-(  $d{	-(
 $(-( %)-( !%-( -( -( 
tCH~tL11	2-(^  7# 7#* 76d3+;&< 6WZ 6?d ?s ?*tCy t `cfjknfo`o 4vS vd
 vt vjnorjs vI9)I9 0I9 	I9
 I9  $JI9 DjI9\ gk#',;,F,F2D2T2T!%$))-#'&*-1-1*/+0',#,0)X++d9o=EV@WWX 004	?BTJ[E\\_ccX !	X
 *X 0X $JX X "X  $JX DjX tX  $d{X  $d{X $(X  %)!X" !%#X$ %X& 'X( #Tk)X, 
-Xt
tCy 
S 
 %*48	c? " '+Tk	 
< &*&*bkk) #s(O d{	
 t 
sCx* CJ 
 } }r^   r(   )Cr  r   ro   rh   collectionsr   collections.abcr   shutilr   typingr   tokenizers.pre_tokenizersr  r  r  r   r   r   r	   r
   r  r   rl   tokenizers.decodersr   r  tokenizers.modelsr   r   tokenizers.trainersr   r   r   r   r  r   r~   r   integrations.ggmlr   modeling_gguf_pytorch_utilsr   tokenization_utils_baser   r   r   r   r   r   r   utilsr    r!   r"   
get_loggerry   r   r  SPECIAL_TOKENS_MAP_FILETOKENIZER_CONFIG_FILETIKTOKEN_VOCAB_FILEADDED_TOKENS_FILEr  r  r(   PreTrainedTokenizerFastre   r^   r]   <module>r     s   
   	 # $   7 + - / 1 6 * ^ ^ . 0 5 =   @ ? 
		H	% "3 / '  (      !!	  (6EXY  ,-T/ T .Tp( , r^   