
    i1                        d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZmZ d dlZddlmZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z% dZ&dZ' G d de(e      Z)e)jT                  fde)fdZ+ G d de      Z, G d de      Z- G d de      Z.d Z/dee(   dee(   fd Z0d"d!Z1dgZ2y)#    N)Enum)Path)AnyCallableDictIterableListOptional   )util)ErrorsWarnings)BaseDefaultsLanguage)Scorer)Doc)Examplevalidate_examples)DummyTokenizerload_config_from_strregistry)Vocab   )	LEX_ATTRS)
STOP_WORDSzinstall spacy-pkuseg with `pip install "spacy-pkuseg>=0.0.27,<0.1.0"` or `conda install -c conda-forge "spacy-pkuseg>=0.0.27,<0.1.0"`z
[nlp]

[nlp.tokenizer]
@tokenizers = "spacy.zh.ChineseTokenizer"
segmenter = "char"

[initialize]

[initialize.tokenizer]
pkuseg_model = null
pkuseg_user_dict = "default"
c                   (    e Zd ZdZdZdZed        Zy)	Segmentercharjiebapkusegc                 H    t        | j                  j                               S N)list__members__keys)clss    g/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/spacy/lang/zh/__init__.pyvalueszSegmenter.values,   s    COO((*++    N)__name__
__module____qualname__r   r   r    classmethodr(    r)   r'   r   r   '   s#    DEF, ,r)   r   	segmenterc                       fd}|S )Nc                 2    t        | j                        S )Nr/   )ChineseTokenizervocab)nlpr/   s    r'   chinese_tokenizer_factoryz;create_chinese_tokenizer.<locals>.chinese_tokenizer_factory2   s    		Y??r)   r.   )r/   r6   s   ` r'   create_chinese_tokenizerr7   1   s    @ %$r)   c                       e Zd Zej                  fdedefdZ	 ddddddeeg e	e
   f      dee   d	ee   d
ee   fdZdedefdZddee   defdZd Zdeeef   fdZi fdeeef   ddfdZd Zd Zd Zd Zy)r3   r4   r/   c                    || _         t        |t              r|j                  n|| _        d | _        d | _        | j                  t        j                         vrxt        j                  j                  d| j                  dj                  t        j                               d      }t        j                  |       t        j                  | _        | j                  t        j                  k(  rt!               | _        y y )NChinese, 'char' (character segmentation)langr/   	supporteddefault)r4   
isinstancer   valuer/   
pkuseg_seg	jieba_segr(   r   W103formatjoinwarningswarnr   r   try_jieba_import)selfr4   r/   warn_msgs       r'   __init__zChineseTokenizer.__init__9   s    
))Y?IOOY 	 >>!1!1!33}}++..))I$4$4$679	 , H MM(#&^^DN>>Y__,-/DN -r)   Nr@   )r5   pkuseg_modelpkuseg_user_dictget_examplesr5   rN   rO   c                l    | j                   t        j                  k(  r||}t        ||      | _        y y )N)rN   rO   )r/   r   r    try_pkuseg_importrC   )rK   rP   r5   rN   rO   s        r'   
initializezChineseTokenizer.initializeL   s9     >>Y---'#/ /)<LDO .r)   textreturnc                    | j                   t        j                  k(  rht        | j                  j                  |d      D cg c]  }|s|	 c}      }t        j                  ||      \  }}t        | j                  ||      S | j                   t        j                  k(  rq| j                  t        t        j                        | j                  j                  |      }t        j                  ||      \  }}t        | j                  ||      S | j                   t        j                  k7  rct         j"                  j%                  d| j                   dj'                  t        j)                               d      }t+        j,                  |       t        |      }t        j                  ||      \  }}t        | j                  ||      S c c}w )NFcut_all)wordsspacesr:   r;   r<   r=   )r/   r   r   r#   rD   cutr   get_words_and_spacesr   r4   r    rC   
ValueErrorr   E1000r   r   rE   rF   rG   r(   rH   rI   )rK   rT   xrY   rZ   rL   s         r'   __call__zChineseTokenizer.__call__[   sd   >>Y__,T^^%7%7e%7%LRPQ!RSE 55eTBME6tzzv>>^^y///& ..OO''-E 55eTBME6tzzv>> >>Y^^+}}++..))I$4$4$679	 , H MM(# T
11%>v4::U6::/ Ss   G
GrY   resetc                    | j                   t        j                  k(  rc|r%	 dd l}|j	                  d       | j
                  _        |D ]6  }| j
                  j                  j                  |j                         d       8 y t        j                  j                  d| j                         }t        j                  |       y # t        $ r dt        z   }t        |      d w xY w)Nr   zEspacy_pkuseg not installed: unable to reset pkuseg user dict. Please  r    )targetcurrent)r/   r   r    spacy_pkusegPreprocesserrC   preprocesserImportError_PKUSEG_INSTALL_MSGinsertstripr   W104rF   rH   rI   )rK   rY   ra   rf   msgwordrL   s          r'   pkuseg_update_user_dictz(ChineseTokenizer.pkuseg_update_user_dictv   s    >>Y---	5'3?3L3LT3RDOO0  F,,33DJJL"EF  }}++8T^^+THMM(# # 5-/BC  &c*45s   $C C!c                 D    t        |d       t        j                  |      S )NzChineseTokenizer.score)r   r   score_tokenization)rK   exampless     r'   scorezChineseTokenizer.score   s    ($<=((22r)   c                     d| j                   iS Nr/   r2   rK   s    r'   _get_configzChineseTokenizer._get_config   s    
 	
r)   configc                 N    |j                  dt        j                        | _        y rv   )getr   r   r/   )rK   ry   s     r'   _set_configzChineseTokenizer._set_config   s    K@r)   c                     ddd  j                   rmt        j                         5 } j                   j                  j	                  |        j                   j
                  j	                  |       t        |      }t        |dz  d      5 }|j                         d d d        t        |dz  d      5 }|j                         d d d        d d d        t         j                   j                  j                         j                   j                  j                  t        t         j                   j                  j                               t        t         j                   j                  j"                              f fdfdfdfdd	}t%        j&                  |g       S # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w)
Nr)   features.msgpackrbweights.npzc                  J    t        j                   j                               S r"   )srsly
json_dumpsrx   rw   s   r'   <lambda>z+ChineseTokenizer.to_bytes.<locals>.<lambda>   s    5++D,<,<,>? r)   c                       S r"   r.   )pkuseg_features_bs   r'   r   z+ChineseTokenizer.to_bytes.<locals>.<lambda>   s    '8 r)   c                       S r"   r.   )pkuseg_weights_bs   r'   r   z+ChineseTokenizer.to_bytes.<locals>.<lambda>   s    &6 r)   c                  .    t        j                         S r"   )r   msgpack_dumps)pkuseg_processors_datas   r'   r   z+ChineseTokenizer.to_bytes.<locals>.<lambda>   s    )<)<=S)T r)   cfgpkuseg_featurespkuseg_weightspkuseg_processors)rC   tempfileTemporaryDirectoryfeature_extractorsavemodelr   openread_get_pkuseg_trie_datarh   triepostprocesser
do_processsortedr#   common_wordsother_wordsr   to_bytes)rK   kwargstempdirfilehserializersr   r   r   s   `    @@@r'   r   zChineseTokenizer.to_bytes   su   !%??,,. 4'1166w?%%**73w-'$66= 5(-

%5'M148 4E',zz|$44 &doo&B&B&G&GH--88tDOO99FFGHtDOO99EEFG	&" @86!T	
 }}["--!5 54 44 4s<   A%GF."G9F;
G.F8	3G;G	 GGc                     ddd dfd}fd}fd} fd|||d}t        j                  ||g        d   r>d	   r8t        j                         5 }t	        |      }t        |d
z  d      5 }|j                  d          d d d        t        |dz  d      5 }|j                  d	          d d d        	 dd l}	|	j                  t        |             _        d d d        d   rd   }
|
\  }}}}	j                  |       j                  _        | j                  j                  _        t#        |       j                  j                  _        t#        |       j                  j                  _         S # 1 sw Y   xY w# 1 sw Y   xY w# t        $ r t        dt        z         d w xY w# 1 sw Y   xY w)Nr)   )
features_b	weights_bprocessors_datac                     | d<   y )Nr   r.   bpkuseg_datas    r'   deserialize_pkuseg_featuresz@ChineseTokenizer.from_bytes.<locals>.deserialize_pkuseg_features   s    ()K%r)   c                     | d<   y )Nr   r.   r   s    r'   deserialize_pkuseg_weightsz?ChineseTokenizer.from_bytes.<locals>.deserialize_pkuseg_weights   s    '(K$r)   c                 6    t        j                  |       d<   y )Nr   )r   msgpack_loadsr   s    r'   deserialize_pkuseg_processorszBChineseTokenizer.from_bytes.<locals>.deserialize_pkuseg_processors   s    -2-@-@-CK)*r)   c                 L    j                  t        j                  |             S r"   )r|   r   
json_loads)r   rK   s    r'   r   z-ChineseTokenizer.from_bytes.<locals>.<lambda>   s    T--e.>.>q.AB r)   r   r   r   r~   wbr   r   /spacy-pkuseg not installed. To use this model, r   )r   
from_bytesr   r   r   r   writerf   ri   rj   r    strrC   rg   rh   r   r   setr   r   )rK   datar   r   r   r   deserializersr   r   rf   r   	user_dictr   r   r   r   s   `              @r'   r   zChineseTokenizer.from_bytes   s   %(stT	*	)	D C:8!>	
 	mR0|$[)A,,. D'w-'$66= ;KKL 9:;'M148 :EKKK 89: ' #/"5"5c'l"CD ,-"-.?"@CR@	:|[/;/H/H/S,;E--8=@=N--:<?<L--9); ;: : #  %I-.    D DsT   F74E?	F7 F5F7>FF7?F	F7F	F7F44F77G c                      t        j                  |      } fd fd fdfdfdd}t        j                  ||g       S )Nc                     j                   rm| j                         s| j                  d       j                   j                  j	                  |        j                   j
                  j	                  |        y y )NT)parents)rC   existsmkdirr   r   r   )pathrK   s    r'   save_pkuseg_modelz3ChineseTokenizer.to_disk.<locals>.save_pkuseg_model   sV    {{}JJtJ,%%**401166t<	 r)   c                    j                   rt        j                   j                  j                        j                   j                  j
                  t        t        j                   j                  j                              t        t        j                   j                  j                              f}t        j                  | |       y y r"   )rC   r   rh   r   r   r   r   r#   r   r   r   write_msgpack)r   r   rK   s     r'   save_pkuseg_processorsz8ChineseTokenizer.to_disk.<locals>.save_pkuseg_processors   s    )$//*F*F*K*KLOO11<<4 = = J JKL4 = = I IJK	 ##D$/ r)   c                 L    t        j                  | j                               S r"   )r   
write_jsonrx   prK   s    r'   r   z*ChineseTokenizer.to_disk.<locals>.<lambda>   s    U--a1A1A1CD r)   c                      |       S r"   r.   )r   r   s    r'   r   z*ChineseTokenizer.to_disk.<locals>.<lambda>       &7&: r)   c                      |       S r"   r.   )r   r   s    r'   r   z*ChineseTokenizer.to_disk.<locals>.<lambda>       +A!+D r)   r   rN   r   )r   ensure_pathto_disk)rK   r   r   r   r   r   s   `   @@r'   r   zChineseTokenizer.to_disk   sE    %	=	0 E:!D

 ||D+r22r)   c                      t        j                  |      } fd fd fdfdfdd}t        j                  ||g        y )Nc                     	 dd l }| j                         rj	                  |       _        y y # t        $ r3 j                  t        j                  k(  rt        dt
        z         d Y cw xY w)Nr   r   )rf   ri   r/   r   r    rj   r   rC   )r   rf   rK   s     r'   load_pkuseg_modelz5ChineseTokenizer.from_disk.<locals>.load_pkuseg_model   sr     # {{}"."5"5d";    >>Y%5%55%I-.    6 s   / 9A+*A+c                    	 dd l }j                  t        j                  k(  rt        j                  |       }|\  }}}}j                  |      j                  _
        |j                  j                  _        t        |      j                  j                  _        t        |      j                  j                  _        y y # t        $ r6 j                  t        j                  k(  rt        j
                        d Y w xY w)Nr   )rf   ri   r/   r   r    _pkuseg_install_msgr   read_msgpackrg   rC   rh   r   r   r   r   r   )r   rf   r   r   r   r   r   rK   s          r'   load_pkuseg_processorsz:ChineseTokenizer.from_disk.<locals>.load_pkuseg_processors  s    J# ~~!1!11))$/CG@	:|[/;/H/H/S,;E--8=@=N--:<?<L--9 2  J>>Y%5%55%d&>&>?TI 6Js   C <DDc                 L    j                  t        j                  |             S r"   )r|   r   	read_jsonr   s    r'   r   z,ChineseTokenizer.from_disk.<locals>.<lambda>  s    T--eooa.@A r)   c                      |       S r"   r.   )r   r   s    r'   r   z,ChineseTokenizer.from_disk.<locals>.<lambda>  r   r)   c                      |       S r"   r.   )r   r   s    r'   r   z,ChineseTokenizer.from_disk.<locals>.<lambda>  r   r)   r   )r   r   	from_disk)rK   r   r   r   r   r   s   `   @@r'   r   zChineseTokenizer.from_disk   sC    %
	<	M B:!D

 	t["-r)   r"   )F)r*   r+   r,   r   r   r   rM   r
   r   r   r   r   r   rS   r   r`   r	   boolrp   rt   r   r   rx   r|   r   r   r   r   r.   r)   r'   r3   r3   8   s    <ENN 0e 0	 0* CG #'&**3xHW,=(=>? h	
 sm #3-;S ;S ;6$T#Y $t $&3
T#s(^ 

 46 A$sCx. A$ A.6+Z36".r)   r3   c                   0    e Zd Z ee      ZeZeZ	ddddZ
y)ChineseDefaultsltrF)	directionhas_casehas_lettersN)r*   r+   r,   r   DEFAULT_CONFIGry   r   lex_attr_gettersr   
stop_wordswriting_systemr.   r)   r'   r   r     s#    !.1F J#(eERNr)   r   c                       e Zd ZdZeZy)r:   zhN)r*   r+   r,   r>   r   Defaultsr.   r)   r'   r:   r:   $  s    DHr)   r:   c                  ~    	 dd l } t        | j                  dd             | S # t        $ r d}t        |      d w xY w)Nr   u   作为FrW   znJieba not installed. To use jieba, install it with `pip  install jieba` or from https://github.com/fxsjy/jieba)r   r#   r[   ri   )r   rn   s     r'   rJ   rJ   )  sO    ) 	UYYxY/0 )E 	 #D()s   !$ <rN   rO   c                     	 dd l }	 |j                  | |      S # t        $ r dt        z   }t        |      d w xY w# t        $ r dt        | xs d      z   }t	        |      d w xY w)Nr   z+spacy-pkuseg not installed. To use pkuseg, )r   z"Unable to load pkuseg model from: rc   )rf   ri   rj   r    FileNotFoundErrorr   )rN   rO   rf   rn   s       r'   rR   rR   9  s    )
/""<;K"LL	  );>QQ#D()
  /2S9K5LL$$./s    < 9(A$c                     g }t        | j                  j                               D ]#  \  }}|j                  t	        |||z                % | j
                  r|j                  || j                  f       |S r"   )r   childrenitemsextendr   iswordappendusertag)noder   r   c
child_nodes        r'   r   r   G  sg    D 3 3 56 A:)*dQh?@A{{T4<<()Kr)   )rc   )3r   rH   enumr   pathlibr   typingr   r   r   r   r	   r
   r   rc   r   errorsr   r   languager   r   scorerr   tokensr   trainingr   r   r   r   r   r4   r   	lex_attrsr   r   r   rj   r   r   r   r   r7   r3   r   r:   rJ   rR   r   __all__r.   r)   r'   <module>r     s        @ @   & .   2 B B    " b ,T , 5>NN %	 %b.~ b.JSl Sh 
) /HSM /Xc] / +r)   