
    i                        d dl mZmZmZ ddlmZmZ ddlmZ ddl	m
Z
mZ ddlmZ ddlmZ ddlmZmZmZ dd	lmZ d
dlmZ d
dlmZ d
dlmZ d
dlmZ dZd Z G d de      Z  G d de      Z! G d de      Z"ddZ#d Z$dgZ%y)    )AnyDictIterator   )BaseDefaultsLanguage)Scorer)POSX)Doc)validate_examples)DummyTokenizerload_config_from_strregistry)Vocab   )	LEX_ATTRS)TOKENIZER_INFIXES)
STOP_WORDS)TAG_MAPzA
[nlp]

[nlp.tokenizer]
@tokenizers = "spacy.ko.KoreanTokenizer"
c                      d } | S )Nc                 ,    t        | j                        S NKoreanTokenizervocab)nlps    g/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/spacy/lang/ko/__init__.pykorean_tokenizer_factoryz2create_tokenizer.<locals>.korean_tokenizer_factory   s    syy))     )r   s    r   create_tokenizerr"      s    * $#r    c                   d    e Zd ZdefdZed        Zd Zdede	fdZ
dedeeeef      fdZd	 Zy
)r   r   c                 >    || _         t               | _        d | _        y r   )r   try_mecab_import_mecab_mecab_tokenizer)selfr   s     r   __init__zKoreanTokenizer.__init__   s    
&( $r    c                 ^    | j                   | j                  d      | _         | j                   S )Nz-F%f[0],%f[7])r'   r&   r(   s    r   mecab_tokenizerzKoreanTokenizer.mecab_tokenizer$   s-       ($(KK$@D!$$$r    c                 (    t         | j                  ffS r   r   r+   s    r   
__reduce__zKoreanTokenizer.__reduce__0   s    --r    textreturnc           
         t        | j                  |            }|D cg c]  }|d   	 }}t        | j                  |t        t	        ||                  }t        ||      D ]k  \  }}|d   j                  d      \  }}	}
||_        |j                  t        v r t        |j                     t           |_
        nt        |_
        |d   |_        m |D cg c]  }|d   	 c}|j                  d<   |S c c}w c c}w )Nsurface)wordsspacestag+lemma	full_tags)listdetailed_tokensr   r   check_spaceszip	partitiontag_r   r
   posr   lemma_	user_data)r(   r/   dtokensdtsurfacesdoctokendtoken	first_tagsep	eomi_tagss              r   __call__zKoreanTokenizer.__call__3   s    t++D12,34bByM44$**HT,tX:V5WX g. 	+ME6(.u(?(?(D%IsI"EJzzW$#EJJ/4		!'?EL	+ ;B%BBbi%Bk"
 5 &Cs   C4C9c              #     K   | j                   j                  |d      D ]e  }|j                         r y |j                  }|j                  }|j                  d      \  }}}|j                  d      \  }}}	|dk(  r|}|||d g y w)NT)as_nodes,/*)r2   r7   r5   )r,   parseis_eosr2   featurer=   )
r(   r/   noder2   rS   r5   _exprr7   	remainders
             r   r:   zKoreanTokenizer.detailed_tokensB   s      ((..td.C 		CD{{}llGllG",,S1LCD"&.."5E1i|%cBB		Cs   BB	c                 D    t        |d       t        j                  |      S )NzKoreanTokenizer.score)r   r	   score_tokenization)r(   exampless     r   scorezKoreanTokenizer.scoreP   s    ($;<((22r    N)__name__
__module____qualname__r   r)   propertyr,   r.   strr   rK   r   r   r   r:   r[   r!   r    r   r   r      sb    %e %
 	% 	%.S S CC CHT#s(^,D C3r    r   c                   4    e Zd Z ee      ZeZeZ	ddddZ
eZy)KoreanDefaultsltrF)	directionhas_casehas_lettersN)r\   r]   r^   r   DEFAULT_CONFIGconfigr   lex_attr_gettersr   
stop_wordswriting_systemr   infixesr!   r    r   rb   rb   U   s(    !.1F J#(eERNGr    rb   c                       e Zd ZdZeZy)KoreankoN)r\   r]   r^   langrb   Defaultsr!   r    r   rn   rn   ]   s    DHr    rn   Nc                  F    	 ddl m}  | S # t        $ r t        d      d w xY w)Nr   MeCabzThe Korean tokenizer ("spacy.ko.KoreanTokenizer") requires [mecab-ko](https://bitbucket.org/eunjeon/mecab-ko/src/master/README.md), [mecab-ko-dic](https://bitbucket.org/eunjeon/mecab-ko-dic), and [natto-py](https://github.com/buruzaemon/natto-py))nattort   ImportErrorrs   s    r   r%   r%   b   s6    
 E

 	s   
  c              #      K   d}d}|D ]0  }| j                  ||      }|dkD  r||k7   |t        |      z   }|}2 |dkD  rd y y w)Nr   F)findlen)r/   tokensprev_endstartrF   idxs         r   r;   r;   p   sf     HE iiu%a<c/!U# qy s   AA)r0   N)&typingr   r   r   languager   r   scorerr	   symbolsr
   r   r{   r   trainingr   utilr   r   r   r   r   	lex_attrsr   punctuationr   rj   r   tag_mapr   rg   r"   r   rb   rn   r%   r;   __all__r!   r    r   <module>r      ss    & & .    ) B B    * " $43n 43n \  X 

 *r    