
    i                         d dl Z d dlZd dlmZ d dlmZmZmZ d dlZddl	m
Z
 ddlmZmZ ddlmZ ddl
mZmZmZ dd	lmZ d
dlmZ d
dlmZ dZddefdZ G d de      Z G d de      Z G d de      ZdgZy)    N)Path)AnyDictUnion   )util)BaseDefaultsLanguage)Doc)DummyTokenizerload_config_from_strregistry)Vocab   )	LEX_ATTRS)
STOP_WORDSzU
[nlp]

[nlp.tokenizer]
@tokenizers = "spacy.vi.VietnameseTokenizer"
use_pyvi = true
use_pyvic                       fd}|S )Nc                 2    t        | j                        S )Nr   )VietnameseTokenizervocab)nlpr   s    g/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/spacy/lang/vi/__init__.pyvietnamese_tokenizer_factoryzAcreate_vietnamese_tokenizer.<locals>.vietnamese_tokenizer_factory   s    "399x@@     )r   r   s   ` r   create_vietnamese_tokenizerr      s    A ('r   c                       e Zd ZddedefdZd ZdedefdZ	d Z
d	 Zdeeef   fd
Zi fdeeef   ddfdZdefdZdedd fdZdeeef   ddfdZdeeef   dd fdZy)r   r   r   c                     || _         || _        | j                  r	 ddlm} || _        y y # t        $ r d}t	        |      d w xY w)Nr   )ViTokenizerz`Pyvi not installed. Either set use_pyvi = False, or install it https://pypi.python.org/pypi/pyvi)r   r   pyvir!   ImportError)selfr   r   r!   msgs        r   __init__zVietnameseTokenizer.__init__!   sW    
 ==	1,#. 	 
  1F  "#&D01s	   + Ac                 >    t         | j                  | j                  ffS N)r   r   r   r$   s    r   
__reduce__zVietnameseTokenizer.__reduce__0   s    "TZZ$???r   textreturnc                    | j                   rB| j                  |      }t        j                  ||      \  }}t	        | j
                  ||      S t        j                  |j                         |      \  }}t	        | j
                  ||      S )N)wordsspaces)r   pyvi_tokenizer   get_words_and_spacesr   r   split)r$   r+   r.   r/   s       r   __call__zVietnameseTokenizer.__call__3   sq    ==&&t,E 55eTBME6tzzv>> 55djjlDIME6tzzv>>r   c                 d   g d}d}d}d}d}d}g d}g }	|	j                  |       |	j                  |       |	j                  ||g       |	j                  |||g       dd	j                  |	      z   d
z   }	t        j                  |	|t        j                        }
|
D cg c]  }|d   	 c}S c c}w )zQModified from pyvi to preserve whitespace and skip unicode
        normalization.)z==>z->z\.\.\.z>>z\d+([\.,_]\d+)+z2([a-zA-Z0-9_.+-]+@([a-zA-Z0-9-]+\.)+[a-zA-Z0-9-]+)z\w+://[^\s]+z\w+z[^\w\s])u
   [A-ZĐ]+\.zTp\.zMr\.zMrs\.zMs\.zDr\.zThS\.z(\s+||)r   )extendjoinrefindallUNICODE)r$   r+   specialsdigitemailwebwordnon_wordabbreviationspatternstokenstokens               r   pyvi_sylabelize_with_wsz+VietnameseTokenizer.pyvi_sylabelize_with_ws?   s     5"E
 &!e%$/0chhx0036HdBJJ7&,-Ua---s   B-c                    t        |      dk(  rg S |j                         r|gS | j                  |      }g }g }t        |      D ][  \  }}|j                         r|j	                  |       |j	                  |dk(  s||dz
     j                         sdn||dz
            ] | j
                  j
                  j                  j                  | j
                  j
                  j                  |d      g      }|d   }g }t        dt        |d               D ]  }|d   |   dk(  r||   t        j                  vr||dz
     t        j                  vrm||   d   j                         sW||dz
     d   j                         s>||   d   j                         r||dz
     d   j                         r|||   z   ||   z   }|j	                  |       ||   } |j	                  |       |S )z3Modified from pyvi to preserve text and whitespace.r   r    FI_W)lenisspacerF   	enumerateappendr!   modelpredictsent2featuresrangestringpunctuationisdigitistitle)	r$   r+   segsr.   preceding_wsirE   labelsrD   s	            r   r0   z!VietnameseTokenizer.pyvi_tokenize]   s   t9>I\\^6M++D1!$ 	HAu==?U###6a!e)<)<)>BT!a%[	 !!--33;;))77uEF
 aq#fQi.) 	!Aq	!%!HF$6$66!a%L(:(::a++-a!eQ//1q!,,.uQU|A7N7N7PQ/%(:e$a	! 	er   c                     d| j                   iS )Nr   r   r)   s    r   _get_configzVietnameseTokenizer._get_config   s    DMM**r   configNc                 2    |j                  dd      | _        y )Nr   F)getr   )r$   r\   s     r   _set_configzVietnameseTokenizer._set_config   s    

:u5r   c                 >     d fdi}t        j                  |g       S )Ncfgc                  J    t        j                   j                               S r(   )srsly
json_dumpsr[   r)   s   r   <lambda>z.VietnameseTokenizer.to_bytes.<locals>.<lambda>   s    e&6&6t7G7G7I&J r   )r   to_bytes)r$   kwargsserializerss   `  r   rf   zVietnameseTokenizer.to_bytes   s    JK}}["--r   datac                 D     d fdi}t        j                  ||g         S )Nra   c                 L    j                  t        j                  |             S r(   )r_   rc   
json_loads)br$   s    r   re   z0VietnameseTokenizer.from_bytes.<locals>.<lambda>   s    $*:*:5;K;KA;N*O r   )r   
from_bytes)r$   ri   rg   deserializerss   `   r   rn   zVietnameseTokenizer.from_bytes   s#     OPmR0r   pathc                 l     t        j                  |      }d fdi}t        j                  ||g        y )Nra   c                 L    t        j                  | j                               S r(   )rc   
write_jsonr[   pr$   s    r   re   z-VietnameseTokenizer.to_disk.<locals>.<lambda>   s    (8(8D<L<L<N(O r   )r   ensure_pathto_diskr$   rp   rg   rh   s   `   r   rw   zVietnameseTokenizer.to_disk   s.    %OPT;+r   c                 n     t        j                  |      }d fdi}t        j                  ||g         S )Nra   c                 L    j                  t        j                  |             S r(   )r_   rc   	read_jsonrt   s    r   re   z/VietnameseTokenizer.from_disk.<locals>.<lambda>   s    (8(89K(L r   )r   rv   	from_diskrx   s   `   r   r|   zVietnameseTokenizer.from_disk   s3    %LMt["-r   )F)__name__
__module____qualname__r   boolr&   r*   strr   r3   rF   r0   r   r   r[   r_   bytesrf   rn   r   r   rw   r|   r   r   r   r   r       s    1e 1t 1@?S ?S ?.<"H+T#s(^ + 46 6$sCx. 6$ 6.E .u 3H 
,E#t), ,4 ,
eCI. =R r   r   c                   $    e Zd Z ee      ZeZeZ	y)VietnameseDefaultsN)
r}   r~   r   r   DEFAULT_CONFIGr\   r   lex_attr_gettersr   
stop_wordsr   r   r   r   r      s    !.1F Jr   r   c                       e Zd ZdZeZy)
VietnameseviN)r}   r~   r   langr   Defaultsr   r   r   r   r      s    D!Hr   r   )T) r9   rR   pathlibr   typingr   r   r   rc   rH   r   languager	   r
   rD   r   r   r   r   r   r   	lex_attrsr   r   r   r   r   r   r   r   r   __all__r   r   r   <module>r      sr    	   # #   .  B B    "($ (y. yx " "
 .r   