
    i$1                         d dl Z d dlmZ d dlmZ d dlmZmZmZm	Z	m
Z
 d dlZd dlmZ ddlmZ ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZ ddl m!Z! ddlm"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z( ddl)m*Z* ddl+m,Z, ddl-m.Z. ddl/m0Z0 dZ1d7de	e2   fdZ3 G d de"      Z4 G d de      Z5 G d d e      Z6e6jo                  d!d"d#ged$d$d%d&id'd(d(dd)*      d+ed,ed-e2d.e8d/e8d0e	e   fd1       Z9 ed2g d3      Z:d8d4Z;d5 Z<d9d6Z=d gZ>y):    N)
namedtuple)Path)AnyCallableDictOptionalUnion)Model   )util)Errors)BaseDefaultsLanguage)Morphologizer)DEFAULT_MORPH_MODEL)Scorer)POS)DocMorphAnalysis)validate_examples)DummyTokenizerload_config_from_strregistry)Vocab   )
STOP_WORDS)SYNTAX_ITERATORS)TAG_BIGRAM_MAP)TAG_MAP)TAG_ORTH_MAPzU
[nlp]

[nlp.tokenizer]
@tokenizers = "spacy.ja.JapaneseTokenizer"
split_mode = null

split_modec                       fd}|S )Nc                 2    t        | j                        S )Nr!   )JapaneseTokenizervocab)nlpr!   s    g/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/spacy/lang/ja/__init__.pyjapanese_tokenizer_factoryz4create_tokenizer.<locals>.japanese_tokenizer_factory$   s     zBB     )r!   r)   s   ` r(   create_tokenizerr,   #   s    C &%r*   c                       e Zd Zddedee   ddfdZd ZdedefdZ	dd	e
fd
Zd Zd Zdeeef   fdZi fdeeef   ddfdZdefdZdedd fdZdeeef   ddfdZdeeef   dd fdZy)r%   Nr&   r!   returnc                 v    || _         || _        t        | j                        | _        |d u xs |dk(   | _        y )NA)r&   r!   try_sudachi_import	tokenizerneed_subtokens)selfr&   r!   s      r(   __init__zJapaneseTokenizer.__init__+   s:    
$+DOO<#-#5#Js9JKr*   c                 >    t         | j                  | j                  ffS N)r%   r&   r!   r4   s    r(   
__reduce__zJapaneseTokenizer.__reduce__2   s     4::t"???r*   textc           	      r   | j                   j                  |      }| j                  |      }t        ||      \  }}|rt	        | ng gdz  \  }}}}}	}
}t        |      }t        | j                  ||      }d }t        t	        ||            D ]
  \  }\  }}|j                  |_
        |r
||_        d }nBt        |j                  |j                  |dz   t        |      k  r||dz      nd       \  |_        }|j                  r|j                  n|j                   |_        i }|j$                  r|j$                  |d<   |j&                  |_        |j*                  r$t-        j.                  dd|j*                        |d<   t1        | j                  |      |_         | j4                  r||j6                  d<   |S )	N   )wordsspacesr   
Inflectionz[=|]_Reading
sub_tokens)r2   tokenize_get_dtokensget_dtokens_and_spacesziplistr   r&   	enumeratetagtag_posresolve_posorth_lenlemmasurfacelemma_infnormnorm_readingresubr   morphr3   	user_data)r4   r:   sudachipy_tokensdtokensr>   r=   tagsinflectionslemmasnormsreadingssub_tokens_listdocnext_posidxtokendtokenrX   s                     r(   __call__zJapaneseTokenizer.__call__5   s   >>2248##$450$? %CM2$( 	Kt[&%? /$**E&9$-c#w.?$@ 	; C%EJ$	&1KKJJ%(1Ws4y%8DqMd'#	8 ,2<<6<<V^^ELEzz&,jjl# ++EK~~ $&66&#v~~#Fi '

E:EK-	;. *9CMM,'
r*   need_sub_tokensc                 >   |r| j                  |      nd }t        |      D cg c]  \  }}t        |j                               dkD  rt	        |j                         dj                  |j                         d d D cg c]
  }|dk7  s	| c}      dj                  |j                         dd  D cg c]
  }|dk7  s	| c}      |j                         |j                         |j                         |r||   nd        }}}}t        |      D cg c]j  \  }}|dk(  s^|j                  j                         rD|j                  dk7  s5||dz
     j                  j                         r||dz
     j                  dk7  r|l c}}S c c}w c c}w c c}}}w c c}}w )Nr   -   *;   空白r   )_get_sub_tokensrH   rN   rP   DetailedTokenjoinpart_of_speechdictionary_formnormalized_formreading_formisspacerI   )	r4   rZ   rh   ra   rd   re   xxr[   ts	            r(   rD   zJapaneseTokenizer._get_dtokens]   s   6ED  !124 	 ((89
 
 U5==?#a' u';';'=bq'AOR3Y"OPu';';'=ab'AOR3Y"OP%%'%%'""$,;OC(

 
( $G,
Qax99$$&uu 37#++335sQw##x/ 
 	
 PO	
$
s8   AF=
FF&F2
F=FAFA/F
Fc                 v   | j                   sy g }|D ]#  }|j                  | j                  j                  j                        }t        |      dk(  r|j                  d        S| j                  dk(  r#|j                  | j                  |d      g       |j                  | j                  j                  j                        }t        |      t        |      k(  r&| j                  |d      }|j                  ||g       |j                  | j                  |d      | j                  |d      g       & |S )Nr   BF)
r3   splitr2   	SplitModer0   rN   appendr!   rD   rz   )r4   rZ   ra   re   sub_asub_br[   s          r(   ro   z!JapaneseTokenizer._get_sub_tokens}   s   ""% 	EKK 8 8 : :;E5zQ&&t,C'&&(9(9%(G'HIDNN$<$<$>$>?u:U+"//u=G#**GW+=>#** --eU; --eU;	$ r*   c                 D    t        |d       t        j                  |      S )NzJapaneseTokenizer.score)r   r   score_tokenization)r4   exampless     r(   scorezJapaneseTokenizer.score   s    ($=>((22r*   c                     d| j                   iS Nr!   r$   r8   s    r(   _get_configzJapaneseTokenizer._get_config   s    doo..r*   configc                 2    |j                  dd       | _        y r   )getr!   )r4   r   s     r(   _set_configzJapaneseTokenizer._set_config   s     **\48r*   c                 >     d fdi}t        j                  |g       S )Ncfgc                  J    t        j                   j                               S r7   )srsly
json_dumpsr   r8   s   r(   <lambda>z,JapaneseTokenizer.to_bytes.<locals>.<lambda>   s    e&6&6t7G7G7I&J r*   )r   to_bytes)r4   kwargsserializerss   `  r(   r   zJapaneseTokenizer.to_bytes   s    JK}}["--r*   datac                 x     d fdi}t        j                  ||g        t         j                         _         S )Nr   c                 L    j                  t        j                  |             S r7   )r   r   
json_loads)br4   s    r(   r   z.JapaneseTokenizer.from_bytes.<locals>.<lambda>   s    $*:*:5;K;KA;N*O r*   )r   
from_bytesr1   r!   r2   )r4   r   r   deserializerss   `   r(   r   zJapaneseTokenizer.from_bytes   s3     OPmR0+DOO<r*   pathc                 l     t        j                  |      }d fdi}t        j                  ||g        y )Nr   c                 L    t        j                  | j                               S r7   )r   
write_jsonr   pr4   s    r(   r   z+JapaneseTokenizer.to_disk.<locals>.<lambda>   s    (8(8D<L<L<N(O r*   )r   ensure_pathto_diskr4   r   r   r   s   `   r(   r   zJapaneseTokenizer.to_disk   s.    %OPT;+r*   c                      t        j                  |      }d fdi}t        j                  ||g        t         j                         _         S )Nr   c                 L    j                  t        j                  |             S r7   )r   r   	read_jsonr   s    r(   r   z-JapaneseTokenizer.from_disk.<locals>.<lambda>   s    (8(89K(L r*   )r   r   	from_diskr1   r!   r2   r   s   `   r(   r   zJapaneseTokenizer.from_disk   sC    %LMt["-+DOO<r*   r7   )T)__name__
__module____qualname__r   r   strr5   r9   r   rg   boolrD   ro   r   r   r   r   r   bytesr   r   r	   r   r   r   r+   r*   r(   r%   r%   *   s    Le L# L$ L@&S &S &P
d 
@43/T#s(^ / 46 9$sCx. 9$ 9.E .u 3F ,E#t), ,4 ,
eCI. =P r*   r%   c                   0    e Zd Z ee      ZeZeZ	ddddZ
y)JapaneseDefaultsltrF)	directionhas_casehas_lettersN)r   r   r   r   DEFAULT_CONFIGr   r   
stop_wordsr   syntax_iteratorswriting_systemr+   r*   r(   r   r      s#    !.1FJ'#(eERNr*   r   c                       e Zd ZdZeZy)JapanesejaN)r   r   r   langr   Defaultsr+   r*   r(   r   r      s    DHr*   r   morphologizerztoken.morphz	token.posTz@scorerszspacy.morphologizer_scorer.v1)model	overwriteextendscorerg      ?)pos_accmorph_micro_fmorph_per_feat)assignsdefault_configdefault_score_weightsr'   r   namer   r   r   c                 8    t        | j                  |||||      S )N)r   r   r   )r   r&   )r'   r   r   r   r   r   s         r(   make_morphologizerr      s"    . 		5$)F6 r*   rp   )rP   rI   rR   rO   rS   rU   rB   c                    	 ddl m}m} |j                  j                  j
                  |j                  j                  j
                  |j                  j                  j                  |j                  j                  j                  d|    } |j                         j                  |       }|S # t        $ r t        d      dw xY w)zSudachiPy is required for Japanese support, so check for it.
    It it's not available blow up and explain how to fix it.
    split_mode should be one of these values: "A", "B", "C", None->"A".r   )
dictionaryr2   )Nr0   rz   C)modezJapanese support requires SudachiPy and SudachiDict-core (https://github.com/WorksApplications/SudachiPy). Install with `pip install sudachipy sudachidict_core` or install spaCy with `pip install spacy[ja]`.N)	sudachipyr   r2   	Tokenizerr|   r0   rz   r   
DictionarycreateImportError)r!   r   r2   toks       r(   r1   r1      s    3 %%//11$$..00$$..00$$..00	

 
 ##%,,*,=
 :

 	s   B+B. .Cc                     |t         v rt         |   }| |v r||    dfS |r0||f}|t        v r$t        |   \  }}|t        |   t           |fS ||fS t        |   t           dfS )a2  If necessary, add a field to the POS tag for UD mapping.
    Under Universal Dependencies, sometimes the same Unidic POS tag can
    be mapped differently depending on the literal token or its context
    in the sentence. This function returns resolved POSs for both token
    and next_token by tuple.
    N)r    r   r   r   )orthrI   next_tagorth_map
tag_bigramcurrent_posrc   s          r(   rL   rL      s     l$8D>4'' (]
'$2:$>!K"CL% 
 #H,, 3<d""r*   c                    | D cg c]  }|j                    }}dj                  dj                  |      j                               dj                  |j                               k7  r*t        t        j
                  j                  ||            g }g }d}t        |      dk(  r||fS t        |D cg c]  }|j                         r| c}      dk(  r+|j                         sJ t        ||d||d d       g}dg}||fS t        t        ||             D ]  \  }	\  }}
|j                         r	 ||d  j                  |      }|dkD  r>||||z    }|j                  t        ||d||d d              |j                  d       ||z  }|j                  |
       |j                  d       |t        |      z  }|	dz   t        |       k  s| |	dz      j                   dk(  sd|d<   |dz  } |t        |      k  r6||d  }|j                  t        ||d||d d              |j                  d       ||fS c c}w c c}w # t        $ r, t        t        j
                  j                  ||            d w xY w)	N )r:   r=   r   Fr    T)rP   rq   r{   
ValueErrorr   E194formatrN   rv   rp   rH   rF   indexr}   )r[   r:   gap_tagxr=   text_dtokenstext_spacestext_poswordirf   
word_startws                r(   rE   rE      sy    '(1QYY(E(	wwrwwu~##%&"''$**,*??++U+CDDLKH
5zQ[((	u;tDLLNd;	<	A||~~%dGRtT4PQg[(( 's5'':; >D&<<>	Shi..t4J
 >X: 56Aa"aD$ OPu%
"H 	F#5!CIq53w<GAEN$:$:c$A"KOMH/4 #d)OM!Wb!QdKL5!$$a ) <  	SV[[//T/GHdR	Ss   H53H:	H:2H??5I4r7   )r0   )rn   )?rV   collectionsr   pathlibr   typingr   r   r   r   r	   r   	thinc.apir
   r   r   errorsr   languager   r   pipeliner   pipeline.morphologizerr   r   r   symbolsr   tokensr   r   trainingr   r   r   r   r&   r   r   r   r   r   tag_bigram_mapr   tag_mapr   tag_orth_mapr    r   r   r,   r%   r   r   factoryr   r   rp   r1   rL   rE   __all__r+   r*   r(   <module>r      sL   	 "  7 7     . % 9   ( ) B B  " . *  &&# &K K\S| S x  
 
K($>?	   
	

 
 	

 
 X

 X
0#D2%j ,r*   