
    iE                        d dl Z d dlZd dlZd dlmZmZ d dlmZ d dlm	Z	 d dl
mZ d dlmZmZmZmZmZ ddlmZmZ d	 Zej,                  j/                  d
      d        Zej,                  j/                  d      d        Zej,                  j/                  d      d        Zd Zej,                  j9                  dg d      d        Zd Zy)    N)ENT_IOBENT_TYPE)English)	Tokenizer)Doc)compile_infix_regexcompile_prefix_regexcompile_suffix_regexget_lang_class
load_model   )assert_packed_msg_equalmake_tempdirc                 \     t        d             j                  }|j                  |        |S )Nen)r   	tokenizer
from_bytes)btoks     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/spacy/tests/serialize/test_serialize_tokenizer.pyload_tokenizerr      s(    
.

 
*
*CNN1J    i  c                 :   t        | ddg      }t        j                  t              5  t	        j
                  |d          ddd       t        j                  t              5  t	        j
                  |dd        ddd       y# 1 sw Y   DxY w# 1 sw Y   yxY w)zATest that a custom error is raised if a token or span is pickled.Helloworld)wordsr   Nr   )r   pytestraisesNotImplementedErrorpickledumps)en_vocabdocs     r   test_issue2833r$      s     hw0
1C	*	+ SV	*	+ S1X   s   B#BBBi  c                    g d}g d}g d}g d}t        | ||||      }|j                  d      sJ d}|d   j                  |d   j                  |d   j                  |d   j
                  f|k(  sJ t        t        g}|j                  |      }|j                  ||       |d   j                  |d   j                  |d   j                  |d   j
                  f|k(  sJ |j                         }	t        |       j                  |	      }
|
d   j                  |
d   j                  |
d   j                  |
d   j
                  f|k(  sJ y	)
ziTest that the is_tagged attribute doesn't get overwritten when we from_array
    without tag information.)Thisis10%.)DTVBZCDNNr*   )DETVERBNUMNOUNPUNCT)Or4   z	B-PERCENTz	I-PERCENTr4   )r   tagsposentsTAG)r(   r1   r-   PERCENTr   N)r   has_annotationtextpos_tag_	ent_type_r   r   to_array
from_arrayto_bytesr   )r"   r   r5   r6   r7   r#   expectedheader	ent_array	doc_bytesdoc2s              r   test_issue3012rG   %   s3    +E)D
1C4D
he$Cd
CCe$$$-HFKKQc!fkk3q63C3CDPPPx FV$INN69%FKKQc!fkk3q63C3CDPPPIx=##I.DGLL$q',,Qd1g6G6GHHTTTr   i^  c                     d } d}t               } ||      }|D cg c]  }|j                   }} | |        ||      }|D cg c]  }|j                   }}t               5 }|j                  |       t	        |      }	d d d         	|      }
|
D cg c]  }|j                   }}||k(  sJ |	j
                  j                  du sJ y c c}w c c}w # 1 sw Y   VxY wc c}w )Nc           	          t        | j                  j                        }t        | j                  j                        }t        | j                  j                        }t        | j                  j                        j                         D ci c]  \  }}t        |      dk(  r|d   dk(  s||  }}}t        | j                  ||j                  |j                  |j                  | j                  j                   d      }|| _        y c c}}w )Nr      r*   F)prefix_searchsuffix_searchinfix_finditertoken_matchfaster_heuristics)r	   Defaultsprefixesr
   suffixesr   infixesdicttokenizer_exceptionsitemslenr   vocabsearchfinditerr   rN   )nlp	prefix_re	suffix_reinfix_rekv
exceptionsnew_tokenizers           r   customize_tokenizerz+test_issue4190.<locals>.customize_tokenizer=   s    ()>)>?	()>)>?	&s||';';< S\\>>?EEG
1FaKAaDCK qD

 

 "II#**#**#,,11#
 &
s   #D
zTest c.F)r   r;   r   to_diskr   r   rO   )rc   test_stringnlp_1doc_1atoken	result_1adoc_1b	result_1b	model_dirnlp_2doc_2result_2s               r   test_issue4190rp   ;   s    &* KIE;F)/00I0;F)/00I0	 &9i 9%& +E(-.u

.H.   ??,,555 1 1& &
 /s   CC$CC"Cc                    t        | |j                        }|j                         }t        |       j                  |        t	        d             j
                  }t        j                  d      j                  |_	        |j                  i k7  sJ |j                  J |j                  J |j                  J |j                  J |j                  |       |j                  i k(  sJ |j                  J |j                  J |j                  J |j                  J t        | dddiddigi	      }i |_
        |j                         }t        |       j                  |      }|j                  i k(  sJ y)
zTest that custom tokenizer with not all functions defined or empty
    properties can be serialized and deserialized correctly (see #2494,
    #4991).)rL   r   testNzABC.ORTHABCr*   )rules)r   rL   rA   r   r   r   recompilematchrN   ru   	url_matchrK   rM   )r"   en_tokenizerr   tokenizer_bytestokenizer_reloadeds        r   test_serialize_custom_tokenizerr}   f   s    (,2L2LMI((*Oh""?3 %t$&00IJJv.44I??b     ,,,***""...##///)??b     (((&&&""***##+++(6VUOfc]3S*TUIIO((*O"8,77H##r)))r   r;   )u   I💜youu	   they’reu   “hello”c                 r   | }t        |j                               }t        |j                         |j                                |j                         |j                         k(  sJ  ||      } ||      }|D cg c]  }|j                   c}|D cg c]  }|j                   c}k(  sJ y c c}w c c}w )N)r   rA   r   r;   )rz   r;   r   rb   doc1rF   rh   s          r   (test_serialize_tokenizer_roundtrip_bytesr      s    I"9#5#5#78MM224i6H6H6JK!!#y'9'9';;;;T?DD$()5EJJ)d-KUejj-KKKK)-Ks   ;B/B4c                     | }t               5 }|dz  }|j                  |       | j                  |      }|j                         |j                         k(  sJ 	 d d d        y # 1 sw Y   y xY w)Nr   )r   rd   	from_diskrA   )rz   r   d	file_pathtokenizer_ds        r   'test_serialize_tokenizer_roundtrip_diskr      sj    I	 >1O	)$",,Y7!!#{';';'====	> > >s   AA""A+)r    rv   r   spacy.attrsr   r   spacy.lang.enr   spacy.tokenizerr   spacy.tokensr   
spacy.utilr   r	   r
   r   r   utilr   r   r   markissuer$   rG   rp   r}   parametrizer   r    r   r   <module>r      s     	  ) ! %   9 4  4U U* 4'6 '6T*< !IJL KL>r   