
    i-F                     B   d dl Z d dlZd dlZd dlmZ d dlmZ d dlmZm	Z	 d dl
mZmZ d dlmZ ddlmZ ej"                  j%                  d	      d
        Zej"                  j%                  d      d        Zej"                  j%                  d      d        Zej"                  j%                  d      d        Zej"                  j%                  d      d        Zej"                  j%                  d      d        Zej"                  j%                  d      d        Zej"                  j%                  d      d        Zd Zd Zd Zd Zd Zd Z d Z!d  Z"d! Z#d" Z$d# Z%d$ Z&d% Z'd& Z(d' Z)d( Z*d) Z+d* Z,d+ Z-ej"                  j]                  d,d-d.g      d/        Z/d0 Z0d1 Z1y)2    N)Mock)English)MatcherPhraseMatcher)DocSpan)Vocab   )make_tempdiri  c                      t               } t        | j                        }|j                  d | d       | d       | d      g       |j                  d | d      g       t	        |      dk(  sJ y)	zdTest that the PhraseMatcher correctly reports its number of rules, not
    total number of patterns.TEST1abcTEST2dr
   N)r   r   vocabaddlen)nlpmatchers     x/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/spacy/tests/matcher/test_phrase_matcher.pytest_issue3248_1r      s`     )CCII&GKK#c(CHc#h78KK#c($w<1    i  c                 `   t        |       }|j                  dt        | ddg      g       |j                  dt        | ddg      g       t        | g d      } ||      }t        |      dk(  sJ | j                  |d   d      | j                  |d	   d      g}t        |      ddgk(  sJ y
)zcTest that duplicate patterns for different rules result in multiple
    matches, one per rule.
    ABarackObamawordsB)r   r   liftsAmericar
   r      N)r   r   r   r   stringssorted)en_vocabr   docmatches	match_idss        r   test_issue3331r+      s    
 H%GKKc(8W*=>?@KKc(8W*=>?@
hE
FCclGw<1!!'!*Q-0(2B2B71:a=2QRI)c
***r   i  c                 P   t        |       }|j                  dt        | ddg      g       |j                  dt        | ddg      g       t        | g d      } ||      }t        |      dk(  sJ |D cg c]  \  }}}| j                  |    }}}d|v sJ d|v sJ yc c}}w )	zGTest that the PhraseMatcher returns duplicates for duplicate match IDs.r   NewYorkr   r!   )Iliveinr-   r.   r
   N)r   r   r   r   r%   )r'   r   r(   r)   ent_id_	found_idss          r   test_issue3972r5   )   s     H%GKKc(5&/:;<KKc(5&/:;<
h@
ACclGw<1 AHHnvq!!!&)HIH)) Is   8B"i  c                 J   t        | d      }t        | ddg      }|D cg c]  }|j                   c}ddgk(  sJ |j                  d|g       t        | g d      }|D cg c]  }|j                   c}g dk(  sJ  ||      }t	        |      dk(  sJ t        | d      }t        | d	d
g      }d|d   _        d|d   _        |D cg c]  }|j                   c}ddgk(  sJ |j                  d|g        ||      }t	        |      dk(  sJ yc c}w c c}w c c}w )zETest that the PhraseMatcher can match on overwritten NORM attributes.NORMattrr   r   r   TEST)r   r   r   r   r$   12r   N)r   r   norm_r   r   )r'   r   pattern1tr(   r)   pattern2s          r   test_issue4002rA   :   s&    H62G8C:.H%&AGG&3*444KK
#
h2
3C !AGG!%9999clGw<1H62G8C:.HHQKHQK%&AGG&3*444KK
#clGw<1 ' " 's   D$DD i  c                      t        t                     } t        | j                  t              sJ t	        t                     } t        | j                  t              sJ y)zCTest that PhraseMatcher.vocab can be accessed (like Matcher.vocab).N)r   r	   
isinstancer   r   )r   s    r   test_issue4373rD   O   sD     egGgmmU+++EG$GgmmU+++r   i+  c                  ^   d} t               }ddddg}|j                  dddi	      }|j                  |        ||       }|j                  D cg c]%  }|j                  |j
                  |j                  f' }}t               }t               5 }|d
z  }	|j                  |	       |j                  d      j                  |	       ddd        ||       }
|
j                  D cg c]%  }|j                  |j
                  |j                  f' }}||k(  sJ yc c}w # 1 sw Y   XxY wc c}w )zTest that the EntityRuler PhraseMatcher is deserialized correctly using
    the method from_disk when the EntityRuler argument phrase_matcher_attr is
    specified.
    z!Spacy is a python library for nlp
PYTHON_LIBspacyspaCy)labelpatternidentity_rulerphrase_matcher_attrLOWER)configentityrulerN)
r   add_pipeadd_patternsentstextlabel_ent_id_r   to_disk	from_disk)rT   r   patternsrulerr(   entresnlp_reloadedr   	file_pathdoc_reloadedres_reloadeds               r   'test_issue4651_with_phrase_matcher_attrra   X   s    /D
)C&7'JKHLL1F0PLQE	x 
d)C:=((
C3CHHcjj#++.
CC
C9L	 C1%	i n-77	BC  %LCOCTCTUCSXXszz3;;7ULU, DC C
 Vs   *D7D%*D*D'i  c                     g d}t        | |      }|dd }t        | g d      }t        |       }|j                  d|g        ||      }|sJ y)z/Ensure that PhraseMatcher accepts Span as inputr/   likeSpansandDocsr1   myinput,rf   nothingelse.r   N   re   rf   rg   SPACY)r   r   r   )r'   r    r(   spanrJ   r   r)   s          r   test_issue6839rr   o   sX     kE
he
$Cr7D(":;GH%GKK'#dmGN7r   i)  c                    g d}t        | |      }t        | dg      t        | ddg      d}t        |       }|j                         D ]  \  }}|j                  ||g         ||      }|| j                  d   ddf| j                  d	   dd
fgk(  sJ |j                  d       t        |      dk(  sJ  ||      }|| j                  d	   dd
fgk(  sJ |j                  d	       t        |      dk(  sJ  ||      }	|	rJ y)z:Ensure overlapping terms can be removed from PhraseMatcher)Onlysaveoutthebinarydataforrw   
individual
componentsrm   r   rx   ry   )0r;   r}         r;      r$   r   N)r   r   itemsr   r%   remover   )
r'   r    r(   termsr   match_idtermr)   new_matches
no_matchess
             r   test_issue10643r   ~   s+   
 lE
he
$C(,(F!34E H%G++- &$Htf%& clG((-q!4x7G7G7LaQR6STTTTNN3w<1#,KH,,S11a89999NN3w<1J>zr   c                    t        | g d      }t        | ddg      }t        |       }|j                  d|g       t         ||            dk(  sJ t        | dg      }t        |       }|j                  d|g       t         ||            dk(  sJ t        | ddg      }t        |       }|j                  d	|g       t         ||            dk(  sJ t        | d
g      }t        |       }|j                  d|g       t         ||            dk(  sJ t        | dd
g      }t        |       }|j                  d|g       t         ||            dk(  sJ y )Nr/   rd   GoogleNowbestr   r   r   COMPANYr$   r/   rd   ILIKEr   BESTNOWBESTr   r   r   r   )r'   r(   rJ   r   s       r   test_matcher_phrase_matcherr      sQ   
hD
EC(8U"34GH%GKK	G9%ws|!!!(3%(GH%GKKgYws|!!!(3-0GH%GKK'#ws|!!!(6(+GH%GKK	"ws|!!!(5&/2GH%GKK	G9%ws|!!!r   c                     t        |       }t        |      dk(  sJ |j                  dt        | dg      g       t        |      dk(  sJ |j                  dt        | dg      g       t        |      dk(  sJ y )	Nr   r:   testr   r$   r   test2r
   )r   r   r   r   r'   r   s     r   test_phrase_matcher_lengthr      sv    H%Gw<1KKXfX678w<1KK#hwi89:w<1r   c                 p    t        |       }|j                  dt        | dg      g       d|v sJ d|vsJ y )Nr:   r   r   r   )r   r   r   r   s     r   test_phrase_matcher_containsr      sB    H%GKKXfX678W'!!!r   c                 l   t        | ddg      }t        | dg      t        | ddg      g}t        |       } |j                  dd g|  t         ||            dk(  sJ t        |       }t	               } |j                  d|g|  t         ||            dk(  sJ |j
                  dk(  sJ t        |       }|j                  d|       t         ||            dk(  sJ t        |       }t	               }|j                  d||	       t         ||            dk(  sJ |j
                  dk(  sJ y )
Nr   r   r   OLD_APIr
   OLD_API_CALLBACKNEW_APINEW_API_CALLBACKon_match)r   r   r   r   r   
call_count)r'   r(   rY   r   r   s        r   test_phrase_matcher_add_new_apir      s6   
hsCj
)CHSE*Cc
,KLHH%GGKK	4+(+ws|!!!H%GvHGKK"H8x8ws|!!!!###H%GKK	8$ws|!!!H%GvHKK"HxK@ws|!!!!###r   c                 t   t        |       }|j                  dt        | dg      g       |j                  dt        | dg      g       |j                  dt        | dg      g       |j                  dt        | dg      g       t        | g d      }d|v sJ d|vsJ t         ||            dk(  sJ y )Nr:   rd   r   r   r   r$   )r   r   r   r   r'   r   r(   s      r    test_phrase_matcher_repeated_addr      s    H%GKKXfX678KKXfX678KKXfX678KKXfX678
hD
ECW'!!!ws|!!!r   c                    t        |       }|j                  dt        | dg      g       |j                  dt        | dg      g       t        | g d      }d|v sJ d|v sJ d|vsJ t         ||            dk(  sJ |j	                  d       d|vsJ d|v sJ d|vsJ t         ||            d	k(  sJ |j	                  d       d|vsJ d|vsJ d|vsJ t         ||            d
k(  sJ t        j                  t              5  |j	                  d       d d d        d|vsJ d|vsJ d|vsJ t         ||            d
k(  sJ y # 1 sw Y   2xY w)Nr   rd   r   r   r   r   TEST3r
   r$   r   )r   r   r   r   r   pytestraisesKeyErrorr   s      r   test_phrase_matcher_remover      s   H%GKK#hvh789KK#hvh789
hD
ECgg'!!!ws|!!!NN7'!!!g'!!!ws|!!!NN7'!!!'!!!'!!!ws|!!!	x	   w '!!!'!!!'!!!ws|!!!   s   EEc                 @   t        |       }|j                  dt        | dg      g       |j                  dt        | dg      g       t        | g d      }d|v sJ t        |      dk(  sJ t         ||            dk(  sJ |j	                  d       d|vsJ t        |      dk(  sJ t         ||            dk(  sJ  ||      d   d   | j
                  d   k(  sJ |j	                  d       d|vsJ t        |      dk(  sJ t         ||            dk(  sJ y )	Nr:   rd   r   r   r   r
   r$   r   )r   r   r   r   r   r%   r   s      r   +test_phrase_matcher_overlapping_with_remover     s0   H%GKKXfX678KK#hvh789
hD
ECWw<1ws|!!!NN6   w<1ws|!!!3<?1!1!1'!::::NN7'!!!w<1ws|!!!r   c                 "   g d}g d}g d}g d}t        | ||      }t        | d      }|j                  d|g       t        | ||      } ||      }t        |      d	k(  sJ |d
   \  }	}
}|	| j                  d   k(  sJ |
dk(  sJ |dk(  sJ y )Nr/   rd   catsPRONVERBNOUN)Yesrj   youhatedogsverymuch)INTJPUNCTr   r   r   ADVr   r    posPOSr8   r:   r$   r   r
   r   r   r   r   r   r%   )r'   words1pos1words2pos2rJ   r   r(   r)   r   startends               r    test_phrase_matcher_string_attrsr     s    "F#D@FBD(&d3GH51GKK	"
hf$
/CclGw<1"1:HeSx''////A::!8O8r   c                     g d}g d}g d}g d}t        | ||      }t        | d      }|j                  d|g       t        | ||      } ||      }t        |      d	k(  sJ y
)zATest that token with the control codes as ORTH are *not* matched.r   r   )zmatcher:POS-PRONzmatcher:POS-VERBzmatcher:POS-NOUN)Xr   r   r   r   r8   r:   r   Nr   )	r'   r   r   r   r   rJ   r   r(   r)   s	            r   )test_phrase_matcher_string_attrs_negativer   .  si    "F#DIFD(&d3GH51GKK	"
hf$
/CclGw<1r   c                 d   g d}g d}t        | |      }t        | d      }|j                  d|g       t        | |      } ||      }t        |      dk(  sJ |d   \  }}}	|d	   \  }
}}|| j                  d   k(  sJ |
| j                  d   k(  sJ |dk(  sJ |	d
k(  sJ |d
k(  sJ |dk(  sJ y )N)Helloworld!)Noproblemrj   hesaidrm   r   IS_PUNCTr8   r:   r
   r   r$      r   r   )r'   r   r   rJ   r   r(   r)   	match_id1start1end1	match_id2start2end2s                r   test_phrase_matcher_bool_attrsr   <  s    $F6F(&)GH:6GKK	"
hf
%CclGw<1%ajIvt%ajIvt((0000((0000Q;;199Q;;199r   c                 :   t        | dg      }d|d   _        t        | dg      }d|d   _        d|d   _        |d   j	                  d       t        | dg      }t        | d	      }t        j                  t              5  |j                  d
|g       d d d        t        j                  t              5  |j                  d|g       d d d        t        j                         5  t        j                  d       |j                  d|g       d d d        t        | dd      }t        j                         5  t        j                  d       |j                  d|g       d d d        y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   uxY w# 1 sw Y   y xY w)NTestr   ROOTr   TAGr   Feat=ValT)validater   r   errorr   r   )r9   r   TEST4)r   dep_tag_pos_	set_morphr   r   warnsUserWarningr   warningscatch_warningssimplefilter)r'   doc1doc2doc3r   s        r   test_phrase_matcher_validationr   O  s`   xx(DDGLxx(DDGLDGLGj!xx(DHt4G	k	" %GdV$%	k	" %GdV$%		 	 	" %g&GdV$% H54@G		 	 	" %g&GdV$% %% %% %% %% %s0   E-8E9()F;)F-E69FFFc                 z    t        j                  t              5  t        | d       d d d        y # 1 sw Y   y xY w)NUNSUPPORTEDr8   )r   r   
ValueErrorr   )r'   s    r   test_attr_validationr   e  s-    	z	" 4h]34 4 4s   1:c                    t        | dg      }d|d   _        t        | dg      }d|d   _        d|d   _        |d   j	                  d       d|d   _        t        | dg      }t        | d	
      }|j                  d|g       t        j                  t              5  |j                  d|g       d d d        t        j                  t              5  |j                  d|g       d d d        dD ]  }t        | |
      }|j                  d|g       t        j                  t              5  |j                  d|g       d d d        t        j                  t              5  |j                  d|g       d d d         t        | d
      }|j                  d|g       t        | d
      }|j                  d|g       y # 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w# 1 sw Y   xY w)Nr   r   r   r   r   r   r   LEMMADEPr8   r   r   r   )r   r   r   ORTHTEXT)r   r   r   r   r   lemma_r   r   r   r   r   )r'   r   r   r   r   r9   s         r   test_attr_pipeline_checksr   j  s   xx(DDGLxx(DDGLDGLGj!DGNxx(DH51GKK$ 	z	" %GdV$%	z	" %GdV$% ( )t4GdV$]]:& 	)KK$(	)]]:& 	)KK$(	) 	)) H62GKK$ H62GKK$ !% %% %	) 	)	) 	)s0    GG/G$G(GGG%	(G1	c                     t               }t        | g d      }t        | ddg      }t        |       }|j                  d|g|        ||      }|j	                  ||d|       y )Nr   r   r   r   r   r   r   )r   r   r   r   assert_called_once_with)r'   mockr(   rJ   r   r)   s         r   test_phrase_matcher_callbackr    sb    6D
hD
EC(8U"34GH%GKK	G9tK4clG  #q':r   c                     t        |       }t        | dg      }t        | ddg      }t        | g d      }t        | g d      }|j                  d||||g       |j                  d       y )Nthisr   is)r  r  r   )r  r  r   wordTHIS)r   r   r   r   )r'   r   r>   r@   pattern3pattern4s         r   /test_phrase_matcher_remove_overlapping_patternsr	    sg    H%G8F8,H8FD>2H8#67H8#>?HKK8Xx@ANN6r   c                     t        |       }t        | ddg      }t        j                  t              5  |j                  d|       d d d        y # 1 sw Y   y xY w)Nhellor   r   r:   )r   r   r   r   r   r   )r'   r   rJ   s      r   test_phrase_matcher_basic_checkr    sL    H%G(7G"45G	z	" %FG$% % %s   AAc                    t        |       }t               }|j                  dt        | dg      g       |j                  dt        | dg      g|       t        | g d      }t	        |      dk(  sJ t        j                  |      }t        j                  |      } ||      } ||      }t	        |      t	        |      k(  sJ ||k(  sJ |j                         d	   \  }}	}
}t        |
j                  d      t              sJ y )
Nr:   r   r   r   r   r   )thesearetests:r   r   r
   r$   )r   r   r   r   r   srslypickle_dumpspickle_loads
__reduce__rC   get)r'   r   r   r(   r   matcher_unpickledr)   matches_unpickledr   docs	callbacksr9   s               r   test_phrase_matcher_pickler    s    H%G6DKKXfX678KK#hwi89DKI
hM
NCw<17#A**1- clG)#.w<301111'''' $5#?#?#A!#D E4DimmG,d333r   c                    t        |       }|j                  dt        | ddg      g       |j                  dt        | dg      g       t        | g d      } ||d	      }t        |      d
k(  sJ t	        |d   t
              sJ |d   j                  dk(  sJ |d   j                  dk(  sJ t	        |d   t
              sJ |d   j                  dk(  sJ |d   j                  dk(  sJ y)zTest the new as_spans=True API.r   r  r   r   r!   r   )z...r  r   r  r  r   r   T)as_spansr
   r   zhello worldr$   N)r   r   r   r   rC   r   rT   rU   )r'   r   r(   r)   s       r   test_phrase_matcher_as_spansr    s    H%GKKc(7G*<=>?KKc(6(345
hR
SCcD)Gw<1gaj$'''1:??m+++1:###gaj$'''1:??f$$$1:###r   c                 j   t        |       }|j                  dt        | dg      g       t        | ddg      }t        j                  t
              5 }|j                  |g      D ]  } |j                  sJ dt        |j                  d   j                        v sJ 	 d d d        y # 1 sw Y   y xY w)Nr:   helllor   r  r   z
spaCy v3.0r   )
r   r   r   r   r   DeprecationWarningpipeliststrmessage)r'   r   r(   recordr3   s        r   test_phrase_matcher_deprecatedr'    s    H%GKKXhZ89:
hw0
1C	(	) ;Vse$ 	A	{{{s6;;q>#9#9::::	; ; ;s   AB))B2r9   
SENT_STARTIS_SENT_STARTc                     t        | |      }y )Nr8   )r   )r'   r9   r3   s      r   test_phrase_matcher_sent_startr+    s    hT*Ar   c                     g d}t        | |      }|dd }t        | g d      }t        |       }|j                  d|g        ||      } ||      }t        |      dk(  sJ t        |      dk(  sJ y)z7Ensure that PhraseMatcher accepts Span and Doc as inputrc   r   Nrn   ro   rp   r$   r   r'   r    r(   rq   rJ   r   matches_docmatches_spans           r   test_span_in_phrasematcherr0    s     kE
he
$Cr7D(":;GH%GKK'##,K4=L{q   |!!!r   c                     g d}t        | |      }|dd }t        | g d      }t        |       }|j                  d|g        ||      } ||      }t        |      dk(  sJ t        |      dk(  sJ y	)
zREnsure that PhraseMatcher only returns matches in input Span and not in entire Doc)r/   rd   re   rf   rg   r1   rh   ri   rj   re   rf   rg   r1   rh   matchersz,andre   rf   rg   
everywhererm   r   	      ro   rp   r   r$   Nr   r-  s           r    test_span_v_doc_in_phrasematcherr6    s    E he
$Cq9D(":;GH%GKK'##,K4=L{q   |!!!r   )2r   r   r  r   r   spacy.lang.enr   spacy.matcherr   r   spacy.tokensr   r   spacy.vocabr	   utilr   markissuer   r+   r5   rA   rD   ra   rr   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r  r	  r  r  r  r'  parametrizer+  r0  r6   r   r   <module>r@     s       ! 0 "   4  4+ + 4   4 ( 4, , 4 , 4  5 :":"$,
""6","&%,4
!>;%4.$ ; ,!@A+ B+" "r   