
    iE7                     R   d dl Z d dlZd dlZd dlZd dlmZ d dlmZ d dlm	Z	m
Z
mZmZmZmZ d dlZd dlZd dlZd dlmZmZmZmZ ddlmZmZ ddlmZ dd	lmZ dd
lmZm Z m!Z!m"Z"m#Z#m$Z$m%Z%m&Z&m'Z' ddl(m)Z*m+Z+ ddl,m-Z- e
rddl.m/Z/ dddede0ddfdZ1dddddddee   dee   dee2   ddf
dZ3dddddee2ef   de4ddfd Z5ddd!ee2ef   d"ee2ef   de4fd#Z6de*jn                  d$d%ddd&ee   d'e0d(e0dee2   d)e2d*e2ddfd+Z8e*jn                  d,d&ed-e0d)e2fd.Z9d/ee2ef   de	fd0Z:d1 Z;y)2    Nislice)Path)IOTYPE_CHECKINGAnyDictOptionalUnion)ConfigConfigValidationErrorfix_random_seedset_gpu_allocator   )ErrorsWarnings)Lookups)ConfigSchemaTraining)	DEFAULT_OOV_PROBOOV_RANKensure_pathget_sourced_components
load_modelload_model_from_configloggerregistryresolve_dot_names)ModeVectors   )get_tok2vec_ref)Language)use_gpuconfigr$   returnr"   c          	      h   | }|j                         } d| d   vr)t        t        j                  j	                  d            d| d   vr)t        t        j                  j	                  d            | d   d   t        | d   d          | d   d   }|dk\  r|rt        |       t        |       }t        |d	      t        j                  d
       j                  j                         } t        j                  | d   t              }|d   |d   g}t        |d   t               s7t#        t        j$                  j	                  dt'        |d                     t        |d   t               s7t#        t        j$                  j	                  dt'        |d                     t)        | |      \  }|d   }|d   }	|D 
cg c]	  }
|
|	vs|
 }}
t        j                  dj*                         |rCj-                  |      5  t        j                  d|       j/                  |       d d d        j1                          j-                  g |	|      5  |d   dk(  r1dt        j2                  d       j5                  fd|       nj5                  fd|       t        j                  dj*                         d d d        j6                  D ]  \  }}t9        |d g       D ]  }|j*                  vr||	v r8||	vr4t        j:                  t<        j>                  j	                  ||!             ||	vsS||	v sX||d"   vs`t        j:                  t<        j@                  j	                  ||!               S c c}
w # 1 sw Y   gxY w# 1 sw Y   xY w)#Nseedtrainingz[training] seed)valuegpu_allocatorz[training] gpu_allocatorr   T)	auto_fillzSet up nlp object from config)schematrain_corpus
dev_corpusztraining.train_corpus)fieldtype)descztraining.dev_corpus	optimizerfrozen_componentszPipeline: %s)enablezResuming training for: %s)sgddisable
max_epochsr#   d   zDue to streamed train corpus, using only first %s examples for initialization. If necessary, provide all labels in [initialize]. More info: https://spacy.io/api/cli#init_labelsc                  (    t                      S Nr   )nlpsample_sizer.   s   j/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/spacy/training/initialize.py<lambda>zinit_nlp.<locals>.<lambda>[   s    |C0+>     c                              S r<    )r=   r.   s   r?   r@   zinit_nlp.<locals>.<lambda>^   s    <#4 rA   z#Initialized pipeline components: %slistening_components)namelistenerannotating_components)!interpolate
ValueErrorr   E1015formatr   r   r   r   r   infor%   r   resolver   
isinstancestrr   E897r1   r   
pipe_namesselect_pipesresume_training_link_componentsdebug
initializepipelinegetattrwarningr   W087W086)r%   r$   
raw_config	allocatorsourcedT	dot_namesr/   r3   r4   presume_componentsrE   procrF   r=   r>   r.   s                  @@@r?   init_nlprd   #   s   J##%FVJ'',,3D,EFFfZ00,,3M,NOOj&!-z*623z"?3I!|	)$$V,G
 t
<C
KK/0ZZ##%F
+4HIA>"AlO4Ia'-###-D>9J4K $ 
 	

 aos+###+$q2G $ 
 	

  1CL*+I-.$+Jqq8I/IJJ
KK/%67 	/KK35FGI.	/
 			"J$5"J8I"J		K K\?b KLLB 	 NN>I   NN4)ND93>>JK  ll W
d("
 	WH s~~-,,=N1Nx}}333QR00T=N5Nq!899NN8==#7#7TH#7#UV	WW JM K	/ 	/K Ks%   		NN)N$A1N(N%(N1)datalookupsvectorsr=   re   rf   rg   c                l   |r@|| j                   _        t        j                  ddj	                  |j
                               t        |      }|t        j                  |      }| j                   D ]  }t        |_
         |D ]+  }d|v r| j                   |d      } |j                  di | - t        | j                         r t        d | j                   D              dz
  }nt        }| j                   j                  j!                  d|i       t        j                  dt        | j                                t        j                  d	       |"t#        | |       t        j                  d
|       | j$                  j'                  di       }	t        |	      dkD  rt)        | j                   j*                  j-                  dg            }
|	j/                         D ]>  \  }}|
|k7  st1        j2                  t4        j6                  j9                  |             @ t        j                  d       y )NzAdded vocab lookups: %sz, settingsorthc              3   4   K   | ]  }|j                     y wr<   )prob).0lexs     r?   	<genexpr>zinit_vocab.<locals>.<genexpr>   s     93889s   r    oov_probz%Added %d lexical entries to the vocabzCreated vocabularyzAdded vectors: %s_sourced_vectors_hashesr   strings)excluderE   z Finished initializing nlp objectrC   )vocabrf   r   rL   jointablesr   srsly
read_jsonlr   rank	set_attrslenminr   cfgupdateload_vectors_into_modelmetapophashrg   to_bytesitemswarningswarnr   W113rK   )r=   re   rf   rg   	data_path	lex_attrslexemeattrsrp   sourced_vectors_hashesvectors_hashsourced_componentsourced_vectors_hashs                r?   
init_vocabr   q   s    #		-tyy/HID!I$$Y/	ii 	#F"FK	# 	&EU"YYuV}-FF%u%		&
 syy>9syy99A=H'H		j(34;S^L
KK$%W-'1 XX\\*CRH
!"Q&CII--66	{6KL7M7S7S7U 	L3333hmm228I2JK	L KK23rA   T)add_stringsrE   r   c                F   	 dg}|s|j                  d       t        || j                  |      }t        |j                  j                  j                               dk(  r1|j                  j                  j                  t        j                  k7  sW|j                  j                  j                  d   dk(  rd|j                  j                  j                  t        j                  k(  r3t        j                  t        j                  j!                  |	             | j                  D ]E  }	| j                  j                  j"                  j%                  |	j&                  t(              |	_        G y# t        $ r'}d| }d}t        j                  |||      }|dd}~ww xY w)
zHLoad word vectors from an installed model or path into a model instance.rf   rr   )ru   rs   z$Config validation error for vectors zThis typically means that there's a problem in the config.cfg included with the packaged vectors. Make sure that the vectors package you're loading is compatible with the current version of spaCy.)titler2   Nr   rt   )appendr   ru   r   
from_errorr|   rg   keysmodeVectorsModefloretshaper   rY   r   W112rK   key2rowgetrj   r   rz   )
r=   rE   r   rs   vectors_nlper   r2   errrn   s
             r?   r   r      sV    +NN9% SYYH 	K%%**,-2%%**k.@.@@!!''*a/%%**k.@.@@x}}+++67yy E99$$,,008DE' ! 6tf=G 	
 $..qDIts   .E0 0	F 9"FF pretrain_configinit_configc                 p   |}|}d }t        |d         }|^|j                         s$d| }ddg|dg}t        | j                  |      |j	                  d      5 }	|	j                         }d d d        |4t        | |      }
|
j                  |       t        j                  d|       yy	# 1 sw Y   @xY w)
Ninit_tok2veczcan't find pretrained tok2vec: rV   )locmsg)r%   errorsrbz!Loaded pretrained weights from %sTF)
r   existsr   r%   openreadr!   
from_bytesr   rL   )r=   r   r   PIweights_datar   r   r   file_layers              r?   r   r      s     	AALq01L""$3L>BC+^<SIJF'szz&IIt$ 	( ::<L	(Q'&7F	( 	(s   B,,B5ORTH)rE   r   attrvectors_loctruncatepruner   r   c                   t        |      }|r	|j                  d   j                  d      r|dk7  rt        d      t	        | j
                  j                  t        j                  |j                  d                  | j
                  _
        | j
                  D ]^  }|j                  s|j                  t        k7  s$| j
                  j                  j                  |j                  |j                         ` | j
                  j                          n|r?t!        j"                  d|       t%        |||	      \  }}	}
t!        j"                  d
|       nd\  }}	|	8|t&        j(                  k7  r%|	D ]   }|| j
                  vs| j
                  |    " ||t&        j(                  k(  r3t	        d| j
                  j                  ||d
| j
                  _
        nLt	        | j
                  j                  ||	|      | j
                  _
        | j
                  j                          |<| j*                  d    d| j*                  d    d| j
                  j                  _        n|| j
                  j                  _        | j
                  j                  j,                  | j*                  d   d<   |dk\  r0|t&        j(                  k7  r| j
                  j/                  |       y y y )Nr#   z.npzr   z@ORTH is the only attribute supported for vectors in .npz format.r   )rr   re   )rowzReading vectors from %sr   zLoaded vectors from %s)NN)rr   re   r   )rr   re   r   r   lang_rE   z.vectorsrg   r    rC   )r   partsendswithrI   r   ru   rr   numpyloadr   rg   rz   r   addrj   deduplicate_vectorsr   rL   read_vectorsr   r   r   rE   prune_vectors)r=   r   r   r   rE   r   r   rn   vectors_datavector_keysfloret_settingswords               r?   convert_vectorsr      sr    k*K{((,55f=6>R  $II%%EJJ{7G7G7M,N
		 99 	>CxxCHH0		!!%%chhCHH%=	> 			%%'KK1;?9E:6L+
 KK0+>(4%L+"t{/A/A'A# $syy(IIdO$ #{)))$+ %II--%% &	%		! %,II--%$	%		! 		--/|$'HHV$4#5Qsxx7G6H!Q		!%		"%))"3"3"8"8CHHYzdk000		& 1zrA   r   truncate_vectorsc                   t        |       }t        |      j                         }t        d |d d D              }i }|t        j
                  k(  rwt        |      dk7  rt        d      dt        |d         t        |d         t        |d         t        |d         |d	   |d
   d}|dk\  r5t        t        j                        t        |      dk(  sJ |dk\  r||d   f}t        j                  |d      }g }t        t        j                  |d             D ]  \  }	}
|
j                         }
|
j!                  d|j"                  d         }|j%                  d      }t        |      |j"                  d   k7  r*t        t        j&                  j)                  |	|             t        j*                  |d      ||	<   |j-                  |       |	|dz
  k(  s n |||fS )Nc              3   2   K   | ]  }t        |        y wr<   intrm   sizes     r?   ro   zread_vectors.<locals>.<genexpr>  s     9#d)9   r      z^Invalid header for floret vectors. Expected: bucket dim minn maxn hash_count hash_seed BOW EOWr                  )r   minnmaxn
hash_count	hash_seedboweowr    f)r   dtyper7    r   )line_numr   )r   )ensure_shapenextsplittupler   r   r|   rI   r   r   E860r   zeros	enumeratetqdmrstriprsplitr   r   E094rK   asarrayr   )r   r   r   r   header_partsr   r   r   vectors_keysilinepiecesr   s                r?   r   r     s    	[!A7==?L9Ra(899EO{!!!|!N 
 Q(Q(l1o.\!_-??
 q V[[))< A%%%q %uQx0E;;U#6LLTYYq$78 	4{{}S,"4"4Q"78zz!}v;,,,Q//V[[///LMM--c:QD! 1$$	 66rA   r   c                    t        |       } t        j                  t        |             rt        j                  t        |       d      S | j
                  d   j                  d      r&d t        j                  t        |       d      D        S | j
                  d   j                  d      rKt        j                  t        |             }|j                         }|j	                  |d         }d |D        S | j	                  dd	
      S )z%Handle .gz, .tar.gz or unzipped fileszr:gzr#   gzc              3   >   K   | ]  }|j                  d         ywutf8Ndecoderm   r   s     r?   ro   zopen_file.<locals>.<genexpr>C  s     IF#I   rzipr   c              3   >   K   | ]  }|j                  d         ywr   r   r   s     r?   ro   zopen_file.<locals>.<genexpr>H  s     6F#6r   r   )encoding)r   tarfile
is_tarfilerO   r   r   r   gzipzipfileZipFilenamelist)r   zip_filenamesr   s       r?   	open_filer  =  s    
c
C#c(#||CHf--	2			%I		#c(C0HII	2			&??3s8,!!#eAh'666xxfx--rA   c              #     K   t        |       }t        |      }	 t        d |j                         dd D              }|| |E d{    nXt        |j                               dz
  }d}|D ]  }|dz  }	 | d|  t        |       }|E d{    |j                          |j                          y# t        $ r d}Y w xY w7 7 8w)zEnsure that the first line of the data is the vectors shape.
    If it's not, we read in the data and output the shape as the first result,
    so that the reader doesn't have to deal with the problem.
    c              3   2   K   | ]  }t        |        y wr<   r   r   s     r?   ro   zensure_shape.<locals>.<genexpr>U  s     CDc$iCr   Nr   r    r   )r  r   r   r   rI   r|   close)r   lines
first_liner   widthlengthr   lines2s           r?   r   r   M  s     
 k"EeJCJ,<,<,>r,BCC  J$$&'!+ 	AaKF	%!! ;'	KKM'  
 	 	sF   C#B7 CC	ACC
%C7CCCC
C)<r  r   r   r  	itertoolsr   pathlibr   typingr   r   r   r	   r
   r   r   rx   r   	thinc.apir   r   r   r   r   r   r   rf   r   schemasr   utilr   r   r   r   r   r   r   r   r   rg   r   r   r   pretrainr!   languager"   r   rd   rO   r   boolr   r   defaultr   r   r  r   rC   rA   r?   <module>r     s         @ @    W W %  *
 
 
 3 %# 02 KV K Kj Kb  !%!%4	%4 4.%4 g	%4
 c]%4 
%4R EIE	E d+E=AE	ED	&*38nCGS>	: ##='	='$=' 	='
 =' 3-=' =' =' 
='B >I=P=P(7(7),(77:(7V.5d# . . rA   