
    i&                        d dl Z d dlZd dlmZ d dlmZ d dlmZmZm	Z	m
Z
mZ d dlZd dlmZmZmZmZmZmZ d dlmZ d dlmZ dd	lmZ dd
lmZ ddlmZ ddlmZm Z m!Z! ddl"m#Z# 	 	 	 	 	 d(dedede
e   de
e$   de$de%de%fdZ&deeee#f      de	e   fdZ'	 d)dedede
e$   de%de$f
dZ(dedee   dedede)f
dZ*d  Z+d! Z, G d" d#      Z-	 d*d$ee)e$f   d%e$d&e$de.fd'Z/y)+    N)Counter)Path)CallableIterableListOptionalUnion)ConfigModel	Optimizerfix_random_seedset_dropout_rateset_gpu_allocator)ConfigValidationError)Printer   )Errors)ConfigSchemaPretrain)Doc)dot_to_objectload_model_from_configregistry   )Exampleconfig
output_dirresume_pathepoch_resumeuse_gpusilent	skip_lastc                 z   t        |      }| d   d   t        | d   d          | d   d   }|dk\  r|rt        |       d | d   d<   t        |       }	|	j                  j                         }
t        j                  |
d   t        	      }t        |
|d
         }t        j                  d
|i      d
   }|d   }t        |	|      |d   |t        |||      }nd}j                  d   }t        d      |d   r|j                  d| d|d    d       n|j                  d|        ddd} |j                  d i | d!fd	}	 t!        ||d         D ]  }t#         | ||	                  D ]b  \  }}t%        |      }t'        ||      }j)                  |||      }|r |j                  |fi | |d   sM||d   z  dk(  sY ||d       d |d   r||d   z  dk(  s||d   dz
  k(  r ||       n ||       d_         	 |s ||d   d       y y # |s ||d   d       w w xY w)"Nno_printtrainingseedgpu_allocatorr   
initializeinit_tok2vecpretraining)schemacorpusbatcher	optimizer)r    lossi'  )	frequencyn_save_epochz/Pre-training tok2vec layer - starting at epoch z - saving every z epoch)   
   r3         )rr6   r6   r6   r6   )widthsalignsc                 *   |rdnd}j                  j                        5  |r	dz  }n	d|  | dz  }|j                  d      5 }|j                  j	                  d      j                                d d d        
j                  
j                  
j                  | d}	d	z  j                  d
      5 }|j                  t        j                  |      dz          d d d        d d d        y # 1 sw Y   {xY w# 1 sw Y   xY w# 1 sw Y   y xY w)Nz.temp zmodel-last.binmodelz.binwbtok2vec)nr_wordr/   
epoch_lossepochz	log.jsonla
)
use_paramsaveragesopenwriteget_refto_bytesr>   r/   r?   srsly
json_dumps)r@   is_tempis_lastis_temp_str	save_pathfile_logr;   r.   r   trackers          h/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/spacy/training/pretrain.py_save_modelzpretrain.<locals>._save_modelF   s   !(gbi001 	:&>:	&5}D)II	!!$' A5EMM)4==?@A #??%00	C {*005 :E,,S1D89:	: 	:
A A: :	: 	:s<   %D	/C17AD	8(C= D	1C:	6D	=D	D		D
max_epochsn_save_everyT)rK   r           )rL   ))#z# Wordsz
Total LossLosszw/s)FF)r   r   r   r   r   interpolater   resolver   r   create_pretraining_model_resume_modelattrsProgressTrackerdividerrowrange	enumerateensure_docsmake_updateupdater?   )r   r   r   r   r   r    r!   msg	allocatornlp_configPr,   r-   	objectiverow_settingsrS   r@   batch_idbatchdocsr/   progressr;   r.   rQ   s    `                     @@@rR   pretrainrq      s    6
"Cj&!-z*623z"?3I!|	)$+/F<(
 
(Cjj$$&G/8LMA7AhK0Fx01(;F	lG$S!,E+I$UKfU F#I.G=l^K[\]^l\m[nntu	
 	El^TU/;TULCGGJ\J: :(7<<9 	%E#,WVC[-A#B 5%"5)"5$	9E">>%t<CGGH55^$(Q~5F*F!*Kt45  1^,,1UaoPQ>Q5Q&E"!$G	%" ,6 y,6 s   A5H( 	H( AH( (H:examples_or_docsreturnc                     g }| D ]?  }t        |t              r|j                  |       %|j                  |j                         A |S N)
isinstancer   append	reference)rr   ro   	eg_or_docs      rR   rc   rc   p   sE    D% -	i%KK	"KK	++,	-
 K    r;   c                    t        |      }|j                  d|        |j                  d      5 }|j                         }| j	                  d      j                  |       d d d        |^t        j                  dt        |            }|r$t        |j                  d      dd  d d       d	z   }nt        t        j                        |j                  d
|        |S # 1 sw Y   xY w)Nr#   zResume training tok2vec from: rbr=   zmodel\d+\.binr      r   zResuming from epoch: )r   inforE   readrG   
from_bytesresearchstrintgroup
ValueErrorr   E1020)r;   r   r   r    rf   rO   weights_data
model_names           rR   r\   r\   z   s     6
"CHH-k];<			$	 :5zz|i ++L9: YY/[1AB
z//2126s;<q@L V\\**HH$\N34: :s   1C!!C*ro   r.   objective_funcc                     | j                  |      \  }} || j                  ||      \  }} ||       | j                  |       t        |      S )zPerform an update over a single batch of documents.

    docs (iterable): A batch of `Doc` objects.
    optimizer (callable): An optimizer.
    RETURNS loss: A float for the loss.
    )begin_updateopsfinish_updatefloat)r;   ro   r.   r   predictionsbackpropr/   	gradientss           rR   rd   rd      sQ     "..t4K$UYYkBOD)Y		" ;rz   c                    | j                  g       5  | j                          ddd       t        | |      }t        |      j                  dk(  r8|j
                  dk7  r|j
                  nd}| j                  |      j                  }	 |j                  | j                  d      g       |d   } || j                  |      }|j                  | j                  d      g       t        ||d          |S # 1 sw Y   xY w# t        $ r5 |d   }|d	   }t        t        j                  j                  ||
            w xY w)a  Define a network for the pretraining. We simply add an output layer onto
    the tok2vec input model. The tok2vec input model needs to be a model that
    takes a batch of Doc objects (as a list), and returns a list of arrays.
    Each array in the output needs to have one row per token in the doc.
    The actual tok2vec layer is stored as a reference, and only this bit will be
    serialized to file and read back in when calling the 'train' command.
    )enableNTok2VecListener*r=   zGive it a doc to infer shapes)X	componentlayer)r   r   rk   dropout)select_pipesr(   get_tok2vec_reftype__name__upstream_nameget_piper;   make_docr   r   E874formatvocabr   )rh   pretrain_configr=   original_tok2vecr   r   create_functionr;   s           rR   r[   r[      s>    
				$ c?3GG}!22%,%:%:c%AG!!y 	 ,,/066Ocll+JKLM &k2OCIIw/E	%DEFGUOI67L)   O#K0	(++iu+MNNOs   C6	"D 6C?>E c                     |d   }|'d}d}ddg|dg}t        | j                  d   ||      | j                  |      j                  }|d   r|j	                  |d         }|S )Nr   zpTo use pretrained tok2vec weights, [pretraining.component] needs to specify the component that should load them.zcomponent can't be nullr*   )locrf   )r   errorsdescr   )r   r   r   r;   rG   )rh   r   tok2vec_componentr   errr   r   s          rR   r   r      s    '4 D 	 ((+6sCD#::m,V$
 	
 LL*+11Ewog67Lrz   c                       e Zd ZddZd Zy)r^   c                     d| _         d| _        d| _        t               | _        || _        t        j                         | _        d| _        d| _	        y )NrV   r   )
r/   	prev_lossr>   r   words_per_epochr0   time	last_timelast_updater?   )selfr0   s     rR   __init__zProgressTracker.__init__   sD    	&y"rz   c                    | xj                   |z  c_         | xj                  |z  c_        t        d |D              }| j                  |xx   |z  cc<   | xj                  |z  c_        | j                  | j
                  z
  }|| j                  k\  r|t        j                         | j                  z
  z  }| j                  | _        t        j                         | _        | j                   | j                  z
  }|| j                  t        | j                   d      t        |d      t        |      f}t        | j                         | _	        |S y )Nc              3   2   K   | ]  }t        |        y wru   )len).0docs     rR   	<genexpr>z)ProgressTracker.update.<locals>.<genexpr>   s     6#SX6s   r3   )widthr4   )r/   r?   sumr   r>   r   r0   r   r   r   _smart_roundr   r   )	r   r@   r/   ro   words_in_batchwords_since_updatewpsloss_per_wordstatuss	            rR   re   zProgressTracker.update   s   		T	4666U#~5#&!\\D,<,<</$		dnn(DEC#||D!YY[DN II6MTYYb1]!4CF #499-DNMrz   N)i@B )r   
__module____qualname__r   re    rz   rR   r^   r^      s    rz   r^   figurer   max_decimalc                     t        t        t        |                   }||dz   z
  }|dk  rt        t        |             S t        ||      }dt        |      z   dz   }|| z  S )z=Round large numbers as integers, smaller numbers as decimals.r   z%.f)r   r   r   min)r   r   r   n_digits	n_decimal
format_strs         rR   r   r      se     3s6{#$HA&IA~3v;	;/	C	N*S0
F""rz   )NNTF)T)r3   r5   )0r   r   collectionsr   pathlibr   typingr   r   r   r   r	   rI   	thinc.apir
   r   r   r   r   r   thinc.configr   wasabir   r   r   schemasr   tokensr   utilr   r   r   exampler   r   boolrq   rc   r\   r   rd   r[   r   r^   r   r   r   rz   rR   <module>r      s   	    < <   /   *  B B  #'"&S7S7S7 $S7 3-	S7
 S7 S7 S7l(5g+>"? DI  RV#3;C=JN. 2;MU
&>$! !J DE#%*#&)#=@##rz   