
    iFH                         d Z ddlZddlZddlZddlZddlmZ ddlmZ ddl	m
Z
 ddlmZ ddlmZ ddlZddlmZmZ d	d
lmZ ddlmZmZmZ e
r
ddlmZmZmZ  ej2                  e      Z G d d      Z G d d      Zy)z3
Model loading, caching, and lifecycle management.
    N)Callable)	lru_cache)TYPE_CHECKING)scan_cache_dir)tqdm)BitsAndBytesConfigPreTrainedTokenizerBase   )logging   )Modalitymake_progress_tqdm_classreset_torch_cache)PreTrainedModelPreTrainedTokenizerFastProcessorMixinc            	       F    e Zd ZdZ	 	 ddddedddd	fd
ZddZddZddZy)
TimedModelad  Wraps a model + processor and auto-unloads them after a period of inactivity.

    Args:
        model: The loaded model.
        timeout_seconds: Seconds of inactivity before auto-unload. Use -1 to disable.
        processor: The associated processor or tokenizer.
        on_unload: Optional callback invoked after the model is unloaded from memory.
    Nmodelr   timeout_seconds	processor/ProcessorMixin | PreTrainedTokenizerFast | None	on_unloadzCallable | Nonec                    || _         t        |j                        | _        || _        || _        || _        t        j                  | j
                  | j                        | _
        | j                  j                          y N)r   strname_or_path_name_or_pathr   r   
_on_unload	threadingTimer_timeout_reached_timerstart)selfr   r   r   r   s        w/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/cli/serving/model_manager.py__init__zTimedModel.__init__5   sb     
 !3!34".#ood&:&:D<Q<QR    c                     | j                   j                          t        j                  | j                  | j
                        | _         | j                   j                          y)z4Reset the inactivity timer (called on each request).N)r#   cancelr    r!   r   r"   r$   r%   s    r&   reset_timerzTimedModel.reset_timerD   s@    ood&:&:D<Q<QRr(   c                    t        | d      ru| j                  h| `| `d| _        d| _        t        j                          t                | j                  j                          | j                  | j                          yyyy)z0Delete the model and processor, free GPU memory.r   N)	hasattrr   r   gccollectr   r#   r*   r   r+   s    r&   delete_modelzTimedModel.delete_modelJ   sm    4!djj&<
DJ!DNJJLKK *! + '=!r(   c                     | j                   dkD  r@| j                          t        j                  | j                   d| j                    d       y y )Nr   z was removed from memory after zs of inactivity)r   r1   loggerinfor   r+   s    r&   r"   zTimedModel._timeout_reachedW   sJ    !#KK4--..MdNbNbMccrst $r(   NNreturnN)	__name__
__module____qualname____doc__intr'   r,   r1   r"    r(   r&   r   r   +   sM     HL'+   E	
 %"ur(   r   c                   t   e Zd ZdZ	 	 	 	 	 	 	 d"dededz  dededz  dedz  ded	edz  fd
Zededz  fd       Z	d Z
ededefd       Zdedz  fdZdeddfdZ	 d#dededz  dedz  ddfdZ	 	 d#dededz  dedz  ddfdZdefdZd$dZe	 d%dddddefd       Zeed%d edz  dee   fd!              Zy)&ModelManagera  Loads, caches, and manages the lifecycle of models.

    Handlers receive a reference to this and call `load_model_and_processor()`
    to get a model ready for inference.

    Args:
        device: Device to place models on (e.g. "auto", "cuda", "cpu").
        dtype: Torch dtype override. "auto" derives from model weights.
        trust_remote_code: Whether to trust remote code when loading models.
        attn_implementation: Attention implementation override (e.g. "flash_attention_2").
        quantization: Quantization method ("bnb-4bit" or "bnb-8bit").
        model_timeout: Seconds before an idle model is unloaded. -1 disables.
        force_model: If set, preload this model at init time.
    Ndevicedtypetrust_remote_codeattn_implementationquantizationmodel_timeoutforce_modelc                    i | _         i | _        t        j                         | _        i | _        i | _        |j                         rt        |      n|| _	        | j                  |      | _        || _        || _        || _        || _        || _        | j#                          |d| _        |!| j%                  | j'                  |             y y )N)loaded_models_model_locksr    Lock_model_locks_guard_loading_subscribers_loading_tasksisdigitr<   r@   _resolve_dtyperA   rB   rC   rD   rE   rF   _validate_argsload_model_and_processorprocess_model_name)r%   r@   rA   rB   rC   rD   rE   rF   s           r&   r'   zModelManager.__init__m   s     57 8:"+.."2 QS!79 &,^^%5c&k6((/
!2#6 (*& "!#D "))$*A*A+*NO #r(   c                 ~    dd l }| dv r| S t        || d       }t        ||j                        st	        d|  d      |S )Nr   )autoNzUnsupported dtype: 'zF'. Must be 'auto' or a valid torch dtype (e.g. 'float16', 'bfloat16').)torchgetattr
isinstancerA   
ValueError)rA   rV   resolveds      r&   rP   zModelManager._resolve_dtype   sO    N"L5%.(EKK0&ug-st  r(   c                 <   | j                   '| j                   dvrt        d| j                    d      h d}| j                  d uxr | j                  j                  d      }| j                  .|s+| j                  |vrt        d| j                   d| d      y y y )	N)bnb-4bitbnb-8bitz"Unsupported quantization method: 'z$'. Must be 'bnb-4bit' or 'bnb-8bit'.>   sdpaeagerflex_attentionflash_attention_2flash_attention_3zkernels-community/z'Unsupported attention implementation: 'z'. Must be one of zF or a kernels-community kernel (e.g. 'kernels-community/flash-attn2').)rD   rY   rC   
startswith)r%   VALID_ATTN_IMPLEMENTATIONSis_kernels_communitys      r&   rQ   zModelManager._validate_args   s    (T->->F^-^4T5F5F4GGkl  &s"#77tC  
H`H`HkHk I
 $$0(((0JJ9$:R:R9S T""<!=  >DE  K ) 1r(   model_idr7   c                     d| v r| S |  dS )zBCanonicalize to `'model_id@revision'` format. Defaults to `@main`.@z@mainr=   )rf   s    r&   rS   zModelManager.process_model_name   s     (?O5!!r(   c                 t    | j                   dk(  rt        ddd      S | j                   dk(  rt        d      S y)zIReturn a BitsAndBytesConfig based on the `quantization` setting, or None.r\   Tnf4)load_in_4bitbnb_4bit_quant_typebnb_4bit_use_double_quantr]   )load_in_8bitN)rD   r   r+   s    r&   get_quantization_configz$ModelManager.get_quantization_config   sD    
*%!$)*. 
 *,%488r(   model_id_and_revisionz(ProcessorMixin | PreTrainedTokenizerFastc                 t    ddl m} |j                  dd      \  }}|j                  ||| j                        S )zLoad a processor for the given model.

        Args:
            model_id_and_revision: Model ID in ``'model_id@revision'`` format.
        r   )AutoProcessorrh   r   )revisionrB   )transformersrr   splitfrom_pretrainedrB   )r%   rp   rr   rf   rs   s        r&   _load_processorzModelManager._load_processor   s=     	/288a@(,,X\`\r\r,ssr(   
tqdm_classprogress_callbackr   c                 Z   ddl m} |j                  dd      \  }}|| j                  | j                  | j
                  | j                  | j                         |d}| |d|dd        |j                  |fi |}t        t         |j                  d         }	 |	j                  |fi |S )	a  Load a model.

        Args:
            model_id_and_revision (`str`): Model ID in ``'model_id@revision'`` format.
            tqdm_class (*optional*): tqdm subclass for progress bars during ``from_pretrained``.
            progress_callback (`Callable`, *optional*): Called with progress dicts during loading.

        Returns:
            `PreTrainedModel`: The loaded model.
        r   )
AutoConfigrh   r   )rs   rC   rA   
device_maprB   quantization_configrx   loadingconfigstatusr   stage)rt   r{   ru   rC   rA   r@   rB   ro   rv   rW   architectures)
r%   rp   rx   ry   r{   rf   rs   model_kwargsr   architectures
             r&   _load_modelzModelManager._load_model   s     	,288a@( !#'#;#;ZZ++!%!7!7#'#?#?#A$
 (=R]efg+++HEE|V-A-A!-DE+|++HEEEr(   z@tuple[PreTrainedModel, ProcessorMixin | PreTrainedTokenizerFast]c           	           j                   5   j                  j                  |t        j                               }ddd       5  | j
                  vrt        j                  d|        | |d|dd        j                  |      } j                  |||      }t        | j                  ||f fd	       j
                  |<   |j |d	|d
d       n] j
                  |   j                           j
                  |   j                  } j
                  |   j                  }| |d	|dd       ddd       ||fS # 1 sw Y   	xY w# 1 sw Y   fS xY w)a  Load a model (or return it from cache), resetting its inactivity timer.

        Args:
            model_id_and_revision: Model ID in ``'model_id@revision'`` format.
            progress_callback: If provided, called with dicts like
                ``{"status": "loading", "model": ..., "stage": ...}`` during loading.
            tqdm_class: Optional tqdm subclass for progress bars during ``from_pretrained``.
        NzLoading r~   r   r   )rx   ry   c                 <    j                   j                  | d       S r   )rI   pop)keyr%   s    r&   <lambda>z7ModelManager.load_model_and_processor.<locals>.<lambda>  s    @R@R@V@VWZ\`@a r(   )r   r   r   readyFr   r   cachedT)rL   rJ   
setdefaultr    rK   rI   r3   warningrw   r   r   rE   r,   r   r   )r%   rp   ry   rx   lockr   r   s   `      r&   rR   z%ModelManager.load_model_and_processor   s    $$ 	Y$$//0Ey~~GWXD	Y  	k$D,>,>>*?)@AB$0%EZep&qr 001FG	(()jTe )  =G$($6$6')>a	=""#89 %0%CXdi&jk""#89EEG**+@AGG ../DEOO	$0%CXdh&ij-	k. i5	Y 	Y	k. is   /EC1EEEc                   	K   |t        j                         } j                  v r> j                     j                          dt	        j
                  ddd       d y j                  v rA j                     j                  |       	 |j                          d{   }|	 y| "|g j                  <   t        j                         dt        f fdt              		fd	 	fd
}t        j                   |              j                  <   	 |j                          d{   }|y| !7 7 w)u  Load a model and stream progress as SSE events.

        Handles three cases:
        1. Model already cached → single ``ready`` event
        2. Load already in progress → join existing subscriber stream
        3. First request → start loading, broadcast to all subscribers

        Args:
            model_id_and_revision (`str`): Model ID in ``'model_id@revision'`` format.

        Yields:
            `str`: SSE ``data: ...`` lines with progress updates.
        data: r   Tr   

Npayloadc                 j    dt        j                  |        dfd}j                  |       y )Nr   r   c                  j    j                   j                  g       D ]  } | j                          y r   )rM   get
put_nowait)qmidmsgr%   s    r&   	broadcastzEModelManager.load_model_streaming.<locals>.enqueue.<locals>.broadcastG  s1    2266sB? &ALL%&r(   )jsondumpscall_soon_threadsafe)r   r   r   loopr   r%   s     @r&   enqueuez2ModelManager.load_model_streaming.<locals>.enqueueD  s0    4::g./t4C& %%i0r(   c                      |i |S r   r=   )factoryargskwargsrx   s      r&   
_tqdm_hookz5ModelManager.load_model_streaming.<locals>._tqdm_hookO  s    t.v..r(   c                    K   	 t        j                        } 	 t        j                  j                         d {    t        j                  |        	 fd}j                  |       y 7 2# t        j                  |        w xY w# t
        $ r<}t        j                  d d| d        dt        |      d       Y d }~td }~ww xY w# fd}j                  |       w xY ww)	N)ry   rx   zFailed to load z: T)exc_infoerror)r   r   messagec                      j                   j                  g       D ]  } | j                  d         j                  j                  d        y r   )rM   r   r   rN   )r   r   r%   s    r&   _send_sentinelzKModelManager.load_model_streaming.<locals>.run_load.<locals>._send_sentinelf  sG    !66::3C +T*+''++C6r(   )
r   set_tqdm_hookasyncio	to_threadrR   	Exceptionr3   r   r   r   )	previous_hooker   r   r   r   r   r%   rx   s	      r&   run_loadz3ModelManager.load_model_streaming.<locals>.run_loadR  s     : !( 5 5j A9!++55*1#-	   ))-87
 )).9% ))-8 Nse2aS9DI7SSVLMMN
7
 )).9si   C4B &A6 A4A6 B C C44A6 6BB 	C2CC CC C11C4)r   QueuerI   r,   r   r   rN   rM   appendr   get_running_loopdictr   create_task)
r%   rp   queueitemr   r   r   r   r   rx   s
   `    @@@@@r&   load_model_streamingz!ModelManager.load_model_streaming  sP     $+2==? $$$$s#//14::3RV&WXYY]^^ $%%%%%c*11%8"YY[(< 
	  +0!!#&'')	1T 	1 .gs;
	/	: 	:6 $+#6#6xz#BC $D|J	 k )l %s%   B#E+E,BEEEEc                 t    t        | j                  j                               D ]  }|j                           y)z,Delete all loaded models and free resources.N)listrI   valuesr1   )r%   timeds     r&   shutdownzModelManager.shutdownu  s1    $,,3356 	!E 	!r(   r   r   r   c                 &   | t        |t              rt        j                  S ddlm}m} | j                  j                  }||j                         v rt        j                  S ||j                         v rt        j                  S t        d|       )a  Detect whether a model is an LLM or VLM based on its architecture.

        Args:
            model (`PreTrainedModel`): The loaded model.
            processor (`ProcessorMixin | PreTrainedTokenizerFast`, *optional*):
                If a plain tokenizer (not a multi-modal processor), short-circuits to LLM.

        Returns:
            `Modality`: The detected modality (``Modality.LLM`` or ``Modality.VLM``).
        r   !MODEL_FOR_CAUSAL_LM_MAPPING_NAMES*MODEL_FOR_IMAGE_TEXT_TO_TEXT_MAPPING_NAMESzUnknown modality for: )rX   r	   r   LLM&transformers.models.auto.modeling_autor   r   	__class__r8   r   VLMrY   )r   r   r   r   model_classnames        r&   get_model_modalityzModelManager.get_model_modalityz  s      Z	;R%S<<	

  //22HOOQQ<< A H H JJ<<5o5FGHHr(   	cache_dirc           	         ddl m}m} g }t        j	                  d       t        t        |       j                        D ]<  }|j                  dk7  r|j                  j                         D ]  \  }}t        d |j                  D        d      }|s't        j                  |j                         j!                               }t#        |t$              rd|v sm|d   }	|j'                         |j'                         t)        fd|	D              sd	|j*                  v r|j*                  j-                  d	      nd
}
|j*                  |dk7  rd| nd
z   }|j/                  |
|d|j0                  d        ? |S )a^  List generative models (LLMs and VLMs) available in the HuggingFace cache.

        Args:
            cache_dir (`str`, *optional*): Path to the HuggingFace cache directory.
                Defaults to the standard cache location.

        Returns:
            `list[dict]`: OpenAI-compatible model list entries with ``id``, ``object``, etc.
        r   r   z/Scanning the cache directory for LLMs and VLMs.r   c              3   T   K   | ]   }|j                   d k(  s|j                   " yw)zconfig.jsonN)	file_name	file_path).0fs     r&   	<genexpr>z.ModelManager.get_gen_models.<locals>.<genexpr>  s#     #mAPQP[P[_lPlAKK#ms   ((Nr   c              3   4   K   | ]  }|g v s|  y wr   r=   )r   archllmsvlmss     r&   r   z.ModelManager.get_gen_models.<locals>.<genexpr>  s      P4$9OtPs   / mainrh   )owned_byidobjectcreated)r   r   r   r3   r   r   r   repos	repo_typerefsitemsnextfilesr   loadsopenreadrX   r   r   anyrepo_idru   r   last_modified)r   r   r   generative_modelsreporefrevision_infoconfig_pathr   r   authorrepo_handler   r   s               @@r&   get_gen_modelszModelManager.get_gen_models  s]   	

 HI	2889 	D~~(&*iioo&7 "]"#m9L9L#most"K$4$4$6$;$;$=>"640_5N & 78??AAHHJPPP8;t||8KT\\//4QSF"&,,sf}AcU)RT"UK%,,(."-&-'+'9'9	!		: ! r(   )rU   rU   FNNi,  Nr5   r6   r   )r8   r9   r:   r;   r   boolr<   r'   staticmethodrP   rQ   rS   r   ro   rw   typer   r   rR   r   r   r   r   r   r   r   r   r=   r(   r&   r?   r?   ]   s   " ""'*.#' "&%P%P Tz%P  	%P
 !4Z%P Dj%P %P 4Z%PN 
cDj 
 
& "S "S " "
);d)B 
	tS 	t=g 	t pt F%( F6:Tk F]ehl]l F	 FJ .2"&	) ")  $d?)  4K	) 
 
L) VT Tl!
 aeI I-^I	I I: /!#* /!T
 /!  /!r(   r?   )r;   r   r/   r   r    collections.abcr   	functoolsr   typingr   huggingface_hubr   r   rt   r   r	   utilsr   r   r   r   r   r   r   
get_loggerr8   r3   r   r?   r=   r(   r&   <module>r      sn     	   $    *   D  H H UU 
		H	%/u /udl! l!r(   