
    i4                         d Z ddlZddlZddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ  ej                  e      Z G d d	      Zd
e_         y)z+
CLI entry point for `transformers serve`.
    N)	Annotated)logging)is_serve_available   )set_torch_seedc            .       R   e Zd Z	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 	 d2deedz   ej                  d      f   dee ej                  d      f   dee	dz   ej                  d      f   d	ee	dz   ej                  d
      f   dee	dz   ej                  d      f   dee
dz   ej                  d      f   deedz   ej                  d      f   deedz   ej                  d      f   dee ej                  d      f   deedz   ej                  d      f   dee ej                  d      f   deedz   ej                  d      f   dee ej                  d      f   dee	 ej                  d      f   dee ej                  d       f   d!ee	 ej                  d"      f   d#ee ej                  d$      f   d%ee ej                  d&      f   d'ee	dz   ej                  d(      f   d)ee ej                  d*d+,      f   d-df*d.Zd/ Zd0 Zd1 Zy)3ServeNforce_modelz*Model to preload and use for all requests.)helpcontinuous_batchingzMEnable continuous batching with paged attention. Configure with --cb-* flags.cb_block_sizez6KV cache block size in tokens for continuous batching.cb_num_blocksz2Number of KV cache blocks for continuous batching.cb_max_batch_tokensz1Maximum tokens per batch for continuous batching.cb_max_memory_percentz/Max GPU memory fraction for KV cache (0.0-1.0).cb_use_cuda_graphz+Enable CUDA graphs for continuous batching.attn_implementationz2Attention implementation (e.g. flash_attention_2).compilez*Enable torch.compile for faster inference.quantizationz.Quantization method: 'bnb-4bit' or 'bnb-8bit'.devicez4Device for inference (e.g. 'auto', 'cuda:0', 'cpu').dtypez2Override model dtype. 'auto' derives from weights.trust_remote_codezTrust remote code when loading.model_timeoutzGSeconds before idle model is unloaded. Ignored when force_model is set.hostzServer listen address.portzServer listen port.enable_corszEnable permissive CORS.	log_levelz'Logging level (e.g. 'info', 'warning').default_seedzDefault torch seed.non_blockingTz1Run server in a background thread. Used by tests.)hiddenr   returnc           	         t               st        d      dd l}ddlm} ddlm} ddlm} ddl	m
} ddlm} dd	lm} |t        |       t!        j"                  d
      }|j%                  t         j&                  |j)                                    ||||||
||      | _        ddlm} |||||dj1                         D ci c]
  \  }}||| } }}| r |di | nd }! |||	|!      | _         || j*                  | j2                        | _         || j*                  | j2                        | _         || j*                  | j2                        | _         || j*                  | j4                  | j6                  | j8                  |      }"|j;                  |"||d      }#|j=                  |#      | _        |r| jA                          y | j>                  jC                          y c c}}w )NzRMissing dependencies for serving. Install with `pip install transformers[serving]`r   r   )ChatCompletionHandler)ModelManager)ResponseHandler)build_server)TranscriptionHandler)GenerationStatetransformers)r   r   r   r   r   r   r
   )ContinuousBatchingConfig)
block_size
num_blocksmax_batch_tokensmax_memory_percentuse_cuda_graph)r   r   	cb_config)model_managergeneration_state)response_handlertranscription_handlerr   info)r   r   r    )"r   ImportErroruvicornserving.chat_completionr"   serving.model_managerr#   serving.responser$   serving.serverr%   serving.transcriptionr&   serving.utilsr'   r   r   
get_loggersetLevel
log_levelslower_model_managerr(   r)   items_generation_state_chat_handler_response_handler_transcription_handlerConfigServerserverstart_serverrun)$selfr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r7   r"   r#   r$   r%   r&   r'   transformers_loggerr)   kv	cb_kwargsr/   appconfigs$                                       g/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/cli/serve.py__init__zServe.__init__"   s   \ "#rssB750?2 #<( &00@$$W%7%7	8I%JK*/ 3%'#
 	:
 ,+$7&;"3 eg

1 } qD

	 

 >G,9y9D	!0 3"
 3--!33

 "1--!33"

 ';4;N;NPTPfPf&g#!33"&"="=#
 $TVLnnV,KKOOY

s   G'c                 ~      fd}t        j                  |dd       _         j                  j                          y )Nc                      t        j                         } t        j                  |        | j                  j                  j                                y )N)asyncionew_event_loopset_event_looprun_until_completerJ   serve)looprM   s    rT   _runz Serve.start_server.<locals>._run   s:    ))+D""4(##DKK$5$5$78    zuvicorn-threadF)targetnamedaemon)	threadingThread_threadstart)rM   r^   s   ` rT   rK   zServe.start_server   s2    	9
 !''t:JSXYr_   c                 8    | j                   j                          y)z$Clear all loaded models from memory.N)rB   shutdownrM   s    rT   reset_loaded_modelszServe.reset_loaded_models   s    $$&r_   c                    | j                   j                          | j                  j                          | j                  r| j                  j	                         sy d| j
                  _        | j                  j                  d       y )NT   )timeout)rD   rh   rB   re   is_aliverJ   should_exitjoinri   s    rT   kill_serverzServe.kill_server   s`    '')$$&||4<<#8#8#:"&!$r_   )NFNNNNNNFNautorr   Fi,  	localhosti@  FwarningNF)__name__
__module____qualname__r   strtyperArgumentboolOptionintfloatrU   rK   rj   rq   r5   r_   rT   r	   r	   !   sh    qu
       di lrpvch LWIMUZbkX\ YysTz>5>>?k+llmy 'ELLmnp
	y !$J*bcc
y !$J*^__
y '$J*]^^
y"  )DL,%,,,]^^ 
#y( %4K+XYY
)y. '$J*^__
/y4 43_!``a5y6  $J*Z[[
7y< #|u||1ghhi=y> t\U\\7k%llm?y@ %T<5<<=^+_%_`AyB !#lmm
CyJ \U\\/GHHIKyL \U\\/DEEFMyN t\U\\7P%QQROyP S,%,,4]"^^_QyR  d
LELL>S,T TUSyT  ,%,,d1dee
UyZ 
[yv'%r_   r	   u  
Run a FastAPI server to serve models on-demand with an OpenAI compatible API.
Models will be loaded and unloaded automatically based on usage and a timeout.


Endpoints:
    POST /v1/chat/completions — Chat completions (streaming + non-streaming).
    GET  /v1/models           — Lists available models.
    GET  /health              — Health check.

Requires FastAPI and Uvicorn: pip install transformers[serving]
)__doc__rX   rc   typingr   ry   transformers.utilsr   transformers.utils.import_utilsr   r=   r   r>   ru   loggerr	   r5   r_   rT   <module>r      sK        & > ) 
		H	%O% O%dr_   