
    i                     d   d dl mZ d dlmZ d dlZd dlmc mZ d dl	m
Z
 d dlmZ ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddlmZmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z,m-Z-m.Z. ddl/m0Z0 ddl1m2Z2 ddl3m4Z4m5Z5m6Z6m7Z7m8Z8 ddl9m:Z:m;Z; ddl<m=Z= ddl>m?Z? ddl@mAZAmBZBmCZC  e(       r	  e)j                  eE      ZF e&d      e
 G d de?                    ZG e&d      e
 G d de2                    ZH e&d      e
 G d  d!e                    ZIe& G d" d#e!             ZJe e&d$%       G d& d'e                    ZK G d( d)e:      ZL G d* d+e;      ZM G d, d-eC      ZN G d. d/ej                        ZP G d0 d1ej                        ZQ G d2 d3eB      ZR G d4 d5eA      ZS G d6 d7e0      ZT G d8 d9ej                        ZU G d: d;e8      ZV G d< d=e7      ZW G d> d?e5      ZX G d@ dAe6      ZY G dB dCej                        ZZ G dD dEej                        Z[ G dF dGej                        Z\ G dH dIej                        Z] G dJ dKe4      Z^ G dL dMej                        Z_ G dN dOej                        Z` e&dP%       G dQ dReJ             Za G dS dTeJe      Zbg dUZcy)V    )Callable)	dataclassN)strict)nn   )initialization)ACT2FN)Cache)PreTrainedConfig)%ClassifierFreeGuidanceLogitsProcessorGenerationMixinGenerationModeLogitsProcessorList)GenerateDecoderOnlyOutput)BaseModelOutputBaseModelOutputWithPoolingModelOutput)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringcan_return_tupleis_vision_availableloggingtorch_compilable_check   )CONFIG_MAPPING
AutoConfig	AutoModel)Blip2VisionModel)ChameleonVQVAEConfig)ChameleonVQVAEChameleonVQVAEEncoderAttnBlock#ChameleonVQVAEEncoderConvDownsample ChameleonVQVAEEncoderResnetBlockChameleonVQVAEVectorQuantizer)IdeficsBaseModelOutputWithPastIdeficsCausalLMOutputWithPast)eager_attention_forward)SiglipVisionConfig)SiglipEncoderSiglipEncoderLayerSiglipVisionEmbeddingszdeepseek-community/Janus-Pro-1B)
checkpointc                      e Zd ZU dZdZeed<   dZeed<   dZeed<   dZ	ee
e   z  eeef   z  ed	<   d
Zeed<   dZeez  ed<   dZeed<   dZeez  ed<   dZeed<   dZeez  ed<   dZeed<   dZeed<   dZeed<   dZeed<    e       Zy)JanusVisionConfigz
    projection_dropout (`float`, *optional*, defaults to 0.0):
        Dropout probability for the projection layer.
    num_image_tokens (`int`, *optional*, defaults to 576):
        Number of image tokens.
    i   hidden_size   num_hidden_layers   num_attention_headsi  
image_sizegelu
hidden_actg      @	mlp_ratioTattention_bias        hidden_dropout_rate   projection_dimprojection_dropoutFuse_qk_norm{Gz?initializer_ranger   depthi@  num_image_tokensN)__name__
__module____qualname____doc__r2   int__annotations__r4   r6   r7   listtupler9   strr:   floatr;   boolr=   r?   r@   rA   rC   rD   rE   AttributeErrorintermediate_size     x/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/janus/modular_janus.pyr1   r1   @   s     Ks!!47Jd3i%S/17J Ius{ ND'**NC&))K#u#E3Nc&(rT   r1   c                   8   e Zd ZU dZdZeed<   dZeed<   dZe	ed<   dZ
eed	<   d
Zeed<   dZeed<   dZeed<   dZeed<   dZee   eedf   z  ed<   dZeed<   dZeez  ed<   dZeed<   dZeed<   dZeed<   dZeed<   dZeed<    e       Z e       Z e       Zy )!JanusVQVAEConfiga  
    base_channels (`int`, *optional*, defaults to 128):
        Base channel count.
    channel_multiplier (`list[int]`, *optional*, defaults to `[1, 1, 2, 2, 4]`):
        Channel multipliers for each resolution.
    num_res_blocks (`int`, *optional*, defaults to 2):
        Number of residual blocks.
    num_patches (`int`, *optional*, defaults to 32):
        Num of patches the input images can be divided into.
    out_channels (`int`, *optional*, defaults to 3):
        Number of out channels.
    image_token_embed_dim (`int`, *optional*, defaults to 2048):
        Dimension of image embeddings. It should be same as the dimensionality of text embeddings.
       	embed_dimi @  num_embeddingsFdouble_latent   latent_channels    num_patchesr   in_channelsout_channels   base_channels)   rd   r   r      .channel_multiplierr   num_res_blocksr<   dropoutrB   rC   r>   r?   r4   r8   r9   image_token_embed_dimN)rF   rG   rH   rI   rY   rJ   rK   rZ   r[   rP   r]   r_   r`   ra   rc   rf   rL   rM   rg   rh   rO   rC   r?   r4   r9   rN   ri   rQ   
resolutionattn_resolutions	attn_typerS   rT   rU   rW   rW   [   s     IsNCM4OSKKL#M36ES	E#s(O3ENCGUS[#u#NCsJ!%3%!J%' IrT   rW   c                        e Zd ZU dZdZeeedZdZ	e
ez  dz  ed<   dZe
ez  dz  ed<   dZe
ez  dz  ed<   dZeed	<    fd
Z xZS )JanusConfiga  
    Example:

    ```python
    >>> from transformers import JanusForConditionalGeneration, JanusConfig, JanusVisionConfig, JanusVQVAEConfig, LlamaConfig

    >>> # Initializing a Janus vision config
    >>> vision_config = JanusVisionConfig()

    >>> # Initializing a Llama config
    >>> text_config = LlamaConfig()

    >>> # Initializing a VQ config
    >>> vq_config = JanusVQVAEConfig()

    >>> # Initializing a Janus Pro 1B style configuration
    >>> configuration = JanusConfig(vision_config=vision_config, text_config=text_config, vq_config=vq_config)

    >>> # Initializing a model from the Janus Pro 1B style configuration
    >>> model = JanusForConditionalGeneration(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```janus)text_configvision_config	vq_configNrp   rq   rr   i image_token_idc                 r   t        | j                  t              rT| j                  j                  dd      | j                  d<   t	        | j                  d      di | j                  | _        n3| j                  't
        j                  d       t	        d          | _        | j                  %t
        j                  d       t               | _        n4t        | j                  t              rt        di | j                  | _        | j                  %t
        j                  d       t               | _	        n4t        | j                  t              rt        di | j                  | _	        | j                  j                  | j                  j                  z  | j                  _        t        | <  di | y )N
model_typellamaz7`text_config` is None. Initializing with default valueszK`vision_config` is None. Initializing with default JanusVisionConfig valueszF`vq_config` is None. Initializing with default JanusVQVAEConfig valuesrS   )
isinstancerp   dictgetr   loggerinforq   r1   rr   rW   r7   
patch_sizer_   super__post_init__)selfkwargs	__class__s     rU   r~   zJanusConfig.__post_init__   sN   d&&--1-=-=-A-A,PW-XD\*-d.>.>|.LMaPTP`P`aD%KKQR-g68D%KKef!2!4D**D1!2!HT5G5G!HD>>!KK`a-/DN--??DN &*%7%7%B%BdFXFXFcFc%c"''rT   )rF   rG   rH   rI   ru   r   r1   rW   sub_configsrp   rx   r   rK   rq   rr   rs   rJ   r~   __classcell__r   s   @rU   rn   rn      st    2 J!*%K 37K((4/648M4**T1804It&&-4 NC ( (rT   rn   c                   R     e Zd ZU eed<   dZdZdZddgZddgZ	dZ
dZdZ fd	Z xZS )
JanusPreTrainedModelconfigmodelimagetextTLlamaDecoderLayerJanusVisionEncoderLayerpast_key_valuescausal_maskc                     t         |   |       t        |t              rZt	        j
                  |j                  t        j                  |j                  j                  d         j                  d             y y )N)rd   r   )r}   _init_weightsrw   JanusVisionEmbeddingsinitcopy_position_idstorcharangeshapeexpand)r   moduler   s     rU   r   z"JanusPreTrainedModel._init_weights   s[    f%f34JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 5rT   )rF   rG   rH   rn   rK   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_supports_flash_attn_supports_sdpa_can_compile_fullgraphr   r   r   s   @rU   r   r      sO    (&*#,.GH#4m"DN!i irT   r   z9
    Base class for Janus VQ-VAE mode model outputs.
    )custom_introc                   b    e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   y)JanusVQVAEOutputz
    decoded_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
        Reconstructed pixel values after encoding and decoding the input.
    embedding_loss (`torch.FloatTensor`):
        Embedding loss.
    Ndecoded_pixel_valuesembedding_loss)	rF   rG   rH   rI   r   r   FloatTensorrK   r   rS   rT   rU   r   r      s4     6:%++d29/3NE%%,3rT   r   c                       e Zd Zy)JanusBaseModelOutputWithPastNrF   rG   rH   rS   rT   rU   r   r          rT   r   c                       e Zd Zy)JanusCausalLMOutputWithPastNr   rS   rT   rU   r   r      r   rT   r   c                   J    e Zd Zddej                  dedej                  fdZy)r   pixel_valuesinterpolate_pos_encodingreturnc                 X   |j                   \  }}}}| j                  j                  j                  }| j                  |j	                  |            }|j                  d      j                  dd      }|r| j                  |||      }	n| j                  | j                        }	||	z   }|S )N)dtyper   rd   )
r   patch_embeddingweightr   toflatten	transposer   position_embeddingr   )
r   r   r   _heightwidthtarget_dtypepatch_embeds
embeddings
pos_embedss
             rU   forwardzJanusVisionEmbeddings.forward   s    *001fe++2288++LOO,O,OP!))!,66q!<
#66z65QJ001B1BCJ*,
rT   N)F)rF   rG   rH   r   TensorrP   r   rS   rT   rU   r   r      s'    ELL D ]b]i]i rT   r   c                   t     e Zd ZdZdef fdZ	 d	dej                  dej                  dz  dee	   fdZ
 xZS )
JanusVisionAttentionz(Attention Class for Janus Vision Encoderr   c                 F   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _
        |j                  }|j                  }d| _        d| _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                  | j                  z  |j"                        | _        t        j                   | j                  | j                        | _        |dkD  rt        j,                  |      nt        j.                         | _        |rt        j0                  | j                        nt        j.                         | _        |r%t        j0                  | j                        | _        y t        j.                         | _        y )	Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      Frd   biasr   )r}   __init__r   r2   rY   r6   	num_headshead_dim
ValueErrorscaleattention_dropoutr@   rA   	is_causalnum_key_value_groupsr   Linearr;   q_projk_projv_projprojection_layerDropoutIdentity	LayerNormq_normk_norm)r   r   proj_dropoutqk_normr   s       rU   r   zJanusVisionAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
!'!9!900$$ %&!ii0NU[UjUjkii0NU[UjUjkii0NU[UjUjk "		$..$.. I>JQ>N"**\":TVT_T_Ta6=bll4>>22;;=6=bll4>>22;;=rT   Nhidden_statesattention_maskr   c                 "   |j                         \  }}}| j                  |      }| j                  |      }| j                  |      }	|j	                  d| j
                  | j                        }| j                  |      }|j	                  d| j
                  | j                        }| j                  |      }|j	                  ||| j
                  | j                        j                  dd      }|j	                  ||| j
                  | j                        j                  dd      }|	j                  ||| j
                  | j                        j                  dd      }	t        j                  | j                  j                  t              }
 |
| |||	|f| j                   sdn| j"                  | j$                  | j&                  d|\  }}|j	                  ||| j(                        }| j+                  |      }| j-                  |      }||fS )Nr   rd   r   r<   )rh   scalingr   )sizer   r   r   reshaper   r   r   r   r   viewr   get_interfacer   _attn_implementationr*   trainingr   r   r   rY   r   r@   )r   r   r   r   
batch_sizeseq_lenr   query_states
key_statesvalue_statesattention_interfaceattn_outputattn_weightsoutputs                 rU   r   zJanusVisionAttention.forward"  s    "/!3!3!5
GQ{{=1[[/
{{=1#++BN{{<0''DNNDMMJ
[[,
#++JQUQ^Q^_iijkmno''
GT^^T]][eefgijk
#((Wdnndmm\ffghjkl(?(M(MKK,,.E)
 %8
%
  $}}C$2H2HJJnn
%
 
%
!\ "))*gt~~N&&{3((0|##rT   N)rF   rG   rH   rI   r1   r   r   r   r   r   r   r   r   s   @rU   r   r     sO    2Q0 Q@ /3)$||)$ t+)$ +,	)$rT   r   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )JanusVisionMLPr   c                    t         |           || _        t        |j                  |j
                  z        | _        t        |j                     | _	        t        j                  |j                  | j                        | _        t        j                  | j                  |j                        | _        t        j                  |j                        | _        t        j                  |j                        | _        y r   )r}   r   r   rJ   r2   r:   rR   r	   r9   activation_fnr   r   fc1fc2r   r=   dropout1dropout2r   r   r   s     rU   r   zJanusVisionMLP.__init__O  s    !$V%7%7&:J:J%J!K#F$5$5699V//1G1GH99T33V5G5GH

6#=#=>

6#=#=>rT   r   r   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j	                  |      }|S r   )r   r   r   r   r   r   r   s     rU   r   zJanusVisionMLP.forwardY  sP    /**=9m4/m4rT   )	rF   rG   rH   r1   r   r   r   r   r   r   s   @rU   r   r   N  s+    ?0 ?U\\ ell rT   r   c                   $     e Zd Zdef fdZ xZS )r   r   c                 T   t         |   |       || _        |j                  | _        t        |      | _        t        j                  | j                  |j                        | _
        t        j                  | j                  |j                        | _        t        |      | _        y )N)eps)r}   r   r   r2   rY   r   	self_attnr   r   layer_norm_epslayer_norm1layer_norm2r   mlpr   s     rU   r   z JanusVisionEncoderLayer.__init__c  sv     ++-f5<<F<Q<QR<<F<Q<QR!&)rT   rF   rG   rH   r1   r   r   r   s   @rU   r   r   b  s    *0 * *rT   r   c                   $     e Zd Zdef fdZ xZS )JanusVisionEncoderr   c                     t         |   |       t        j                  t	        |j
                        D cg c]  }t        |       c}      | _        y c c}w r   )r}   r   r   
ModuleListranger4   r   layersr   r   r   r   s      rU   r   zJanusVisionEncoder.__init__n  s@     mmeTZTlTlNm$n%<V%D$no$ns   Ar  r   s   @rU   r  r  m  s    p0 p prT   r  c            
       r     e Zd ZeedZdef fdZ	 	 d
dej                  dz  de
dee   deez  fd	Z xZS )JanusVisionModelr   
attentionsr   c                 D    t         |   |       t        |      | _        y r   )r}   r   r  encoderr   s     rU   r   zJanusVisionModel.__init__y  s     )&1rT   Nr   r   r   r   c                     |t        d      | j                  ||      } | j                  dd|i|}|j                  }| j	                  |      }|d d dd d f   }| j	                  |      }t        ||      S )Nz You have to specify pixel_values)r   inputs_embedsr   )last_hidden_statepooler_outputrS   )r   r   r  r  post_layernormr   )r   r   r   r   r   encoder_outputsr  pooled_outputs           rU   r   zJanusVisionModel.forward}  s     ?@@Ogh+74<< ,
',
,

 ,== //0AB)!Q'2++M:)/'
 	
rT   NF)rF   rG   rH   r   r   _can_record_outputsr1   r   r   r   rP   r   r   rM   r   r   r   r   s   @rU   r  r  s  sh    0*
20 2 26).
''$.
 #'
 +,	

 
+	+
rT   r  c                   *     e Zd Zdef fdZd Z xZS )JanusVisionAlignerMLPr   c           	         t         |           t        j                  |j                  |j
                        | _        t        j                  t        d|j                        D cg c],  }t        j                  |j
                  |j
                        . c}      | _
        t        |j                     | _        y c c}w Nrd   )r}   r   r   r   r2   r?   r   r	  r
  rD   hidden_layersr	   r9   r   r  s      rU   r   zJanusVisionAlignerMLP.__init__  s    99V//1F1FG]]NSTUW]WcWcNdeRYYv,,f.C.CDe
 $F$5$56 f   &1B<c                 |    | j                  |      }| j                  D ]  }| j                  |      } ||      } |S r   r   r   r   r   r   layers      rU   r   zJanusVisionAlignerMLP.forward  G    /'' 	1E ..}=M!-0M	1 rT   )rF   rG   rH   r1   r   r   r   r   s   @rU   r  r    s    70 7rT   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZ	S )JanusVQVAEVectorQuantizerr   c                 N    t         |   |       |j                  gdz  | _        y )Nr   )r}   r   r_   quant_state_dimsr   s     rU   r   z"JanusVQVAEVectorQuantizer.__init__  s&     !'!3!3 4q 8rT   image_tokensr   c                 B   |j                   d   }| j                  j                  j                   d   }| j                  |      }t        j                  |dd      }|j                  |g| j                  |      }|j                  dddd      j                         }|S )Nr   r   r   )pdimr   rd   )	r   	embeddingr   F	normalizer   r*  permute
contiguous)r   r+  r   emb_dimhidden_state_quants        rU   get_codebook_entryz,JanusVQVAEVectorQuantizer.get_codebook_entry  s    !''*
~~,,2226 "^^L9[[);qbI 044j5b4CXCX5bZa5bc/771aCNNP!!rT   )
rF   rG   rH   rW   r   r   
LongTensorr   r6  r   r   s   @rU   r(  r(    s/    9/ 9"u/?/? "EDUDU "rT   r(  c                       e Zd Zy)JanusVQVAEResnetBlockNr   rS   rT   rU   r9  r9    r   rT   r9  c                       e Zd Zy)JanusVQVAEAttnBlockNr   rS   rT   rU   r;  r;    r   rT   r;  c                       e Zd Zy)JanusVQVAEConvDownsampleNr   rS   rT   rU   r=  r=    r   rT   r=  c                   $     e Zd Z fdZd Z xZS )JanusVQVAEConvUpsamplec                 t    t         |           t        j                  j	                  ||ddd      | _        y )Nr   rd   kernel_sizestridepadding)r}   r   r   r   Conv2dconv)r   r`   r   s     rU   r   zJanusVQVAEConvUpsample.__init__  s.    HHOOK!TU_`Oa	rT   c                 X    t        j                  |dd      }| j                  |      }|S )Ng       @nearest)scale_factormode)r0  interpolaterF  r   s     rU   r   zJanusVQVAEConvUpsample.forward  s(    m#IV		-0rT   )rF   rG   rH   r   r   r   r   s   @rU   r?  r?    s    brT   r?  c                   `     e Zd Zdedef fdZdej                  dej                  fdZ xZ	S )JanusVQVAEMidBlockr   channelsc                     t         |           t        |||      | _        t	        |      | _        t        |||      | _        y )Nr   r`   ra   )r}   r   r9  block_1r;  attn_1block_2)r   r   rN  r   s      rU   r   zJanusVQVAEMidBlock.__init__  sF    , !

 *(3, !
rT   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rQ  rR  rS  r   s     rU   r   zJanusVQVAEMidBlock.forward  s2    ]3M2]3rT   )
rF   rG   rH   rW   rJ   r   r   r   r   r   r   s   @rU   rM  rM    s2    
/ 
3 
U\\ ell rT   rM  c                   >     e Zd Z fdZdej
                  fdZ xZS )JanusVQVAEEncoderc           	         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }|j                  }|j                  }|j                  }t        j                  j                  ||ddd      | _        dt        |      z   }|| _        t        j                          | _        t%        | j                        D ]   }t        j                          }	t        j                          }
|||   z  }|||   z  }t%        | j
                        D ]N  }|	j'                  t)        |||             |}|| j                  dz
  k(  s5|
j'                  t+        |             P t        j,                         }|	|_        |
|_        || j                  dz
  k7  rt3        |      |_        | j"                  j'                  |        t7        |      | _        t        j                  j;                  d|dd	      | _        t        j                  j                  ||rd
|z  n|ddd      | _        y )Nr   rd   rA  )rd   rP  r^   ư>T
num_groupsnum_channelsr   affiner   ) r}   r   lenrf   num_resolutionsrg   rc   r`   r[   r]   r   r   rE  conv_inrM   in_channel_multiplierr	  downr
  appendr9  r;  Moduleblockattnr=  
downsamplerM  mid	GroupNormnorm_outconv_out)r   r   rc   r`   r[   r]   rf   r`  i_levelrd  re  block_in	block_outi_blockra  r   s                  rU   r   zJanusVQVAEEncoder.__init__  s   "6#<#<=$33,,((,, 00#66xx{MqYZdef $u-?'@ @%:"MMO	T112 	#GMMOE==?D$'<W'EEH%(:7(CCI !4!45 
?)%$,%. %d22Q66KK 3H =>
? 99;DDJDI$..22":8"DIIT"-	#0 &fh7**bxUYbf*g#0Ao ( 
rT   r   c                    | j                  |      g}t        | j                        D ]  }t        | j                        D ]  } | j                  |   j
                  |   |d         }t        | j                  |   j                        dkD  r" | j                  |   j                  |   |      }|j                  |        || j                  dz
  k7  s|j                  | j                  |   j                  |d                 |d   }| j                  |      }| j                  |      }|t        j                  |      z  }| j                  |      }|S )Nr   r   rd   )r_  r
  r^  rg   ra  rd  r]  re  rb  rf  rg  ri  r   sigmoidrj  )r   r   r   rk  rn  hidden_stater  s          rU   r   zJanusVQVAEEncoder.forward!  sT   l34T112 		WG !4!45 3@tyy177@!"%  tyy)../!3#C499W#5#:#:7#CL#QL$$\23 $..22$$TYYw%7%B%B=QSCT%UV		W *"- HH%67 !MM*;<U]]+<== MM*;<  rT   )rF   rG   rH   r   r   r7  r   r   r   s   @rU   rV  rV    s    1
f!E$4$4 !rT   rV  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )JanusVQVAEDecoderc           	      v   t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }|j                  }||j                  | j                  dz
     z  }t        j                  j                  ||ddd      | _        t        ||      | _        t        j                         | _        t#        t%        | j                              D ]  }t        j                         }t        j                         }||j                  |   z  }	t%        | j
                  dz         D ]N  }
|j'                  t)        |||	             |	}|| j                  dz
  k(  s5|j'                  t+        |             P t        j,                         }||_        ||_        |dk7  rt3        |      |_        | j                   j'                  |        t        j                  j7                  d|dd	      | _        t        j                  j                  ||ddd      | _        y )
Nrd   r   rA  rP  r   r^   rX  TrY  )r}   r   r]  rf   r^  rg   rc   r]   ra   r   r   rE  r_  rM  rg  r	  upreversedr
  rb  r9  r;  rc  rd  re  r?  upsamplerh  ri  rj  )r   r   rc   r]   ra   rl  rk  rd  re  rm  rn  ru  r   s               rU   r   zJanusVQVAEDecoder.__init__;  s   "6#<#<=$33,, 00** !6#<#<T=Q=QTU=U#VV xxaXYcde &fh7 --/d&:&: ;< 	GMMOE==?D%(A(A'(JJI !4!4q!89 
?)%$,%. %d22Q66KK 3H =>
? BBHBG!|4X>GGNN2)	. **bxUYbf*g,AVWabcrT   rq  r   c                 b   | j                  |      }| j                  |      }t        | j                        D ]  }t        | j                  dz         D ]l  } | j
                  |   j                  |   |      }t        | j
                  |   j                        dkD  sK | j
                  |   j                  |   |      }n || j                  dz
  k7  s| j
                  |   j                  |      } | j                  |      }|t        j                  |      z  }| j                  |      }|S )Nrd   r   )r_  rg  r
  r^  rg   ru  rd  r]  re  rw  ri  r   rp  rj  )r   rq  rk  rn  s       rU   r   zJanusVQVAEDecoder.forwardi  s    ||L1 xx- T112 	GG !4!4q!89 P>twww/55g>|Ltwww',,-1#A4777#3#8#8#A,#OLP $..22#www/88F	G }}\2l33}}\2rT   )rF   rG   rH   r   r   r   r   r   r   s   @rU   rs  rs  :  s)    ,d\E$5$5 %:K:K rT   rs  c                        e Zd Zg dZeedZdZdef fdZ	de
j                  de
j                  fdZeede
j                  dee
j                  e
j                  f   fd	              Z xZS )

JanusVQVAE)r;  r9  r(  r  r   r   c                 r    t         |   |       t        |      | _        d| _        | j                          y r  )r}   r   rs  decodergradient_checkpointing	post_initr   s     rU   r   zJanusVQVAE.__init__  s0     (0&+# 	rT   r+  r   c                    |j                   d   | j                  j                  d   | j                  j                  d   z  k7  rMt        d| j                  j                  d   | j                  j                  d   z   d|j                    d      | j                  j	                  |      }| j                  |      }| j                  |      }|S )aG  
        Decodes quantized token IDs into pixel values.
        Args:
            image_tokens (torch.LongTensor): Batch of token IDs.
        Returns:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                Pixel values decoded from the token IDs.
        rd   r   z4Expected `image_tokens` to have shape `(batch_size, z)`, but got shape `z`.)r   quantizer*  r   r6  post_quant_convr|  )r   r+  codebook_entryr   r   s        rU   decodezJanusVQVAE.decode  s     a DMM$B$B1$EHfHfghHi$iiFt}}GeGefgGhkokxkx  lJ  lJ  KL  lM  HM  GN N""."4"4!5R9  99,G,,^<||M2rT   c                     |j                   d   } | j                  |fddi|}| j                  |j                  j	                  |d            }t        ||j                        S )Nr   return_dictTr   )r   encoder  r+  r   r   r   )r   r   r   r   encode_outputsr   s         rU   r   zJanusVQVAE.forward  se     "''*
$\NtNvN#{{>+F+F+K+KJXZ+[\ 4n6S6STTrT   )rF   rG   rH   r   r9  r;  r  main_input_namerW   r   r   r7  r   r  r   r   rM   r   r   r   s   @rU   rz  rz  ~  s     /) %O/ 5#3#3 8I8I & 	U''	U 
u  %"3"33	4		U  	UrT   rz  c                   *     e Zd Zdef fdZd Z xZS )JanusVQVAEAlignerMLPr   c           	         t         |           t        j                  |j                  |j
                        | _        t        j                  t        d|j                        D cg c],  }t        j                  |j
                  |j
                        . c}      | _
        t        |j                     | _        y c c}w r  )r}   r   r   r   rY   r?   r   r	  r
  r4   r   r	   r9   r   r  s      rU   r   zJanusVQVAEAlignerMLP.__init__  s    99V--v/D/DE]]NSTUW]WoWoNpqRYYv,,f.C.CDq
 $F$5$56 rr!  c                 |    | j                  |      }| j                  D ]  }| j                  |      } ||      } |S r   r#  r$  s      rU   r   zJanusVQVAEAlignerMLP.forward  r&  rT   )rF   rG   rH   rW   r   r   r   r   s   @rU   r  r    s    7/ 7rT   r  c                   `     e Zd ZdZdef fdZdej                  dej                  fdZ	 xZ
S )JanusVQVAEHeadzOHead used for sampling tokens in image generation, replacing the usual lm head.r   c                    t         |           t        j                  |j                  |j
                        | _        t        |j                     | _	        t        j                  |j
                  |j                        | _        y r   )r}   r   r   r   ri   r?   proj_outr	   r9   r   rZ   vision_headr   s     rU   r   zJanusVQVAEHead.__init__  s^    		&">">@U@UV#F$5$5699V%:%:F<Q<QRrT   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r  r   r  r   s     rU   r   zJanusVQVAEHead.forward  s6    m4**=9((7rT   )rF   rG   rH   rI   rW   r   r   r   tensorr   r   r   s   @rU   r  r    s0    YS/ SU\\ ell rT   r  zl
    The Janus model which consists of a siglip vision backbone, a Llama language model and a VQ model.
    c                       e Zd Zdef fdZd Zd Zeede	j                  dee   deez  fd              Zd	e	j                   d
e	j                  de	j                  fdZee	 	 	 	 	 	 	 	 dd	e	j                   dz  de	j                  dz  de	j$                  dz  de	j                   dz  dedz  d
e	j                  dz  dedz  dee	j$                  z  defd              Z xZS )
JanusModelr   c                    t         |   |       || _        t        j	                  |j
                        | _        t        | j                  j                        | _        t        j	                  |j                        | _        t        j                  | j                  j                  j                  | j                  j                  j                        | _        t#        | j                  j                        | _        t'        | j                  j                        | _        t+        j,                  |j.                        | _        d| _        | j5                          y )N)r   F)r}   r   r   r  _from_configrq   vision_modelr  alignerrz  rr   vqmodelr   	EmbeddingrZ   rY   generation_embeddingsr  generation_alignerr  generation_headr    from_configrp   language_modelr}  r~  r   s     rU   r   zJanusModel.__init__  s     ,99&:N:NO,T->->-E-EF!..v/?/?@ &(\\$,,2E2E2T2TVZVbVbViViVsVs%t""6t||7J7J"K-dll.A.AB'336;M;MN&+#rT   c                 6    | j                   j                         S r   )r  get_input_embeddingsr   s    rU   r  zJanusModel.get_input_embeddings  s    ""7799rT   c                 :    | j                   j                  |       y r   )r  set_input_embeddingsr   values     rU   r  zJanusModel.set_input_embeddings  s    007rT   r   r   r   c                 p     | j                   |fddi|}| j                  |j                        |_        |S )Nr  T)r  r  r  r  )r   r   r   vision_outputss       rU   get_image_featureszJanusModel.get_image_features  s=    
 +**<TTTVT'+||N4T4T'U$rT   	input_idsr  image_featuresc                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        r   devicer   r   rd   z6Image features and image tokens do not match, tokens: z, features: )r  r   r  r   rs   longr  allsumr   	unsqueeze	expand_asr   r   numel)r   r  r  r  special_image_maskn_image_tokensn_image_featuress          rU   get_placeholder_maskzJanusModel.get_placeholder_mask   s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!rT   Nr   r   r   	use_cachelogits_to_keepc	           
      "   |d u |d uz  rt        d      | | j                         |      }|| j                  |d      j                  }
|
j	                  d|j
                  d         }|j                  |j                  |j                        }| j                  |||      }|j                  ||      } | j                  d||||||d|	}t        |j                  |j                  |j                  |j                   |
      S d       S )	NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oneT)r  r   )r  r  )r  r   r   r   r  r  )r  r   r   r  image_hidden_statesrS   )r   r  r  r  r   r   r   r  r   r  masked_scatterr  r   r  r   r   r  )r   r  r   r   r   r   r  r  r  r   image_embedsr  image_attention_mask	lm_outputs                 rU   r   zJanusModel.forward  sP    -t";<s   7D557	BM#22<T2R``L)11"m6I6I"6MNN+..}/C/C]EXEXYN#'#<#<~ $= $  *889M~^M'D'' 
')%+)
 
	 ,'99%55#11 ++0<0H
 	

 OS
 	
rT   )NNNNNNNr   )rF   rG   rH   rn   r   r  r  r   r   r   r   r   r   rM   r   r  r7  r  r   r
   rP   rJ   r   r   r   r   s   @rU   r  r    sb   { *:8 !--9?@R9S	+	+  "))":?:K:K"]b]n]n"0  .215.204(,26!%-.,
##d*,
 ''$.,
 t+	,

 &&-,
 ,
 ((4/,
 $;,
 ell*,
 
&,
  ,
rT   r  c                   V    e Zd ZddiZdZdZdef fdZd Zd Z	d	e
j                  d
e
j                  fdZee	 	 	 	 	 	 	 	 	 dde
j                  dz  de
j                   dz  de
j                  dz  de
j                  dz  dedz  de
j                   dz  de
j                  dz  dedz  dee
j                  z  dee   d
efd              Z	 	 	 	 	 	 d fd	Zde
j                  fdZ e
j4                         	 	 	 dd	e
j                  dz  de
j                  dz  dedz  f fd       Z xZS ) JanusForConditionalGenerationzlm_head.weightz(model.language_model.embed_tokens.weightr   Tr   c                     t         |   |       || _        t        |      | _        t        j                  |j                  j                  |j                  j                  d      | _
        | j                          y )NFr   )r}   r   r   r  r   r   r   rp   r2   
vocab_sizelm_headr~  r   s     rU   r   z&JanusForConditionalGeneration.__init__N  s\     '
yy!3!3!?!?ASASA^A^ejk 	rT   c                 J    | j                   j                  j                         S r   )r   r  r  r  s    rU   r  z2JanusForConditionalGeneration.get_input_embeddingsW  s    zz((==??rT   c                 N    | j                   j                  j                  |       y r   )r   r  r  r  s     rU   r  z2JanusForConditionalGeneration.set_input_embeddingsZ  s    

!!66u=rT   inputsr   c                 r    | j                   j                  |      }| j                   j                  |      }|S r   )r   r  r  )r   r  rq  s      rU   'prepare_embeddings_for_image_generationzEJanusForConditionalGeneration.prepare_embeddings_for_image_generation]  s0    zz77?zz44\BrT   Nr  r   r   r   r   r  labelsr  r  r   c
                     | j                   d|||||||d|
}|j                  }t        |	t              rt	        |	 d      n|	}| j                  |dd|ddf         }d}|4 | j                  d||| j                  j                  j                  d|
}t        |||j                  |j                  |j                  |j                        S )a  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
        )r  r   r   r   r   r  r  N)logitsr  r  )lossr  r   r   r  r  rS   )r   r  rw   rJ   slicer  loss_functionr   rp   r  r   r   r   r  r  )r   r  r   r   r   r   r  r  r  r  r   outputsr   slice_indicesr  r  s                   rU   r   z%JanusForConditionalGeneration.forwardb  s    * $** 	
%)%+'	
 	
  118B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD +#33!//)) ' ; ;
 	
rT   c           	      f    t        
|   |f|||||d|}	|s|j                  dd      s||	d<   |	S )N)r   r  r   r  is_first_iterationr  Tr   )r}   prepare_inputs_for_generationry   )r   r  r   r   r   r  r  r  r   model_inputsr   s             rU   r  z;JanusForConditionalGeneration.prepare_inputs_for_generation  sW     w<
+'))1
 
 VZZT%B+7L(rT   r+  c                 x    | j                   j                  j                  |      }|j                  dddd      }|S )a,  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.
        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
        r   r   r   rd   )r   r  r  r2  )r   r+  decoded_images      rU   decode_image_tokensz1JanusForConditionalGeneration.decode_image_tokens  s:     

**11,?%--aAq9rT   logits_processorc           	      .   |j                  dd      } | j                  |j                  dd       fi |\  }}|dk(  rt        %|   d|||d d|S |j	                         t
        j                  t
        j                  fvrt        d      |j                          | j                  |j                                ||n	t               }d|d<   |j                  t        j                  d       d	|_        |j                  |d
<   | j!                  ||j"                  |      \  }}	}|j$                  |j&                  }}
t)        |j*                        dk7  rt        d|j*                   d      |d u}| j-                  |||j&                         |j                  r:|j                  dkD  r+|j/                  t1        |j                               d |_        | j3                  ||j*                  d   |d ||      } | j4                  d|||j6                  d|\  }}| j8                  j:                  j<                  j>                  }|j*                  \  }}|jA                  dd      }|j                  dd       }|jA                  dd      }||d<   ||d d d f   |j"                  k7  ||d d d f   |jB                  d   k7  z  }||d d d f   jE                  ||jF                          | jI                         |      }|jK                  dd       @| jM                  |jN                  xs d|dz  tQ        |jR                  ||z         |      |d<   tU        jV                  ||f|
|      }|jX                  }|jZ                  }|j\                  }|j^                  }|j`                  }|r|rdnd }|r|rdnd }|r|rdnd }|r|rdnd }tc        |      D ]\  } | jd                  d||dd|}d|v r!|d   jg                  |j&                        |d<    | j8                  jh                  di |||d}| jk                  ||      }|jl                  d d dd d f   jo                         } | j8                  jq                  |       }! |||!      }"|jr                  r>tU        jt                  |"d      }#tU        jv                  |#d      jy                  d      }$ntU        jz                  |"d      }$|$|d d |f<   tU        j|                  |$|$g      }$|$j                  d      }$| j                  |$      }_ |r@|r|!fz  }|r| j                         fz  }|r|j                  z  }|r|j                  z  }|rt        |!|||j                        S |S )Ngeneration_moder   generation_config)r  r   r  guidance_scalezGot incompatible mode for Image Generation, should be one of greedy or sampling. Ensure that beam search is de-activated by setting `num_beams=1`.Tr  zU`guidance_scale` is required for CFG but not provided. Setting to default value of 5.   r  r   z;Expected input ids of shape (batch_size, seq_len), but got z3Passing `inputs embeds` is not supported currently.)r  rd   )r  input_ids_seq_lengthencoder_input_idsprefix_allowed_tokens_fnr  r  )r  r   expand_sizer   boi_token_idr   static)cache_implementationr   max_cache_lenmodel_kwargsr  rS   )r  r  r  )output_attentionsoutput_hidden_statesr   )r.  )num_samples)	sequencesscoresr  r  r   r   )Fpop_prepare_generation_configr}   generateget_generation_moder   SAMPLEGREEDY_SEARCHr   validate_validate_model_kwargscopyr   r  rz   warning_prepare_model_inputsbos_token_idr   r  r]  r   _prepare_special_tokensrb  r   _get_logits_processor_expand_inputs_for_generationnum_return_sequencesr   r  r   rE   repeatgeneration_kwargsmasked_fill_pad_token_idr  ry   _prepare_static_cacher  max
max_lengthr   zerosr  r  output_scoresoutput_logitsreturn_dict_in_generater
  r  r   r  #_update_model_kwargs_for_generationr  cloner  	do_samplesoftmaxmultinomialsqueezeargmaxcatr  r  rO   r  r   r   r   )&r   r  r   r  r   r  r  r  r  model_input_namer   r  kwargs_has_attention_maskrE   r   r   input_tokensmaskr  generated_tokensr  r  r  r  r	  
raw_scores
raw_logitsdecoder_hidden_statesdecoder_attentionsir  r  rq  r  next_token_scoresprobs
next_tokenr   s&                                        rU   r  z&JanusForConditionalGeneration.generate  sF    !**%6?*I$*I*IJJ*D1+
5;+
'<
 f$7# -"3#	
   002>;P;PR`RnRn:ooT  	""$##L$5$5$78 0@/K+QdQf %)[!++3NNrs/0,):)I)I%& 594N4N%22L5
1	#\ ")9)9vy1$MiooM^EF  %3$$>!$$%68QZcZjZj$k ++0A0P0PST0T##$IJ[JjJj$kl/3,  55/!*!3'%)- 6 
 #E$"D"D #
))>>#
 	#
	<  ::2299JJ'oo
G ''1-%))*:DA'..q!4)7%& Z[!^,0A0N0NNa(,=,O,OP^,__
 	Z[!^$11$8I8V8VW3113LA-t4<.2.H.H%6%K%K%Wx%>!"3">">@PSZ@Z[) /I /L*+ !;;
4D'EU[ab .??0EE)77)77"3"K"K3RD
3RD
'>CW^b$;@QRX\'( (	UA >4== +|X\`lL  </1=>N1O1R1RS`SgSg1h-./djj// "3%9G  CCG\ZL"44QAX>DDFL ZZ//=F 0F C !**&7R@"..u!DLLRP
"\\*;D
%/QT" J
#;<J#--b1J HHTMQ(	UT #vi'
|11355
 "g&8&88"#%)>)>>%",*!-3 ' 7 7  $#rT   )	NNNNNNNNr   )NNNNNF)NNN)rF   rG   rH   _tied_weights_keysoutput_modalitiesr   rn   r   r  r  r   r   r  r   r   r7  r   r
   rP   rJ   r   r   r   r   r  r  no_gradr   r  r   r   s   @rU   r  r  I  s   *,VW)!{ @>ell u|| 
  .215.204(,26*.!%-./
##d*/
 ''$./
 t+	/

 &&-/
 /
 ((4//
   4'/
 $;/
 ell*/
 +,/
 
%/
  /
h  @
 
 U]]_ '+267;	$t#$ ((4/$ .4	$ $rT   r  )r   r  r  rz  r  rW   r1   rn   )dcollections.abcr   dataclassesr   r   torch.nn.functionalr   
functionalr0  huggingface_hub.dataclassesr    r   r   activationsr	   cache_utilsr
   configuration_utilsr   
generationr   r   r   r   generation.utilsr   modeling_outputsr   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   autor   r   r    blip_2.modeling_blip_2r!   !chameleon.configuration_chameleonr"   chameleon.modeling_chameleonr#   r$   r%   r&   r'   idefics.modeling_ideficsr(   r)   llama.modeling_llamar*   siglip.configuration_siglipr+   siglip.modeling_siglipr,   r-   r.   
get_loggerrF   rz   r1   rW   rn   r   r   r   r   r   rc  r   r   r   r  r  r  r(  r9  r;  r=  r?  rM  rV  rs  rz  r  r  r  r  __all__rS   rT   rU   <module>r;     s   % !    .  & !   3 u u 9 X X F &  9 8 5 D  e : < ^ ^ 			H	%
 <=)* )  >)2 <=#!+ #!  >#!L <=<(" <(  ><(~ i? i i$ 
	4{ 	4 	4	#A 		"? 	2 "I$299 I$XRYY (*0 *p p#
' #
LBII $" = "*	< 		8 		B 	RYY  ,J!		 J!ZA		 AH2U 2Uj299 $RYY   
l
% l

l
^x$$8/ x$v		rT   