
    i                        d dl Z d dlmZ d dlmZ d dlZd dlmZ d dlmc m	Z
 ddlmZ ddlmZ ddlmZ ddlmZmZ dd	lmZ dd
lmZ ddlmZmZmZmZ ddlmZ ddl m!Z! ddl"m#Z#m$Z$ ddl%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/m0Z0  ejb                  e2      Z3ee G d de                    Z4 G d de&      Z5 G d de'      Z6 G d dejn                        Z8 G d de$      Z9 G d dejn                        Z: G d  d!ejn                        Z; G d" d#ejn                        Z< G d$ d%ejn                        Z= G d& d'ejn                        Z> G d( d)ejn                        Z? G d* d+ejn                        Z@ G d, d-e,      ZA G d. d/ej                        ZC G d0 d1ejn                        ZD G d2 d3ejn                        ZE G d4 d5ejn                        ZF G d6 d7ejn                        ZG G d8 d9ejn                        ZH ed:;       G d< d=e             ZI G d> d?      ZJ G d@ dAe#      ZK G dB dCe)eK      ZL G dD dEe(eKe      ZM G dF dGeK      ZN G dH dIeKe      ZOg dJZPy)K    N)	dataclass)cached_property   )initialization)Cache)GenerationMixin)BaseModelOutputWithPoolingCausalLMOutputWithPast)PreTrainedModel)Unpack)auto_docstringcan_return_tupleloggingtorch_compilable_check)merge_with_config_defaults)capture_outputs   )ChameleonPreTrainedModel#ChameleonVQVAEEncoderConvDownsample)LlamaAttentionLlamaDecoderLayerLlamaForCausalLM
LlamaModelTransformersKwargs)SiglipAttention   )
Emu3ConfigEmu3TextConfigEmu3VQVAEConfigc                   :    e Zd ZU dZdZej                  dz  ed<   y)Emu3VQVAEModelOutputz
    image_tokens (`torch.LongTensor` of shape `(batch_size, config.vocab_size`):
        Indices of the image tokens predicted by the VQ-VAE model.
    Nimage_tokens)__name__
__module____qualname____doc__r"   torch
LongTensor__annotations__     v/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/emu3/modular_emu3.pyr!   r!   -   s    
 -1L%""T)0r+   r!   c                       e Zd Zy)Emu3AttentionNr#   r$   r%   r*   r+   r,   r.   r.   8       r+   r.   c                       e Zd Zdedef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  de	dz  d	e
dz  d
eej                  ej                  f   dz  dee   dej                  fdZ xZS )Emu3DecoderLayerconfig	layer_idxc                 n    t         |   ||       t        j                  |j                        | _        y N)super__init__nnDropoutattention_dropoutdropoutselfr3   r4   	__class__s      r,   r8   zEmu3DecoderLayer.__init__>   s(    +zz&":":;r+   Nhidden_statesattention_maskposition_idspast_key_values	use_cacheposition_embeddingskwargsreturnc           
          |}| j                  |      } | j                  d||||||d|\  }}	|| j                  |      z   }|}| j                  |      }| j	                  |      }|| j                  |      z   }|S )N)r@   rA   rB   rC   rD   rE   r*   )input_layernorm	self_attnr<   post_attention_layernormmlp)
r>   r@   rA   rB   rC   rD   rE   rF   residual_s
             r,   forwardzEmu3DecoderLayer.forwardB   s     !,,];)4>> 
')%+ 3
 
q !4<<#>> 55mD/ 4<<#>>r+   )NNNFN)r#   r$   r%   r   intr8   r'   Tensorr(   r   booltupler   r   rO   __classcell__r?   s   @r,   r2   r2   =   s    <z <c < /304(,!&HL|| t+ &&-	
  $; #5<<#=>E +, 
r+   r2   c                   H     e Zd ZdZdef fdZdej                  fdZ xZ	S )Emu3VQVAEVectorQuantizera  
    A module for vector quantization using learned embedding vectors.

    This module implements the quantization process similar to te one described in
    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
    input vectors into discrete codebook vectors, which are learned during training.
    Current implementation improves over previous ones by avoiding costly matrix multiplications
    and allowing for post-hoc remapping of indices.
    r3   c                    t         |           t        j                  |j                  |j
                        | _        | j                  j                  j                  j                  d|j                  z  d|j                  z         y )Ng            ?)
r7   r8   r9   	Embeddingcodebook_size	embed_dim	embeddingweightdatauniform_r>   r3   r?   s     r,   r8   z!Emu3VQVAEVectorQuantizer.__init__l   sb    f&:&:F<L<LM""++D63G3G,GvOcOcIcdr+   hidden_statec                    |j                   \  }}}}}|j                  ddddd      j                         }|j                  d|      }t	        j
                  |dz  dd      }t	        j
                  | j                  j                  dz  d	      }	dt	        j                  || j                  j                  j                  dd            z  }
||	z   |
z
  }
t	        j                  |
d	      }|j                  ||||      }|S )
Nr   r   r      r   T)dimkeepdimrf   )shapepermute
contiguousviewr'   sumr]   r^   matmul	transposeargmin)r>   rb   
batch_sizetemporalchannelsheightwidthhidden_state_flattenedhidden_state_sumembedding_sum	distancesmin_encoding_indicess               r,   rO   z Emu3VQVAEVectorQuantizer.forwardq   s    8D8J8J5
Hh#++Aq!Q:EEG!-!2!22x!@ !99%;Q%>AtT		$.."7"7":B %;T^^=R=R=\=\]^`a=bcc	$}4y@	$||I1=388XvW\]##r+   )
r#   r$   r%   r&   r   r8   r'   rQ   rO   rT   rU   s   @r,   rW   rW   a   s&    e e
$ELL $r+   rW   c                       e Zd Zy)Emu3VQVAEEncoderConvDownsampleNr/   r*   r+   r,   r|   r|      r0   r+   r|   c                   $     e Zd Z fdZd Z xZS )Emu3VQVAEEncoderConvUpsamplec                 `    t         |           t        j                  ||ddd      | _        y )Nr   r   kernel_sizestridepadding)r7   r8   r9   Conv2dconv)r>   in_channelsr?   s     r,   r8   z%Emu3VQVAEEncoderConvUpsample.__init__   s'    IIk;AaYZ[	r+   c                 X    t        j                  |dd      }| j                  |      }|S )N       @nearestscale_factormode)Finterpolater   r>   r@   s     r,   rO   z$Emu3VQVAEEncoderConvUpsample.forward   s(    m#IV		-0r+   r#   r$   r%   r8   rO   rT   rU   s   @r,   r~   r~      s    \r+   r~   c            	       \     e Zd Zdededee   dee   f fdZdej                  fdZ xZ	S )Emu3VQVAEConv3d
in_channelout_channelr   r   c                 P   t         	|           t        |dd  |dd        D cg c]
  \  }}||z
   }}}d| _        |d d d   D ]%  }| xj                  |dz  |dz  z   |dz  fz  c_        ' | xj                  dz  c_        t	        j
                  ||||      | _        y c c}}w )Nr   r*   re   r   )r   r   )r   )r7   r8   zipr   r9   Conv3dr   )
r>   r   r   r   r   
one_kernel
one_stridepadding_sizespad_sizer?   s
            r,   r8   zEmu3VQVAEConv3d.__init__   s     	ORS^_`_aSbdjklkmdnOop5KZj0pp%dd+ 	JHLLX]X\98q=IIL	JII	
	 qs   B"r@   c                 h    t        j                  || j                        }| j                  |      }|S r6   )r   padr   r   r   s     r,   rO   zEmu3VQVAEConv3d.forward   s*    mT\\:		-0r+   )
r#   r$   r%   rP   rS   r8   r'   rQ   rO   rT   rU   s   @r,   r   r      sF    

 
 3Z	

 c

,U\\ r+   r   c                   `     e Zd Zdedef fdZdej                  dej                  fdZ xZS )Emu3VQVAESpatialNormr   out_channelsc                     t         |           t        j                  |ddd      | _        t        j
                  ||ddd      | _        t        j
                  ||ddd      | _        y )N    ư>Tnum_channels
num_groupsepsaffiner   r   r   )r7   r8   r9   	GroupNorm
norm_layerr   conv_yconv_br>   r   r   r?   s      r,   r8   zEmu3VQVAESpatialNorm.__init__   sn    
 	,,%	
 ii
 ii
r+   r@   quant_statesc                     t        j                  ||j                  dd  d      }| j                  |      }|| j	                  |      z  | j                  |      z   }|S )Nr   )sizer   )r   r   ri   r   r   r   )r>   r@   r   s      r,   rO   zEmu3VQVAESpatialNorm.forward   sX    }}\8K8KBC8PW`a6%L(AADKKP\D]]r+   	r#   r$   r%   rP   r8   r'   rQ   rO   rT   rU   s   @r,   r   r      s5    

 
8U\\  r+   r   c                   H     e Zd Zdedef fdZdej                  fdZ xZS )Emu3VQVAETemporalUpsampler   r   c                 J    t         |           t        ||dd      | _        y )Nr   r   r   r   r   r   r   r   r7   r8   r   r   r>   r   r   r?   s      r,   r8   z"Emu3VQVAETemporalUpsample.__init__   (    
 	#!	
	r+   r@   c                 P   |j                   \  }}}}}|j                  ddddd      j                         j                  |d|      }t	        j
                  |dd	      }|j                  ||||d      j                  ddddd      j                         }| j                  |      }|S )
Nr   r   r   rd   r   re   r   r   r   )ri   rj   rk   rl   r   r   r   )r>   r@   rq   rs   rr   rt   ru   s          r,   rO   z!Emu3VQVAETemporalUpsample.forward   s    8E8K8K5
Hh%--aAq!<GGINNz[]_ghm#IV%**:xPRS[[\]_`bcefhijuuw		-0r+   r   rU   s   @r,   r   r      s*    

 
U\\ r+   r   c                   H     e Zd Zdedef fdZdej                  fdZ xZS )Emu3VQVAETemporalDownsampler   r   c                 J    t         |           t        ||dd      | _        y )N)rd   r   r   )r   r   r   r   r   r   s      r,   r8   z$Emu3VQVAETemporalDownsample.__init__   r   r+   r@   c                 (    | j                  |      }|S r6   )r   r   s     r,   rO   z#Emu3VQVAETemporalDownsample.forward   s    		-0r+   r   rU   s   @r,   r   r      s*    

 
U\\ r+   r   c                   (     e Zd Z	 d fd	Zd Z xZS )Emu3VQVAETemporalResnetBlockc                 p   t         |           || _        ||n|| _        t	        j
                  |      | _        t        ||dd      | _        t	        j
                  |      | _	        t        ||dd      | _
        | j                  | j                  k7  r t	        j                  ||ddd      | _        y y )Nr   r   r   r   r   r   )r7   r8   r   r   r9   BatchNorm3dnorm1r   conv1norm2conv2r   nin_shortcutr   s      r,   r8   z%Emu3VQVAETemporalResnetBlock.__init__   s    
 	&+7+?K\^^K0
$!	

 ^^L1
$!	

 t000 "		!D 1r+   c                 L   |}| j                  |      }|t        j                  |      z  }| j                  |      }| j	                  |      }|t        j                  |      z  }| j                  |      }| j                  | j                  k7  r| j                  |      }||z   S r6   )	r   r'   sigmoidr   r   r   r   r   r   )r>   r@   rM   s      r,   rO   z$Emu3VQVAETemporalResnetBlock.forward  s     

=1}55

=1

=1}55

=1t000((2H-''r+   r6   r   rU   s   @r,   r   r      s     @(r+   r   c                   ~     e Zd Z	 	 ddededz  dedz  f fdZd	dej                  dej                  dz  fdZ xZS )
Emu3VQVAEResnetBlockNr   r   quant_channelsc                    t         |           || _        ||n|}|| _        || _        |=t        j                  |ddd      | _        t        j                  |ddd      | _        n"t        ||      | _        t        ||      | _        t        j                  ||ddd      | _        t        j                  ||ddd      | _        | j                  | j                  k7  r t        j                  ||ddd      | _        y y )	Nr   r   Tr   r   r   r   r   )r7   r8   r   r   r   r9   r   r   r   r   r   r   r   r   )r>   r   r   r   r?   s       r,   r8   zEmu3VQVAEResnetBlock.__init__/  s    	&&2&:{(,!;2SW`deDJ<BTXaefDJ-nkJDJ-nlKDJYY

 YY

 t000 "		!D 1r+   r@   c                 v   | j                   dn|f}|} | j                  |g| }|t        j                  |      z  }| j	                  |      } | j
                  |g| }|t        j                  |      z  }| j                  |      }| j                  | j                  k7  r| j                  |      }||z   S Nr*   )
r   r   r'   r   r   r   r   r   r   r   )r>   r@   r   	norm_argsrM   s        r,   rO   zEmu3VQVAEResnetBlock.forward[  s    --5BN;L	 "

==9=}55

=1"

==9=}55

=1t000((2H-''r+   )NNr6   r   rU   s   @r,   r   r   .  sV     $(%)	** Dj* d
	*X(U\\ (5<<RVCV (r+   r   c                   $     e Zd Zdef fdZ xZS )Emu3VQVAEAttentionBlockr3   c                 2    t         |   |       d| _        y )Nr   )r7   r8   num_key_value_groupsra   s     r,   r8   z Emu3VQVAEAttentionBlock.__init__n  s      %&!r+   )r#   r$   r%   r   r8   rT   rU   s   @r,   r   r   m  s    & & &r+   r   c                   *     e Zd ZdZ fdZddZ xZS )Emu3VQVAEGroupNormz
    Same as the torch GroupNorm with the only difference that this ones accepts
    an optional kwarg `quant_states` which is not used. This class makes it easier to
    use SpatialNorm or GroupNorm without conditionals
    c                 $    t        |   di | y r   )r7   r8   )r>   rF   r?   s     r,   r8   zEmu3VQVAEGroupNorm.__init__|  s    "6"r+   c                     t        j                  || j                  | j                  | j                  | j
                        S r6   )r   
group_normr   r^   biasr   )r>   inputr   s      r,   rO   zEmu3VQVAEGroupNorm.forward  s)    ||E4??DKKDHHUUr+   r6   )r#   r$   r%   r&   r8   rO   rT   rU   s   @r,   r   r   u  s    #Vr+   r   c                   `     e Zd Zd fd	Zddej
                  dej
                  dz  fdZ xZS )Emu3VQVAEMiddleBlockNc                     t         |           t        |||      | _        t	        |      | _        |t        |ddd      | _        nt        ||      | _        t        |||      | _	        y )Nr   r   r   r   r   Tr   )
r7   r8   r   block_1r   attn_1r   	attn_normr   block_2)r>   r3   r   r   r?   s       r,   r8   zEmu3VQVAEMiddleBlock.__init__  so    +#$)

 .f5!/[UW]ajnoDN1.+NDN+#$)
r+   r@   r   c                 b   | j                  ||      }|}| j                  ||      }|j                  \  }}}}|j                  ||||z        j	                  dd      }| j                  |      d   }|j                  ||||      j                  dddd      }||z   }| j                  ||      }|S )Nr   r   r   r   )	r   r   ri   rl   ro   r   reshaperj   r   )r>   r@   r   rM   rq   rs   rt   ru   s           r,   rO   zEmu3VQVAEMiddleBlock.forward  s    ]LA }lC.;.A.A+
Hfe%**:x%PZZ[\^_`M215%--j&%RZZ[\^_abdef =0]LAr+   r6   r#   r$   r%   r8   r'   FloatTensorrO   rT   rU   s   @r,   r   r     s-    
(
U%6%6 
eFWFWZ^F^ 
r+   r   c                   >     e Zd Z fdZdej
                  fdZ xZS )Emu3VQVAEDownBlockc           
         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  }dt        |      z   }|| _        t        j                         | _        t        | j                        D ]K  }t        j                         }t        j                         }t        j                         }|||   z  }	|||   z  }
t        | j
                        D ]~  }|j                  t        |	|
             |
}	|j                  .||j                  v s=|j                  t!        |             |j                  t        j"                  |	ddd              t        j$                         }||_        ||_        ||_        || j                  dz
  k7  rt-        |	      |_        | j                  j                  |       N y )N)r   r   r   r   r   Tr   r   )r7   r8   lenchannel_multipliernum_resolutionsnum_res_blocksbase_channelsrS   in_channel_multiplierr9   
ModuleListdownrangeappendr   attn_resolutionsr   r   Moduleblockattn
attn_normsr|   
downsample)r>   r3   r   r   r   i_levelr   r   r   block_in	block_outi_blockr   r?   s                r,   r8   zEmu3VQVAEDownBlock.__init__  s   "6#<#<=$33,,#66 $u-?'@ @%:"MMO	T112 	#GMMOE==?DJ$'<W'EEH%(:7(CCI !4!45 
q($,%. %**67fF]F];]KK 7 ?@%%bllUW]ajn&op
q 99;DDJDI(DO$..22"@"JIIT"1	#r+   r@   c                 >   t        | j                        D ]  \  }}t        | j                        D ]  } |j                  |   |      }t        |j                        dkD  s1|} |j                  |   |      }|j                  \  }}}}	|j                  ||||	z        j                  dd      } |j                  |   |      d   }|j                  |||	|      j                  dddd      }||z   } || j                  dz
  k7  s|j                  |      } |S )Nr   r   r   r   )	enumerater   r   r   r   r   r   r   ri   rl   ro   r   rj   r   r   )
r>   r@   r   blocksr  rM   rq   rs   rt   ru   s
             r,   rO   zEmu3VQVAEDownBlock.forward  s5   (3 	AOGV !4!45 = 5W 5m Dv{{#a',H$>F$5$5g$>}$MM:G:M:M7J&%$1$6$6z8VV[^$\$f$fghjk$lM$8FKK$8$G$JM$1$9$9*feU]$^$f$fghjkmnpq$rM$,}$<M= $..22 & 1 1- @	A" r+   r   rU   s   @r,   r   r     s    ##JU%6%6 r+   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Emu3VQVAEUpBlockc           	         t         |           t        |j                        | _        |j
                  | _        |j                  }|j                  |j                  d   z  }t        j                         | _
        t        t        | j                              D ]5  }t        j                         }t        j                         }t        j                         }|j                  |j                  |   z  }t        | j
                  dz         D ]e  }	|j                  t        |||             |}||j                  v s1|j                  t!        |             |j                  t#        ||             g t        j$                         }
||
_        ||
_        ||
_        |dk7  rt-        |      |
_        | j                  j1                  d|
       8 y )Nre   r   r   r   )r7   r8   r   r   r   r   r\   r   r9   r   upreversedr   r   r   r   r   r   r   r   r   r   r~   upsampleinsert)r>   r3   r   r   r   r   r   r   r   r  r  r?   s              r,   r8   zEmu3VQVAEUpBlock.__init__  s   "6#<#<=$33))''&*C*CB*GG--/d&:&: ;< 	"GMMOE==?DJ,,v/H/H/QQI !4!4q!89 V($,%.'5 %f555KK 7 ?@%%&:>8&TUV BBHBG&BM!|:8DGGNN1b!3	"r+   r@   r   c                 h   t        | j                  d d d         D ]  \  }}t        | j                  dz         D ]  } |j                  |   ||      }t        |j                        dkD  s2|} |j                  |   ||      }|j                  \  }}}	}
|j                  |||	|
z        j                  dd      } |j                  |   |      d   }|j                  ||	|
|      j                  dddd      }||z   } |t        | j                        dz
  k7  s|j                  |      } |S )Nre   r   r   r   r   )r  r  r   r   r   r   r   r   ri   rl   ro   r   rj   r
  )r>   r@   r   r   r  r  rM   rq   rs   rt   ru   s              r,   rO   zEmu3VQVAEUpBlock.forward  sD   (27 	?OGV !4!4q!89 = 5W 5m\ Rv{{#a',H$>F$5$5g$>}l$[M:G:M:M7J&%$1$6$6z8VV[^$\$f$fghjk$lM$8FKK$8$G$JM$1$9$9*feU]$^$f$fghjkmnpq$rM$,}$<M= #dgg,** & >	?  r+   r   rU   s   @r,   r  r    s(    #"JU%6%6 eFWFW r+   r  c                   >     e Zd Z fdZdej
                  fdZ xZS )Emu3VQVAEEncoderc                    t         |           |j                  }|j                  }|j                  }|j
                  }|j                  }|rd|z  n|}||d   z  }t        j                  j                  ||ddd      | _
        t        |      | _        t        ||      | _        t        j                  j                  d|dd	      | _        t        j                  j                  ||ddd      | _        t%        t'        j(                  |j*                              }	t        j,                         | _        t        j,                         | _        t3        |	      D ])  }
t5        ||      }| j.                  j7                  |       + t3        |j8                        D ]*  }t;        ||
      }| j0                  j7                  |       , y )Nr   re   r   r   r   r   r   T)r   r   r   r   r   )r7   r8   r   r   double_latentlatent_channelsr   r'   r9   r   conv_inr   
down_blockr   middle_blockr   norm_outconv_outrP   mathlog2temporal_downsample_factorr   	time_convtime_res_stackr   r   r   r   r   )r>   r3   r   r   r  r  r   r   r   temporal_down_blocksir   rN   time_res_convr?   s                 r,   r8   zEmu3VQVAEEncoder.__init__  s   ,,((,, 00#66.;q?* #5b#99xx{MqYZdef,V40B**bxUYbf*g ( 
  #499V-N-N#OP mmo+, 	(A.|\JDNN!!$'	( v,,- 	6A8()M &&}5	6r+   pixel_valuesc                 h   |j                   d   } |j                  dg|j                   dd   }| j                  |      }| j                  |      }| j	                  |      }| j                  |      }|t        j                  |      z  }| j                  |      } |j                  d|g|j                   dd   }|j                  ddddd      }| j                  D ]"  } ||      }|t        j                  |      z  }$ | j                  D ]
  } ||      } |j                  ddddd      }|S )Nr   re   r   r   r   rd   )ri   r   r  r  r  r  r'   r   r  rj   r  r  )r>   r  temporal_dimr@   r   layers         r,   rO   zEmu3VQVAEEncoder.forwardB  sH   #))!,+|++BH1C1CAB1GH \26))-8 m4}55m4---b,YATATUVUWAXY%--aAq!< NN 	:D /MU]]=99M	: (( 	1E!-0M	1 &--aAq!<r+   )r#   r$   r%   r8   r'   r(   rO   rT   rU   s   @r,   r  r    s    %6NE$4$4 r+   r  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )Emu3VQVAEDecoderr3   c                    t         	|           |j                  }|j                  |j                  d   z  }t        j                         | _        t        |j                        D ]>  }t        |j                  |j                        }| j                  j                  |       @ t        t        j                  |j                               }t        j                         | _        t        |      D ]=  }t%        |j                  |j                        }| j"                  j                  |       ? t        j&                  |j                  |ddd      | _        t+        |||      | _        t/        |      | _        |j                  |j                  d   z  }t3        ||      | _        t        j&                  ||j6                  ddd      | _        y )Nre   r   r   r   r   )r   r   )r7   r8   r\   r   r   r9   r   r  r   r   r   r  r   rP   r  r  r  r  r   r   r  r   r  r  up_blockr   r  r   r  )
r>   r3   r   r   rN   r  temp_upsample_block_numr  r   r?   s
            r,   r8   zEmu3VQVAEDecoder.__init__a  s   ))''&*C*CB*GG mmov,,- 	6A8"22AWAWM &&}5		6 #&dii0Q0Q&R"S./ 	(A,V-C-CVE[E[\DNN!!$'	( yy""
 1R`a(0''&*C*CA*FF,^XF		
r+   r@   r   c                    t        j                  ||fd      }|j                  ddddd      }| j                  D ]
  } ||      } | j                  D ]"  } ||      }|t        j
                  |      z  }$ |j                  ddddd      }t        j                  |dd      \  }} |j                  dg|j                  dd   } |j                  dg|j                  dd   }| j                  |      }| j                  ||      }| j                  ||      }| j                  ||      }|t        j
                  |      z  }| j                  |      }|S )Nr   rh   r   r   r   rd   re   )r'   catrj   r  r  r   chunkr   ri   r  r  r&  r  r  )r>   r@   r   hidden_quant_statesr"  s        r,   rO   zEmu3VQVAEDecoder.forward  sp   #ii(E1M199!Q1aH (( 	=E"'(;"<	= ^^ 	FE"'(;"<5==1D#EE	F 299!Q1aH&+kk2Eqa&P#|---bK=3F3Fqr3JK+|++BH1C1CAB1GH]3 ))-Fm\Bm\B}55m4r+   )	r#   r$   r%   r   r8   r'   rQ   rO   rT   rU   s   @r,   r$  r$  `  s+    %
 %
NU\\  r+   r$  aR  
    The VQ-VAE model used in Emu3 for encoding/decoding images into discrete tokens.
    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
    Taigman](https://huggingface.co/papers/2203.13131).
    custom_introc            
       
    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZg dZeegedZ ej$                         d        Zdef fd	Zeedej.                  d
ej.                  dee   defd              Zdej.                  fdZ xZS )	Emu3VQVAEr3   
emuvideovqr  )imageT)r   r   r   rW   r@   
attentionsc                    t        |t        j                  t        j                  f      rt	        j
                  |j                  dd       |j                  qt        j                  j                  j                  |j                        \  }}dt        j                  |      z  }t	        j                  |j                  | |       y y t        |t        j                        rt	        j                  |j                  t        j                  d             |j                  xt        j                  j                  j                  |j                        \  }}|dkD  rdt        j                  |      z  nd}t	        j                  |j                  | |       y y t        |t        j                  t        j                   t        j"                  f      rt	        j$                  |j                  d       t	        j$                  |j                  d	       t'        |d
d       ^t	        j(                  |j*                         t	        j,                  |j.                         t	        j(                  |j0                         y y t        |t        j2                        rqt	        j4                  |j                         |j6                  Et'        |j                  dd      s-t	        j(                  |j                  |j6                            y y y y )Nfan_outrelu)r   nonlinearityr      )ar   rY   g        running_mean_is_hf_initializedF)
isinstancer9   r   r   initkaiming_normal_r^   r   r'   _calculate_fan_in_and_fan_outr  sqrtr`   Linearkaiming_uniform_BatchNorm2dr   r   	constant_getattrzeros_r:  ones_running_varnum_batches_trackedrZ   normal_padding_idx)r>   modulefan_inrN   bounds        r,   _init_weightszEmu3VQVAE._init_weights  s   fryy"))45  YVT{{&!HHMMGGV	DIIf--fkkE659 ' 		*!!&--499Q<@{{&!HHMMGGV	17!DIIf--fkkE659 '  NONN6==#.NN6;;,v~t4@F//0

6--.F667 A -LL'!!-gfmmMach6iFMM&*<*<=> 7j- .r+   c                    t         |   |       || _        t        |      | _        t        |      | _        t        |      | _        dt        |j                        dz
  z  | _        t        |j                  |j                  dd      | _        t        |j                  |j                  dd      | _        dt        |j                        dz
  z  | _        | j%                          | j'                          y )Nr   r   )r   r   r   r   r   )r7   r8   r3   r  encoderr$  decoderrW   quantizer   r   vision_spatial_factorr   r  r\   
quant_convpost_quant_convspatial_scale_factoreval	post_initra   s     r,   r8   zEmu3VQVAE.__init__  s     '/'/08%&3v/H/H+IA+M%N")""F$4$4)T]
  /f44)T] 
 %&#f.G.G*H1*L$M!		r+   image_sizesrF   rG   c                    |j                   dk(  }|rL| j                  j                  }|j                  \  }}}}	|j	                  d      j                  d|ddd      }n|j                  \  }}}}}	| j                  |      }
|
j                  ddddd      }| j                  |      }|j                  ddddd      }| j                  |      }|r|j                  d      n|}t        ||      D cg c]B  \  }}|d t        |d   | j                  z        d t        |d   | j                  z        f   D }}}t        |
|      S c c}}w )Nrd   r   r   r   r   )last_hidden_stater"   )ndimr3   r  ri   	unsqueezerepeatrQ  rj   rU  rS  squeezer   rP   rT  r!   )r>   r  rZ  rF   is_imagerr   rq   rs   rt   ru   r@   conv_hidden_statescodesr"   single_imager   s                   r,   encodezEmu3VQVAE.encode  sl   
  $$){{==H2>2D2D/J&%'11!4;;AxAqQL<H<N<N9J(FE\2 +221aAqA!__-?@ 0771aAF01+3u}}Q' '*,&D
"d D3tAw)C)CCDDFqDQRGVZVpVpLpHqFqqr
 

 $+%
 	

s   1AEr@   c                    |j                   dk(  }|r|j                  d      }|j                  \  }}}}| j                  j	                  |j                               }|j                  d   }|j                  |||||      j                  ddddd      j                         }| j                  |      }	|j                  ddddd      }|	j                  ddddd      }	| j                  |	|      }
|
j                  ||| j                  j                  z  | j                  j                  || j                  z  || j                  z        }
|r	|
d d df   S |
S )Nr   r   re   r   rd   r   )r]  r^  ri   rS  r]   flattenrl   rj   rk   rV  rR  r   r3   r  r   rW  )r>   r@   ra  rq   rr   rt   ru   quantrs   
post_quantvideos              r,   decodezEmu3VQVAE.decode  sK    %%*)33A6M.;.A.A+
Hfe''(=(=(?@;;r?

:xIQQRSUVXY[\^_`kkm))%0
aAq!,''1aA6
Z/t{{===KK$$T...D---
 'uQT{1E1r+   )r#   r$   r%   r   r)   base_model_prefixmain_input_nameinput_modalities_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_no_split_modulesr   r   r   _can_record_outputsr'   no_gradrO  r8   r   r   rQ   r   r   r!   re  rk  rT   rU   s   @r,   r/  r/    s     $$O!N"& /0LM-
 U]]_? ?4 *  
!LL
7<||
OUVhOi
	
   
B2ELL 2r+   r/  c                       e Zd ZdZd Zed        Zed        Zed        Zed        Z	ed        Z
ed        Zd	eej                     d
ej                  fdZd	ej                  d
ej                  fdZy)Emu3ImageVocabularyMappingzM
    A class for mapping discrete image tokens from VQGAN to BPE tokens.
    c                 j    || _         |j                  d      | _        |j                  d      | _        y )Nz<|extra_200|>z<image>)	vocab_mapgeteol_token_idimage_token_id)r>   ry  s     r,   r8   z#Emu3ImageVocabularyMapping.__init__4  s+    "%MM/:'mmI6r+   c           	          t        | j                  j                         D cg c]  \  }}|j                  d      s| c}}      S c c}}w Nz<|visual tokensortedry  items
startswithr>   namevals      r,   r"   z'Emu3ImageVocabularyMapping.image_tokens9  s8    DNN,@,@,BhytSdooVfFgshiih
   A	
A	
c           	          t        | j                  j                         D cg c]  \  }}|j                  d      s| c}}      S c c}}w r~  r  r  s      r,   image_tokens_strz+Emu3ImageVocabularyMapping.image_tokens_str=  s8    T^^-A-A-Ci	ctWgGhtijjir  c                 t    | j                   D ci c]  }t        |dd       | j                  |     c}S c c}w )Nir   )r  rP   ry  )r>   tokens     r,   img2bpez"Emu3ImageVocabularyMapping.img2bpeA  s5    FJF[F[\UE"RL!4>>%#88\\\s   #5c                 j    | j                   j                         D ci c]  \  }}||
 c}}S c c}}w r6   )r  r  )r>   kvs      r,   bpe2imgz"Emu3ImageVocabularyMapping.bpe2imgE  s+    !%!3!3!56A1666s   /c                     t        j                  t        | j                  j	                               dz   t         j
                        }| j                  j                         D ]
  \  }}|||<    |S Nr   dtype)r'   zerosmaxr  keysrP   r  r>   mappingr  r  s       r,   bpe2img_mapping_tensorz1Emu3ImageVocabularyMapping.bpe2img_mapping_tensorI  [    ++c$,,"3"3"56:%))LLL&&( 	DAqGAJ	r+   c                     t        j                  t        | j                  j	                               dz   t         j
                        }| j                  j                         D ]
  \  }}|||<    |S r  )r'   r  r  r  r  rP   r  r  s       r,   img2bpe_mapping_tensorz1Emu3ImageVocabularyMapping.img2bpe_mapping_tensorP  r  r+   	img_batchrG   c                 ,   |j                   }t        j                  |j                  d   dft        j                        | j
                  z  }| j                  |j                  d         }t        j                  ||gd      }|j                  |      S )Nr   r   r  cpure   rh   )	devicer'   onesri   rP   r{  r  tor)  )r>   r  r  eol_row
img_tokenss        r,   convert_img2bpez*Emu3ImageVocabularyMapping.convert_img2bpeW  sw    !!**iooa0!4EIIFIZIZZ00e1DE
YY
G4"=
}}V$$r+   c                     |j                   }|dd df   }| j                  |j                  d         }|j                  |      S )N.re   r  )r  r  r  )r>   r  r  r  s       r,   convert_bpe2imgz*Emu3ImageVocabularyMapping.convert_bpe2img^  sG    !!c3B3h'	00e1DE
}}V$$r+   N)r#   r$   r%   r&   r8   r   r"   r  r  r  r  r  listr'   rQ   r  r  r*   r+   r,   rw  rw  /  s    7
 j j k k ] ] 7 7    %ell); % %% %%,, %r+   rw  c                   $    e Zd ZdgZdZdZeedZy)Emu3PreTrainedModelr2   Tr2  N)	r#   r$   r%   rs  rq  rr  r2   r.   rt  r*   r+   r,   r  r  e  s)     "&)#r+   r  c                   0     e Zd ZU eed<   def fdZ xZS )Emu3TextModelr3   c           	          t         |   |       t        j                  t	        |j
                        D cg c]  }t        ||       c}      | _        y c c}w r6   )r7   r8   r9   r   r   num_hidden_layersr2   layersr=   s      r,   r8   zEmu3TextModel.__init__t  sD     mmBGH`H`BabYfi0b
bs   A)r#   r$   r%   r   r)   r8   rT   rU   s   @r,   r  r  q  s    
~ 
 
r+   r  c                   4     e Zd ZU eed<    fdZ fdZ xZS )Emu3ForCausalLMr3   c                 D    t         |   |       t        |      | _        y r6   )r7   r8   r  modelra   s     r,   r8   zEmu3ForCausalLM.__init__~  s     "6*
r+   c                  6    t               j                          y)a  
        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> model = Emu3ForCausalLM.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> inputs = processor(text=["Can you write me a poem about winter."], return_tensors="pt").to(model.device)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```N)r7   rO   )super_kwargsr?   s    r,   rO   zEmu3ForCausalLM.forward  s    & 	r+   )r#   r$   r%   r   r)   r8   rO   rT   rU   s   @r,   r  r  {  s    + r+   r  c                       e Zd Z fdZd Zd Zdej                  dej                  dej                  fdZ	e
 ed	      dej                  dej                  d
ee   deez  fd              Z ej"                         dej                  dedefd       Zdej                  dej                  dej                  fdZe
e	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej*                  dz  dej*                  dz  dej                  dz  dedz  dej                  dz  dedz  d
ee   deez  fd              Z xZS )	Emu3Modelc                     t         |   |       t        j                  |j                        | _        t        |j                        | _        t        |j                        | _        | j                          y r6   )r7   r8   r  _from_configtext_config
text_modelr/  	vq_configvqmodelrw  vocabulary_mapvocabulary_mappingrY  ra   s     r,   r8   zEmu3Model.__init__  sY     '44V5G5GH !1!12"<V=R=R"S 	r+   c                 6    | j                   j                         S r6   )r  get_input_embeddingsr>   s    r,   r  zEmu3Model.get_input_embeddings  s    3355r+   c                 :    | j                   j                  |       y r6   )r  set_input_embeddingsr>   values     r,   r  zEmu3Model.set_input_embeddings  s    ,,U3r+   r  rZ  rG   c                     | j                   j                  ||d      }|j                  D cg c]+  }| j                  j	                  |      j                         - }}t        j                  |      }|S c c}w )a  
        Tokenizes images into discrete tokens with VQGAN module. Converts
        obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
        special tokens.

        Args:
            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
                The tensors corresponding to the input images.
            image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
                The sizes of the images in the batch, being (height, width) for each image.
        T)return_dict)r  re  r"   r  r  rg  r'   r)  )r>   r  rZ  vqmodel_outputstokensbpe_tokens_list
bpe_tokenss          r,   get_image_tokenszEmu3Model.get_image_tokens  sv     150C0CLR]ko0C0pTcTpTp
JPD##33F;CCE
 
 YY/
	
s   0A6zbTokenizes images into discrete tokens with VQGAN module and embeds them with text embeddings layerr,  rF   c                     | j                   j                  ||fddi|}|D cg c];  \  }}|| j                   j                  z  || j                   j                  z  dz   z  = }}}|j                  D cg c]+  }| j                  j                  |      j                         - }	}t        j                  |	      }
 | j                         |
      }t        j                  ||      }||_        |S c c}}w c c}w )z
        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
            The tensors corresponding to the input images.
        r  Tr   )r  re  rT  r"   r  r  rg  r'   r)  r  splitpooler_output)r>   r  rZ  rF   r  rt   ru   split_sizesr  r  r  image_embeddingsimage_featuress                r,   get_image_featureszEmu3Model.get_image_features  s	    1D0C0C+1
371
;A1

 "-
 t||999et||GiGi>ilm>mn
 

 UdTpTp
JPD##33F;CCE
 
 YY/
64446zB%5{C(6%

s   A C370C9r"   rt   ru   c                     |ddddf   j                  d||dz         }| j                  j                  |      }| j                  j	                  |      }|S )a  
        Decodes generated image tokens from language model to continuous pixel values
        with VQGAN module via upsampling.

        Args:
            image_tokens (`torch.LongTensor` of shape `(batch_size, num_of_tokens)`):
                The tensors corresponding to the input images.
            height (`int`):
                Height of the generated image before upsampling.
            width (`int`):
                Width of the generated image before upsampling.
        Nre   r   )rl   r  r  r  rk  )r>   r"   rt   ru   	sequencesr1  s         r,   decode_image_tokenszEmu3Model.decode_image_tokens  sX     !CRC(--b&%!)D	..>>yI##L1r+   	input_idsinputs_embedsr  c                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )r  r  re   r   r   z6Image features and image tokens do not match, tokens: z, features: )r  r'   tensorr  r|  longr  allrm   ri   r^  	expand_asr  r   numel)r>   r  r  r  special_image_maskn_image_tokensn_image_featuress          r,   get_placeholder_maskzEmu3Model.get_placeholder_mask  s    !.2M$2K2K2MT44CC5::^k^r^rs3 " "4!7!7!;!*d.E.E.T.T!T+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r+   NrA   rB   rC   rD   c	           	      D   |du |duz  rt        d      | | j                         |      }|Y| j                  ||      j                  }
t	        j
                  |
d      }
| j                  |||
      }|j                  ||
      } | j                  d|||||d|	}|S )ap  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        NzaYou cannot specify both input_ids and inputs_embeds at the same time, and must specify either oner   rh   )r  r  )rA   rB   rC   r  rD   r*   )	
ValueErrorr  r  r  r'   r)  r  masked_scatterr  )r>   r  r  rZ  rA   rB   rC   r  rD   rF   r  r  outputss                r,   rO   zEmu3Model.forward  s    ( -t";<s   7D557	BM#!44\;O]]N"YY~1=N!%!:!:~ "; " *889K^\M "$// 
)%+'
 
 r+   )NNNNNNNN)r#   r$   r%   r8   r  r  r'   r   r(   r  r   r   r   r   rS   r!   r  ru  rP   r  r  rQ   r   rR   r
   rO   rT   rU   s   @r,   r  r    s   64U->-> UM]M] bgbrbr & y!--<A<L<LX^_qXr	%	% 0 U]]_0@0@ # VY  $"))":?:K:K"]b]n]n"0  .215+/.204(,26!%,##d*, ''$., \\D(	,
 t+, &&-, , ((4/, $;, +,, 
'	',  ,r+   r  c                       e Zd ZdZddiZ fdZd Zd Zdej                  fdZ
d	 Zee	 	 	 	 	 	 	 	 	 	 ddej                  d
z  dej                   d
z  dej"                  d
z  dej"                  d
z  dej                  d
z  ded
z  dej                   d
z  ded
z  dej                  d
z  deej"                  z  dee   deez  fd              Z	 	 	 	 	 	 	 d fd	Z xZS )Emu3ForConditionalGeneration)r1  textzlm_head.weightz$model.text_model.embed_tokens.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NF)r   )r7   r8   r  r  r9   rA  r  hidden_size
vocab_sizelm_headrY  ra   s     r,   r8   z%Emu3ForConditionalGeneration.__init__7  sS     v&
yy!3!3!?!?ASASA^A^ejkr+   c                 6    | j                   j                         S r6   )r  r  r  s    r,   r  z1Emu3ForConditionalGeneration.get_input_embeddings>  s    zz..00r+   c                 :    | j                   j                  |       y r6   )r  r  r  s     r,   r  z1Emu3ForConditionalGeneration.set_input_embeddingsA  s    

''.r+   rG   c                     | j                   S r6   )r  r  s    r,   get_output_embeddingsz2Emu3ForConditionalGeneration.get_output_embeddingsD  s    ||r+   c                 :     | j                   j                  di |S r   )r  r  )r>   rF   s     r,   r  z0Emu3ForConditionalGeneration.decode_image_tokensG  s    -tzz--777r+   Nr  r  rZ  rA   rB   rC   r  rD   labelslogits_to_keeprF   c           
      ~    | j                   d||||||d|}|d   }t        |
t              rt        |
 d      n|
}| j	                  |dd|ddf         }d}|	4 | j
                  d||	| j                  j                  j                  d|}t        |||j                  |j                  |j                        S )a  
        image_sizes (`torch.LongTensor` of shape `(batch_size, 2)`):
            The sizes of the images in the batch, being (height, width) for each image. Image sizes can be obtained using
            [`AutoImageProcessor`]. See [`Emu3ImageProcessor.__call__`] for details ([]`Emu3Processor`] uses
            [`Emu3ImageProcessor`] for processing images).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> from transformers import Emu3Processor, Emu3ForConditionalGeneration
        >>> import torch
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> model = Emu3ForConditionalGeneration.from_pretrained("BAAI/Emu3-Chat-hf", dtype=torch.bfloat16)
        >>> processor = Emu3Processor.from_pretrained("BAAI/Emu3-Chat-hf")

        >>> conversation = [
        ...     {
        ...     "role": "system",
        ...     "content": [
        ...         {"type": "text", "text": "You are a helpful assistant."},
        ...         ],
        ...     },
        ...     {
        ...     "role": "user",
        ...     "content": [
        ...         {"type": "image"},
        ...         {"type": "text", "text": "Please describe the image."},
        ...         ],
        ...     },
        ... ]

        >>> prompt = processor.apply_chat_template(conversation, add_generation_prompt=True)
        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=[image], text=[prompt], return_tensors="pt").to(model.device, torch.bfloat16)

        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        ```)r  rA   rB   rC   r  rD   r   N)logitsr  r  )lossr  rC   r@   r3  r*   )r  r<  rP   slicer  loss_functionr3   r  r  r
   rC   r@   r3  )r>   r  r  rZ  rA   rB   rC   r  rD   r  r  rF   r  r@   slice_indicesr  r  s                    r,   rO   z$Emu3ForConditionalGeneration.forwardJ  s    @ $** 
)%+'
 
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD &#33!//))
 	
r+   c	                 J    t        |   |f|||||||d|	}
|s|rd |
d<   |
S )N)rC   rA   r  rB   r  rD   is_first_iterationr  )r7   prepare_inputs_for_generation)r>   r  rC   rA   r  rB   rD   r  r  rF   model_inputsr?   s              r,   r  z:Emu3ForConditionalGeneration.prepare_inputs_for_generation  sR     w<

+)'%%1

 

 "i+/L(r+   )
NNNNNNNNNr   )NNNNTNF)r#   r$   r%   output_modalities_tied_weights_keysr8   r  r  r9   r   r  r  r   r   r'   r(   r   rQ   r   rR   rP   r   r   rS   r
   rO   r  rT   rU   s   @r,   r  r  3  s   )*,RS1/ryy 8  .215+/.204(,26!%*.-.Y
##d*Y
 ''$.Y
 \\D(	Y

 t+Y
 &&-Y
 Y
 ((4/Y
 $;Y
   4'Y
 ell*Y
 +,Y
 
'	'Y
  Y
|   r+   r  )r  r  r  r  r/  r  )Qr  dataclassesr   	functoolsr   r'   torch.nnr9   torch.nn.functional
functionalr    r   r=  cache_utilsr   
generationr   modeling_outputsr	   r
   modeling_utilsr   processing_utilsr   utilsr   r   r   r   utils.genericr   utils.output_capturingr   chameleon.modeling_chameleonr   r   llama.modeling_llamar   r   r   r   r   siglip.modeling_siglipr   configuration_emu3r   r   r   
get_loggerr#   loggerr!   r.   r2   r   rW   r|   r~   r   r   r   r   r   r   r   r   r   r   r   r  r  r$  r/  rw  r  r  r  r  r  __all__r*   r+   r,   <module>r     s6     ! %     &   ) R - & V V 7 5 w v 4 K K 
		H	% 15 1  1	N 	
!( !H$ryy $D	%H 	299 bii :!299 !H		 .")) &.(299 .(b<(299 <(~&o &V V299 D8 8v7ryy 7tCryy CLCryy CL ~2 ~2~2B3% 3%l	2 	
J 3 
&(;_ :X# XvQ#6 Qhr+   