
    iK                        d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZmZmZ ddlmZmZmZ ddlmZ ddlmZ  ej@                  e!      Z"d Z# G d dejH                        Z%d Z&d.dZ'	 d/dejH                  dejP                  dejP                  dejP                  dejP                  dz  de)de)fdZ* G d d ejH                        Z+ G d! d"ejH                        Z, G d# d$ejH                        Z- G d% d&e
      Z. G d' d(ejH                        Z/e G d) d*e             Z0d+ Z1e G d, d-e0             Z2d-d*gZ3y)0zPyTorch Pixtral model.    )Callable)OptionalN)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutput)dynamic_rope_update)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)is_flash_attention_requestedmaybe_autocastmerge_with_config_defaults)capture_outputs   )PixtralVisionConfigc                    g }| D ]  }|j                   dd  \  }}t        j                  t        j                  |      t        j                  |      d      }t        j                  |d      j                  dd      j                  dd      \  }}||z  |z   }	|j                  |	d d df           t        j                  |      S )Nij)indexingdim   r   )	shapetorchmeshgridarangestackreshapechunkappendcat)
patch_embeds_list	max_width	positionspatchheightwidthmeshh_gridv_grididss
             }/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/pixtral/modeling_pixtral.pyposition_ids_in_meshgridr3   %   s    I" $BC(~~ell62ELL4GRVWTr2::2qAGG2Ny 6)QT#$ 99Y    c                        e Zd ZU dZej
                  ed<   ddef fdZe		 	 	 ddedz  de
d   dedz  d	ed
ef   fd       Z ej                         ed               Z xZS )PixtralRotaryEmbeddinga  
    The key with pixtral embedding is just that you have a frequency for each pixel positions.
    If you have height x width pixels (or embedding pixels), then the frequency used for ROPE
    is given by indexing the pre_computed frequency on the width and height.

    What you output is of dimension (batch, height * width, dim) with dim the embed dim.

    This simply means that for each image hidden state, you are going to add
    a corresponding positional embedding, based on its index in the grid.
    inv_freqNconfigc                    t         |           || _        | j                  j                  d   | _        | j
                  }| j                  dk7  r/t        | j                  j                   d| j                   d       || j                  |      \  }}| j                  d|d       | j                  d|j                         d       y )	N	rope_typedefaultz7 does not support non-default RoPE, but got `rope_type=`r7   F)
persistentoriginal_inv_freq)super__init__r8   rope_parametersr:   compute_default_rope_parameters
ValueError	__class____name__register_bufferclone)selfr8   device
layer_typerope_init_fnr7   attention_scalingrD   s          r2   r@   zPixtralRotaryEmbedding.__init__>   s    44[A!%!E!E>>Y&>>**++bcgcqcqbrrst  '34;;&G##ZeD0(..2BuUr4   rI   ztorch.deviceseq_lenreturnztorch.Tensorc                    | j                   d   }t        | dd      xs | j                  | j                  z  }d}| j                  | j
                  z  }t        j                  |      }t        j                  |      }d|t        j                  d|d      j                         |z  z  z  }	t        j                  ||	ddd         j                         }
t        j                  ||	ddd         j                         }t        j                  |
dddddf   j                  d|d      |dddddf   j                  |dd      gd	      j                  d|dz        }t        j                  ||fd	      }||fS )
a  
        Computes the inverse frequencies according to the original RoPE implementation
        Args:
            config ([`~transformers.PreTrainedConfig`]):
                The model configuration.
            device (`torch.device`):
                The device to use for initialization of the inverse frequencies.
            seq_len (`int`, *optional*):
                The current sequence length. Unused for this type of RoPE.
        Returns:
            Tuple of (`torch.Tensor`, `float`), containing the inverse frequencies for the RoPE embeddings and the
            post-processing scaling factor applied to the computed cos/sin (unused in this type of RoPE).
        
rope_thetahead_dimNg      ?r   r   r   r   r   )rA   getattrhidden_sizenum_attention_heads
image_size
patch_sizer    r"   floatouterr'   repeatr$   )r8   rI   rM   baser   attention_factormax_patches_per_sidehwfreqsfreqs_hfreqs_wr7   s                r2   rB   z6PixtralRotaryEmbedding.compute_default_rope_parametersN   so   & %%l3fj$/c63E3EIcIc3c  &00F4E4EELL-.LL-.tQQ 7 = = ?# EFG++ass,224++aqt!t-335994
#**1.BAFa
#**+?AF 
 '"cQh
 	 99h1r:)))r4   c                    | j                   |   }t        |j                  j                  t              r/|j                  j                  dk7  r|j                  j                  nd}t        |d      5  |}|j                         }|j                         }d d d        j                  |j                        j                  |j                        fS # 1 sw Y   AxY w)NmpscpuF)device_typeenableddtype)
r7   
isinstancerI   typestrr   cossintorh   )rH   xposition_idsr_   re   embrl   rm   s           r2   forwardzPixtralRotaryEmbedding.forward{   s     l+'1!((--'E!((--[`J`ahhmmfkUC 	C'')C'')C	
 vvAGGv$cff177f&;;;	 	s   2#CCNN)NNN)rE   
__module____qualname____doc__r    Tensor__annotations__r   r@   staticmethodr   inttuplerW   rB   no_gradr
   rr   __classcell__rD   s   @r2   r6   r6   0   s    	 llV2 V  -1+/"**#d***(** t** 
~u$	%	** **X U]]_<  <r4   r6   c                     | dd| j                   d   dz  f   }| d| j                   d   dz  df   }t        j                  | |fd      S )z*Rotates half the hidden dims of the input..Nr   r   r   )r   r    r'   )ro   x1x2s      r2   rotate_halfr      sZ    	
3"!''"+"""	#B	
3q ""	#B99rc2YB''r4   c                     |j                  |      }|j                  |      }| |z  t        |       |z  z   }||z  t        |      |z  z   }||fS )a  Applies Rotary Position Embedding to the query and key tensors.

    Args:
        q (`torch.Tensor`): The query tensor.
        k (`torch.Tensor`): The key tensor.
        cos (`torch.Tensor`): The cosine part of the rotary embedding.
        sin (`torch.Tensor`): The sine part of the rotary embedding.
        unsqueeze_dim (`int`, *optional*, defaults to 1):
            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
    Returns:
        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
    )	unsqueezer   )qkrl   rm   unsqueeze_dimq_embedk_embeds          r2   apply_rotary_pos_embr      sY    $ --
&C
--
&C3w;q>C/0G3w;q>C/0GGr4   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr   r   )r   rh   )ptrainingr   r   )r    matmul	transposer   
functionalsoftmaxfloat32rn   rh   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r2   eager_attention_forwardr      s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r4   c                        e Zd ZdZ fdZ	 	 d
dej                  dej                  dz  deej                  ej                  f   dz  dee	   deej                  ej                  dz  f   f
d	Z
 xZS )PixtralAttentionzI
    Multi-headed attention compatible with ALL_ATTENTION_FUNCTIONS.
    c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        d| _        | j                  dz  | _	        d| _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        y )NFg      ࿩bias)r?   r@   r8   rS   	embed_dimrT   	num_headsrQ   	is_causalr   attention_dropoutr   r   Lineark_projv_projq_projo_projrH   r8   rD   s     r2   r@   zPixtralAttention.__init__   s    ++33$..8}}d*//iiUKiiUKiiUKiiUKr4   Nhidden_statesr   position_embeddingsr   rN   c                 B   |j                         \  }}}| j                  |      }| j                  |      }	| j                  |      }
|j	                  ||| j
                  | j                        j                  dd      }|	j	                  ||| j
                  | j                        j                  dd      }	|
j	                  ||| j
                  | j                        j                  dd      }
|\  }}t        ||	||d      \  }}	t        j                  | j                  j                  t              } || ||	|
|f| j                  sdn| j                  | j                   d|\  }}|j#                  ||d      j%                         }| j'                  |      }||fS )z#Input shape: Batch x Time x Channelr   r   r   )r           )r   r   r   )sizer   r   r   viewr   rQ   r   r   r   get_interfacer8   _attn_implementationr   r   r   r   r$   r   r   )rH   r   r   r   r   
batch_sizepatches_query_states
key_statesvalue_statesrl   rm   attention_interfacer   r   s                   r2   rr   zPixtralAttention.forward   s    "/!3!3!5
GQ{{=1[[/
{{=1#((Wdnndmm\ffghjkl__Z$..$--Xbbcdfgh
#((Wdnndmm\ffghjkl&S#7jRUWZjk#l j(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ "))*grBMMOkk+.L((r4   rs   )rE   rt   ru   rv   r@   r    rw   r{   r   r   rr   r}   r~   s   @r2   r   r      s    L* /3HL	()||() t+() #5<<#=>E	()
 +,() 
u||U\\D00	1()r4   r   c                   $     e Zd Z fdZd Z xZS )
PixtralMLPc                    t         |           || _        |j                  | _        |j                  | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _	        t        |j                     | _        y )NFr   )r?   r@   r8   rS   intermediate_sizer   r   	gate_projup_proj	down_projr   
hidden_actact_fnr   s     r2   r@   zPixtralMLP.__init__  s    !--!'!9!94#3#3T5K5KRWXyy!1!143I3IPUV4#9#94;K;KRWXV../r4   c                     | j                  | j                  | j                  |            | j                  |      z        }|S N)r   r   r   r   )rH   ro   r   s      r2   rr   zPixtralMLP.forward  s6    NN4;;t~~a/@#ADLLQRO#ST	r4   )rE   rt   ru   r@   rr   r}   r~   s   @r2   r   r     s    0r4   r   c                   h     e Zd Zddeddf fdZdej                  dej                  fdZd Z xZ	S )	PixtralRMSNormepsrN   Nc                     t         |           t        j                  t	        j
                  |            | _        || _        y)z=
        PixtralRMSNorm is equivalent to T5LayerNorm
        N)r?   r@   r   	Parameterr    onesweightvariance_epsilon)rH   rS   r   rD   s      r2   r@   zPixtralRMSNorm.__init__  s1     	ll5::k#:; #r4   r   c                 "   |j                   }|j                  t        j                        }|j	                  d      j                  dd      }|t        j                  || j                  z         z  }| j                  |j                  |      z  S )Nr   r   T)keepdim)	rh   rn   r    r   powmeanrsqrtr   r   )rH   r   input_dtypevariances       r2   rr   zPixtralRMSNorm.forward  sy    #))%((7 $$Q',,R,>%Ht?T?T4T(UU{{]--k:::r4   c                 ^    t        | j                  j                         d| j                   S )Nz, eps=)r{   r   r   r   rH   s    r2   
extra_reprzPixtralRMSNorm.extra_repr%  s*    ))*+6$2G2G1HIIr4   )gư>)
rE   rt   ru   rW   r@   r    rw   rr   r   r}   r~   s   @r2   r   r     s7    $ $$ $;U\\ ;ell ;Jr4   r   c                        e Zd Z fdZ	 d	dej
                  dej
                  deej
                  ej
                  f   dz  dee   dej
                  f
dZ	 xZ
S )
PixtralAttentionLayerc                     t         |           t        |j                  d      | _        t        |      | _        t        |      | _        t        |j                  d      | _	        y )Nh㈵>r   )
r?   r@   r   rS   attention_normr   feed_forwardr   	attentionffn_normr   s     r2   r@   zPixtralAttentionLayer.__init__*  sP    ,V-?-?TJ&v.)&1&v'9'9tDr4   Nr   r   r   r   rN   c                     |}| j                  |      } | j                  d|||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )aR  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
        )r   r   r    )r   r   r   r   )rH   r   r   r   r   residualr   s          r2   rr   zPixtralAttentionLayer.forward1  s     !++M:)4>> 
') 3
 	
q !=0 m4))-8 =0r4   r   )rE   rt   ru   r@   r    rw   r{   r   r   rr   r}   r~   s   @r2   r   r   )  sn    E IM	||  #5<<#=>E	
 +, 
r4   r   c            
            e Zd Z fdZ	 	 ddej
                  dz  deej
                  ej
                  f   dz  dee   dee	z  fdZ
 xZS )	PixtralTransformerc                     t         |           || _        t        j                  j                         | _        t        |j                        D ]&  }| j                  j                  t        |             ( d| _        y )NF)r?   r@   r8   r    r   
ModuleListlayersrangenum_hidden_layersr&   r   gradient_checkpointing)rH   r8   r   rD   s      r2   r@   zPixtralTransformer.__init__S  sc    hh))+v//0 	>AKK4V<=	>&+#r4   Nr   r   r   rN   c                 X    |}| j                   D ]  } |||fd|i|} t        |      S )aI  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Embeddings which serve as input to the Transformer.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
        r   )last_hidden_state)r   r	   )rH   inputs_embedsr   r   r   r   encoder_layers          r2   rr   zPixtralTransformer.forward[  sM    & &![[ 	M) %8 	M	 ??r4   rs   )rE   rt   ru   r@   r    rw   r{   r   r   r	   rr   r}   r~   s   @r2   r   r   R  sm    , /3HL	@ t+@ #5<<#=>E	@
 +,@ 
	 @r4   r   c                   H    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdZdgZeedZy)	PixtralPreTrainedModelr8   modelpixel_values)imageTr   )r   
attentionsN)rE   rt   ru   r   rx   base_model_prefixmain_input_nameinput_modalitiessupports_gradient_checkpointing_supports_attention_backend_supports_flash_attn_supports_sdpa_supports_flex_attn_no_split_modulesr   r   _can_record_outputsr   r4   r2   r   r   z  sM    $O!&*#"&N01.&r4   r   c                    |j                   }|j                  }|j                  d   }t        j                  |      j
                  }t        j                  ||f|||      }t        j                  |       j                  d      }t        j                  dg| d d z         j                  d      }t        ||      D ]  \  }	}
d||	|
|	|
f<    |d d d d d d f   j                  |j                  d   ddd      }|S )Nr   )
fill_valuerh   rI   r   r   )rh   rI   r   r    finfominfulltensorcumsumzipexpand)r(   r  rh   rI   rM   d_mincausal_maskblock_end_idxblock_start_idxstartends              r2   generate_block_attention_maskr    s    LLE]]Fll1oGKK""E**gw/EW]^KLL!23::2>MllA3):3B)?#?@GGKO/=9 .
s,-E#IuSy(). dD!Q./66v||A2rRKr4   c                        e Zd ZdZ fdZd Zeee	 d
de	j                  de	j                  dz  dee   deez  fd	                     Z xZS )PixtralVisionModelvision_encoderc                 z   t         |   |       || _        t        j                  |j
                  |j                  |j                  |j                  d      | _        |j                  | _        t        |j                  d      | _
        t        |      | _        t        |      | _        | j                          y )NF)in_channelsout_channelskernel_sizestrider   r   r   )r?   r@   r8   r   Conv2dnum_channelsrS   rV   
patch_convr   ln_prer   transformerr6   patch_positional_embedding	post_initr   s     r2   r@   zPixtralVisionModel.__init__  s     ))++++))$$
 !++$V%7%7TB-f5*@*H'r4   c                     | j                   S r   )r  r   s    r2   get_input_embeddingsz'PixtralVisionModel.get_input_embeddings  s    r4   Nr   image_sizesr   rN   c           
         ||j                   \  }}}}||fg|z  }| j                  j                  j                  }| j                  |j	                  |            }	t        |	|      D 
cg c]1  \  }
}|
dd |d   | j                  z  d |d   | j                  z  f   3 }}
}t        j                  |D cg c]  }|j                  d      j                   c}d      j                  d      }	| j                  |	      }	t        || j                  j                  | j                  j                  z        }|j                  d      j	                  |	j                   d      |d	<   | j#                  |	|      }t%        | j                        rd }n7t'        |D cg c]!  }|j                   d
   |j                   d   z  # c}|	      } | j(                  |	f||d|S c c}}
w c c}w c c}w )Nrg   .r   r   r   )r)   T)non_blockingrp   r   r   )r   r   )r   r  r   rh   rn   r	  rV   r    r'   flattenTr   r  r3   r8   rU   rI   r  r   r  r  )rH   r   r#  r   r   r   r,   r-   target_dtypepatch_embedsembedr   r(   r   rp   r   r   s                    r2   rr   zPixtralVisionModel.forward  s    +7+=+=(J65"E?+j8K --33|\'JK  #<=
t #5$q'T__457U$q'T__:T7UUV
 
 yy:K!LQ!))A,..!LRST^^_`a{{<0 0)?)?4;;CYCY)Y
 ".!7!7!:!=!=l>Q>Q`d!=!e~"==lLY'4!N:4EFqqwwr{*FN  t
) 3
 	
 	
3
 "M  Gs   ,6G!8"G'&G,r   )rE   rt   ru   r   r@   r"  r   r   r   r    rw   r   r   r{   r	   rr   r}   r~   s   @r2   r  r    sz    ("   ,0+
ll+
 \\D(+
 +,	+

 
	 +
    +
r4   r  )r   )r   )4rv   collections.abcr   typingr   r    r   activationsr   modeling_layersr   modeling_outputsr	   modeling_rope_utilsr
   modeling_utilsr   r   processing_utilsr   utilsr   r   r   utils.genericr   r   r   utils.output_capturingr   configuration_pixtralr   
get_loggerrE   loggerr3   Moduler6   r   r   rw   rW   r   r   r   r   r   r   r   r  r  __all__r   r4   r2   <module>r;     sw    $    ! 9 / 6 F & @ @ e e 5 6 
		H	% U<RYY U<r(B %II%<<% 
% <<	%
 LL4'% % %.?)ryy ?)F "JRYY J(&6 &R%@ %@P _  "  E
/ E
 E
P  !9
:r4   