
    i~A                     Z   d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z	 ddl
mZ dd	lmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlmZ  ej2                  e      Ze G d de             Z G d dej:                        Z	 d%dej:                  dej>                  dej>                  dej>                  dej>                  dz  de de fdZ! G d dej:                        Z" G d dej:                        Z# G d d e      Z$ G d! d"ej:                        Z% G d# d$ej:                        Z&y)&zTPyTorch IdeficsVision model: a copy of CLIPVisionModel using a simpler config object    N)Callable)	dataclass)nn   )ACT2FN)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONS)Unpack)ModelOutputTransformersKwargslogging   )IdeficsVisionConfigc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)IdeficsVisionModelOutputa  
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.

    Args:
        image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
            The image embeddings obtained by applying the projection layer to the pooler_output.
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.

            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.

            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__r   r   tupler        s/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/idefics/vision.pyr   r   '   sr    * .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r!   r   c                        e Zd Zdef fdZdej                  dededej                  fdZddej                  d	e
dej                  fd
Z xZS )IdeficsVisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebias   r   position_ids)r   )
persistent)super__init__r%   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferarangeexpandselfr%   	__class__s     r"   r1   z IdeficsVisionEmbeddings.__init__F   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr!   
embeddingsheightwidthreturnc                    |j                   d   dz
  }| j                  | j                        }|j                   d   dz
  }||k(  r||k(  r|S |dddf   }|ddddf   }|j                   d   }	|| j                  j                  z  }
|| j                  j                  z  }|
dz   |dz   }}
t        j                  |      }|j                  dt        |      t        |      |	      }|j                  dddd      }|j                  t        j                  k(  }|r4t        j                  d       |j                  t        j                         }t"        j$                  j'                  ||
|z  ||z  fd	d
      }|r|j                  t        j                        }t        |
      |j                   d   k7  st        |      |j                   d   k7  rBt)        dt        |
      t        |      f d|j                   d   |j                   d   f d      |j                  dddd      j+                  dd|	      }t        j,                  |j/                  d      |fd      S )a#  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
        resolution images.

        Source:
        https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174
        r   Nr   r.   g?r   r,   zUpcasting patch_pos_embed to fp32 for interpolation since `upsample_bicubic2d_out_frame` in nn.functional.interpolate is not implemented for 'torch.bfloat16' dtype. This will result in a slight overhead.bicubicF)scale_factormodealign_cornerszNumber of patches for images (z/) don't match the shape of position embedding ()dim)shaper?   r-   r%   r5   mathsqrtreshapeintpermutedtyper   bfloat16loggerwarning_oncetofloatr   
functionalinterpolate
ValueErrorviewcat	unsqueeze)rD   rF   rG   rH   r<   	pos_embedr=   class_pos_embedpatch_pos_embedr3   num_h_patchesnum_w_patchessqrt_num_positionsfp32_upcastings                 r"   interpolate_pos_encodingz0IdeficsVisionEmbeddings.interpolate_pos_encoding]   sf    !&&q)A-++D,=,=>	!*Q.-'FeO#AqD/#AqrE*$$R(	$++"8"88!7!77 (5s':MC<O}!YY}5)11!S9K5LcRdNegpq)11!Q1=(..%..@h .00=O--33'*<<mN`>`a	 4 
 -00@O}!6!6r!::c->PTcTiTijlTm>m0]1CSEW1W0X Y00?0E0Eb0I?K`K`acKd0d/eefh  *11!Q1=BB1b)Tyy/33A6HaPPr!   pixel_valuesrl   c                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model (z8). You should try to set `interpolate_pos_encoding=True`)rY   r,   r   r.   rQ   )rS   r4   ra   r;   weightrY   r]   flatten	transposer8   rB   r   rc   rl   r?   r-   )rD   rm   rl   
batch_sizer:   rG   rH   target_dtypepatch_embedsclass_embedsrF   s              r"   forwardzIdeficsVisionEmbeddings.forward   s8   2>2D2D/
L&%'(ET__,D (% 9)4??*;;su 
 ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
 $#d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr!   )F)r   r   r   r   r1   r   TensorrW   rl   r   boolrw   __classcell__rE   s   @r"   r$   r$   E   sm    q2 q./Q5<< /Q /QUX /Q]b]i]i /QbE$5$5 QU bgbnbn r!   r$   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr.   rO   )rR   rY   )ptrainingr   r,   )r   matmulrr   r   r_   softmaxfloat32r]   rY   r   r   
contiguous)
r|   r}   r~   r   r   r   r   kwargsattn_weightsattn_outputs
             r"   eager_attention_forwardr      s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r!   c                        e Zd ZdZdef fdZ	 d
dej                  dej                  dz  dee	   de
ej                  ej                  dz  f   fd	Z xZS )IdeficsVisionAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr%   c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).g      F)r0   r1   r%   r2   r3   num_attention_heads	num_headshead_dimra   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projrC   s     r"   r1   zIdeficsVisionAttention.__init__   s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar!   Nr   r   r   rI   c                    |j                   dd }g |d| j                  }| j                  |      }| j                  |      }| j	                  |      }|j                  |      j                  dd      }|j                  |      j                  dd      }|j                  |      j                  dd      }t        j                  | j                  j                  t              }	 |	| ||||f| j                  | j                  | j                  sdn| j                  d|\  }
} |
j                   g |d j#                         }
| j%                  |
      }
|
|fS )z#Input shape: Batch x Time x ChannelNr.   r   r,           )r   r   r   )rS   r   r   r   r   rb   rr   r   get_interfacer%   _attn_implementationr   r   r   r   r   rV   r   r   )rD   r   r   r   input_shapehidden_shapequerieskeysvaluesattention_interfacer   r   s               r"   rw   zIdeficsVisionAttention.forward   sV    $))#2.88b8$--8++m,{{=)]+,,|,66q!<yy&00A6\*44Q:(?(M(MKK,,.E)
 %8
%
 nnJJ#}}C$,,
%
 
%
!\ *k));;;;FFHmmK0L((r!   N)r   r   r   r   r   r1   r   rx   r   r   r   rw   rz   r{   s   @r"   r   r      so    GB2 B. /3%)||%) t+%) +,	%)
 
u||U\\D00	1%)r!   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )IdeficsVisionMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )r0   r1   r%   r   
hidden_actactivation_fnr   r   r2   intermediate_sizefc1fc2rC   s     r"   r1   zIdeficsVisionMLP.__init__  sd    #F$5$5699V//1I1IJ99V55v7I7IJr!   r   rI   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )r   r   r   )rD   r   s     r"   rw   zIdeficsVisionMLP.forward  s4    /**=9/r!   )r   r   r   r1   r   rx   rw   rz   r{   s   @r"   r   r      s$    KU\\ ell r!   r   c                        e Zd Zdef fdZdej                  dej                  dee   de	ej                  ej                  dz  f   fdZ xZS )	IdeficsVisionEncoderLayerr%   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y N)eps)r0   r1   r2   r3   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2rC   s     r"   r1   z"IdeficsVisionEncoderLayer.__init__  sm    ++/7<<F<Q<QR#F+<<F<Q<QRr!   r   r   r   rI   Nc                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r   r   r    )r   r   r   r   )rD   r   r   r   residual_s         r"   rw   z!IdeficsVisionEncoderLayer.forward  s     !((7)4>> 
')
 
q
 !=0 ((7/ =0r!   )r   r   r   r   r1   r   rx   r   r   r   r   rw   rz   r{   s   @r"   r   r     sd    S2 S||  +,	
 
u  %,,"55	6r!   r   c                   f     e Zd ZdZdef fdZ	 d	dej                  dz  dee	   de
ez  fdZ xZS )
IdeficsVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`IdeficsVisionEncoderLayer`].

    Args:
        config: IdeficsVisionConfig
    r%   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
r0   r1   r%   r   
ModuleListrangenum_hidden_layersr   layersgradient_checkpointing)rD   r%   r   rE   s      r"   r1   zIdeficsVisionEncoder.__init__;  sQ    mmPUV\VnVnPo$p1%>v%F$pq&+# %qs   A#Nr   r   rI   c                 T    |}| j                   D ]  } |||fi |} t        |      S )a8  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)

        )r   )r   r	   )rD   inputs_embedsr   r   r   encoder_layers         r"   rw   zIdeficsVisionEncoder.forwardA  sH    * &![[ 	M) M	 +
 	
r!   r   )r   r   r   r   r   r1   r   rx   r   r   r   r	   rw   rz   r{   s   @r"   r   r   2  sQ    ,2 , /3
 t+
 +,	

 
	 
r!   r   c                   d     e Zd Zdef fdZ	 	 ddej                  dz  dedz  dee	z  fdZ
 xZS )	IdeficsVisionTransformerr%   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )r0   r1   r%   r2   r$   rF   r   r   r   pre_layrnormr   encoderpost_layernorm)rD   r%   r3   rE   s      r"   r1   z!IdeficsVisionTransformer.__init__e  sj    &&	1&9LL8M8MN+F3 ll9&:O:OPr!   Nrm   rl   rI   c                     |t        d      | j                  ||      }| j                  |      } | j                  dd|i|}|j                  }|dddddf   }| j                  |      }t        ||      S )z
        Returns:

        Nz You have to specify pixel_values)rl   r   r   )r   pooler_outputr    )ra   rF   r   r   r   r   r
   )rD   rm   rl   r   r   encoder_outputsr   pooled_outputs           r"   rw   z IdeficsVisionTransformer.forwardp  s     ?@@Ogh))-8+74<< ,
',
,

 ,==)!Q'2++M:)/'
 	
r!   r   )r   r   r   r   r1   r   r   ry   r   r
   rw   rz   r{   s   @r"   r   r   d  sP    Q2 Q 2605
''$.
 #'+

 
+	+
r!   r   )r   )'r   rT   collections.abcr   dataclassesr   r   r   activationsr   modeling_layersr   modeling_outputsr	   r
   modeling_utilsr   processing_utilsr   utilsr   r   r   configuration_ideficsr   
get_loggerr   r[   r   Moduler$   rx   r^   r   r   r   r   r   r   r    r!   r"   <module>r      s/   [  $ !   ! 9 K 5 & 
 7 
		H	% <{ < <:`bii `V %II%<<% 
% <<	%
 LL4'% % %.<)RYY <)@ryy   : D.
299 .
d(
ryy (
r!   