
    iC                        d dl mZ d dlZd dlmZ d dlmZ ddlmZ	 ddl
mZ ddlmZmZ ddlmZmZ dd	lmZ dd
lmZmZmZ ddlmZmZmZmZmZmZmZ ddl m!Z! ddl"m#Z#m$Z$  ejJ                  e&      Z' ed      e G d de                    Z( G d de      Z) G d de#      Z* G d de      Z+ G d de      Z, G d de      Z- G d de      Z.e G d d e             Z/ G d! d"e      Z0 G d# d$e      Z1g d%Z2y)&    )CallableN)strict   )initialization)PreTrainedConfig)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging   )CLIPMLPCLIPAttentionCLIPEncoderCLIPEncoderLayerCLIPVisionEmbeddingsCLIPVisionModelCLIPVisionTransformer)eager_attention_forward)VisionRotaryEmbeddingapply_rotary_pos_emb_visionz&DeepGlint-AI/mlcd-vit-bigG-patch14-336)
checkpointc                      e Zd ZU dZdZdZdZeed<   dZ	eed<   dZ
eed	<   d
Zeed<   dZeed<   dZeed<   dZeee   z  eeef   z  ed<   dZeee   z  eeef   z  ed<   dZeed<   dZeed<   dZeez  ed<   dZeed<   dZeed<   y)MLCDVisionConfigav  
    num_key_value_groups (`int`, *optional*, defaults to 1):
        Number of key-value groups used in Attention.

    Example:

    ```python
    >>> from transformers import MLCDVisionConfig, MLCDVisionModel

    >>> # Initializing a MLCDVisionConfig with DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> configuration = MLCDVisionConfig()

    >>> # Initializing a MLCDVisionModel (with random weights) from the DeepGlint-AI/mlcd-vit-bigG-patch14-336 style configuration
    >>> model = MLCDVisionModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```mlcd_vision_modelvision_configi  hidden_sizei    intermediate_size0   num_hidden_layers   num_attention_heads   num_key_value_groupsr   num_channelsiP  
image_size   
patch_sizegelu
hidden_actgh㈵>layer_norm_eps        attention_dropoutg{Gz?initializer_range      ?initializer_factorN)__name__
__module____qualname____doc__
model_typebase_config_keyr    int__annotations__r!   r#   r%   r'   r(   r)   listtupler+   r-   strr.   floatr0   r1   r3        v/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/mlcd/modular_mlcd.pyr   r   *   s    & %J%OK!s!s!! !#!L#47Jd3i%S/1746Jd3i%S/16J NE %(us{(#u# ##rA   r   c                       e Zd Zy)MLCDMLPN)r4   r5   r6   r@   rA   rB   rD   rD   R   s    rA   rD   c                   4    e Zd Zdededej
                  fdZy)MLCDRotaryEmbeddingnum_patches_heightnum_patches_widthreturnc                    t        j                  || j                  j                        j	                  d      j                  d|      }t        j                  || j                  j                        j	                  d      j                  |d      }t        j                  |j                         |j                         gd      }t        ||      }t        j                  || j                  j                  | j                  j                        }t        j                  || j                        }||   j                  d      }	|	S )a}  
        Calculate the Rotary Position Embedding (RoPE) for MLCDVisionModel based on the grid size.

        Args:
            num_patches_height (int): Number of patches in the height dimension.
            num_patches_width (int): Number of patches in the width dimension.

        Returns:
            torch.Tensor: Rotary positional embeddings for the given grid size.
        )devicer&   r   dim)rK   dtype)torcharangeinv_freqrK   	unsqueezeexpandstackflattenmaxrO   outer)
selfrG   rH   hpos_idswpos_idspos_idsmax_grid_sizeseqrotary_pos_emb_fullrotary_pos_embs
             rB   forwardzMLCDRotaryEmbedding.forwardW   s	    LL+DMM4H4HISSTUV]]^`bst 	 LL*4==3G3GHRRSTU\\]oqst 	
 ++x//183C3C3EFBO .0ABll=1E1ET]]M`M`a#kk#t}}= -W5==a@rA   N)r4   r5   r6   r:   rP   Tensorra   r@   rA   rB   rF   rF   V   s     # # %,, rA   rF   c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZ	S )MLCDVisionEmbeddingsconfigc                 (    t         |   |       | `y N)super__init__position_embeddingrY   re   	__class__s     rB   ri   zMLCDVisionEmbeddings.__init__y   s     #rA   pixel_valuesrI   c                 T   |j                   d   }| j                  j                  j                  }| j                  |j	                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }t        j                  ||gd      }|S )Nr   rO   r   r&   rL   rM   )shapepatch_embeddingweightrO   torV   	transposeclass_embeddingrT   rP   cat)rY   rm   
batch_sizetarget_dtypepatch_embedsclass_embeds
embeddingss          rB   ra   zMLCDVisionEmbeddings.forward}   s    !''*
++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
rA   )
r4   r5   r6   r   ri   rP   FloatTensorrb   ra   __classcell__rl   s   @rB   rd   rd   x   s-    $/ $
E$5$5 
%,, 
rA   rd   c                        e Zd ZdZdef fdZ	 ddej                  deej                  ej                  f   dej                  dz  de	e
   d	eej                  ej                  dz  f   f
d
Z xZS )MLCDAttentionzMulti-headed attention with RoPE. Refer to papers:
    - Attention is all you need:
        https://huggingface.co/papers/1706.03762
    - RoFormer: Enhanced Transformer with Rotary Position Embedding:
        https://huggingface.co/papers/2104.09864
    re   c                 T    t         |   |       |j                  | _        d| _        y )NF)rh   ri   r'   	is_causalrk   s     rB   ri   zMLCDAttention.__init__   s%     $*$?$?!rA   Nhidden_statesposition_embeddingsattention_maskkwargsrI   c                    |j                   d d \  }}| j                  |      j                  ||| j                  | j                  f      }| j                  |      j                  ||| j                  | j                  f      }| j                  |      j                  ||| j                  | j                  f      }	|d   j                  d      j                         }
|d   j                  d      j                         }t        |||
|      \  }}|j                  dddd      j                         }|j                  dddd      j                         }|	j                  dddd      j                         }	t        j                  | j                  j                  t               } || |||	|f| j"                  sdn| j$                  | j&                  | j(                  d|\  }}|j                  dddd      j                         }|j+                  ||d      }| j-                  |      }|j                  ddd      j                         }||fS )NrL   r   r&   r   r   r/   )dropoutscalingr   )rp   q_projreshape	num_headshead_dimk_projv_projrS   r?   r   permute
contiguousr
   get_interfacere   _attn_implementationr   trainingr   scaler   viewout_proj)rY   r   r   r   r   rw   
seq_lengthquery_states
key_statesvalue_statescossinattention_interfaceattn_outputattn_weightss                  rB   ra   zMLCDAttention.forward   s;    "/!4!4Sb!9
J {{=199:zSWSaSacgcpcp:qr[[/77ZQUQ_Q_aeanan8op
{{=199:zSWSaSacgcpcp:qr "!$..q1779!!$..q1779#>|ZY\^a#b j $++Aq!Q7BBD''1a3>>@
#++Aq!Q7BBD(?(M(MKK,,.E)
 %8
%
  $}}C$,,JJnn
%
 
%
!\ "))!Q15@@B!&&z:rBmmK0!))!Q2==?L((rA   rg   )r4   r5   r6   r7   r   ri   rP   rb   r=   r   r   ra   r}   r~   s   @rB   r   r      s    /  /3	,)||,) #5<<#=>,) t+	,)
 +,,) 
u||U\\D00	1,)rA   r   c                        e Zd Zdef fdZ	 d
dej                  deej                  ej                  f   dej                  dz  dee	   deej                     f
d	Z xZS )MLCDEncoderLayerre   c                 D    t         |   |       t        |      | _        y rg   )rh   ri   r   	self_attnrk   s     rB   ri   zMLCDEncoderLayer.__init__   s     &v.rA   Nr   r   r   r   rI   c                     |}| j                  |      } | j                  d|||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )a  
        Args:
            hidden_states (`torch.FloatTensor`):
                Input to the layer of shape `(batch, seq_len, embed_dim)`.
                Represents the hidden states from the previous layer or the input embeddings.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.FloatTensor`):
                Attention mask of shape `(batch, 1, q_len, k_v_seq_len)` where padding elements are indicated by very large negative values.
        )r   r   r   r@   )layer_norm1r   layer_norm2mlp)rY   r   r   r   r   residual_s          rB   ra   zMLCDEncoderLayer.forward   s    $ !((7)4>> 
' 3)
 	
q !=0 ((7/ =0rA   rg   )r4   r5   r6   r   ri   rP   rb   r=   r   r   r|   ra   r}   r~   s   @rB   r   r      sz    // / /3	"||" #5<<#=>" t+	"
 +," 
u  	!"rA   r   c                        e Zd ZdZdef fdZ	 ddej                  deej                  ej                  f   dej                  dz  de
e   d	eez  f
d
Z xZS )MLCDEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`MLCDEncoderLayer`].

    Args:
        config: MLCDVisionConfig
    re   c                 $    t         |   |       y)z3Overwrite dummy `MLCDConfig` to `MLCDVisionConfig`.N)rh   ri   rk   s     rB   ri   zMLCDEncoder.__init__   s     rA   Ninputs_embedsr   r   r   rI   c                 V    |}| j                   D ]  } ||||fi |} t        |      S )a=  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            position_embeddings (`tuple[torch.Tensor, torch.Tensor]`):
                A tuple of two tensors, each of shape `(batch, seq_len, embed_dim)`.
                Represents absolute positional embeddings for the query and key in the attention mechanism.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.
                [What are attention masks?](../glossary#attention-mask)
        )last_hidden_state)layersr   )rY   r   r   r   r   r   encoder_layers          rB   ra   zMLCDEncoder.forward   sK    , &![[ 	M)# 	M	 +
 	
rA   rg   )r4   r5   r6   r7   r   ri   rP   r|   r=   rb   r   r   r   ra   r}   r~   s   @rB   r   r      s{    !/ ! /3	!
((!
 #5<<#=>!
 t+	!

 +,!
 
	 !
rA   r   c                   l    e Zd ZU eed<   dZdZdZdZdZ	dZ
dZeedZ ej                          d        Zy)MLCDPreTrainedModelre   mlcdTF)r   
attentionsc                 &	   | j                   j                  }t        |t              r| j                   j                  }t	        j
                  |j                  d|j                  dz  |z         t	        j
                  |j                  j                  |j                   j                  |z         t	        j                  |j                  t        j                  |j                  j                  d         j!                  d             yt        |t"              r| j                   j                  }|j                  dz  d|j                   j$                  z  dz  z  |z  }|j                  dz  |z  }t	        j
                  |j&                  j                  |       t	        j
                  |j(                  j                  |       t	        j
                  |j*                  j                  |       t	        j
                  |j,                  j                  |       yt        |t.              r| j                   j                  }|j                   j0                  dz  d|j                   j$                  z  dz  z  |z  }d|j                   j0                  z  dz  |z  }t	        j
                  |j2                  j                  |       t	        j
                  |j4                  j                  |       yt        |t6              ro| j                   j                  }|j                   j0                  |j                   j8                  z  dz  dz  |z  }t	        j
                  |j:                  d|       yt        |t<        j>                        r?t	        j@                  |jB                         t	        jD                  |j                         yt        |t<        jF                        r,|jB                   t	        j@                  |jB                         yt        |tH              rod	|jJ                  t        j                  d
|jL                  dt        jN                        |jL                  z  z  z  }t	        j                  |jP                  |       yy)zInitialize the weightsr/   g      )meanstd)r   rL   )r&   rL   r   Nr2   r   ro   ))re   r3   
isinstancerd   initnormal_ru   	embed_dimrq   rr   r1   copy_position_idsrP   rQ   rp   rT   r   r#   r   r   r   r   rD   r    fc1fc2MLCDVisionTransformerr%   class_pos_embnn	LayerNormzeros_biasones_LinearrF   thetarN   r?   rR   )rY   modulefactorin_proj_stdout_proj_stdfc_stdpos_emb_stdrR   s           rB   _init_weightsz!MLCDPreTrainedModel._init_weights0  s#    //f23[[33FLL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh.[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B([[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**< 56[[33F!==448Y8YY]^^cggjppKLL--C[I-KK$JJv}}%		*v{{/FKK$ 34fllu||Avzz1TYT_T_/`cicmcm/mnoHJJv1 5rA   N)r4   r5   r6   r   r;   base_model_prefixsupports_gradient_checkpointingaccepts_loss_kwargs_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr   r   _can_record_outputsrP   no_gradr   r@   rA   rB   r   r   !  s[    &*#N"&)#
 U]]_!2 !2rA   r   c                   b     e Zd Zdef fdZ	 ddej                  dz  dee   de	e
z  fdZ xZS )	r   re   c                    t         |   |       t        |j                  |j                  z  dz        | _        t        j                  t        j                  d|j                  |j                  z  dz              | _
        y )Nr   r&   )rh   ri   rF   r    r%   vision_rotary_embeddingr   	ParameterrP   randnr   rk   s     rB   ri   zMLCDVisionTransformer.__init__V  sh     ':6;M;MQWQkQk;kop;p'q$\\%++a9K9KvOiOi9imn9n*oprA   Nrm   r   rI   c                    |t        d      |j                  d   | j                  j                  z  }|j                  d   | j                  j                  z  }| j	                  ||      }|j                  | j                  j                        }t        j                  | j                  |gd      }t        j                  ||fd      }|j                         |j                         f}| j                  |      }| j                  |      } | j                  d||d|}	|	d   }
|
d d dd d f   }| j                  |      }t!        |
|      S )	Nz You have to specify pixel_valuesrL   r   rM   )r   r   )r   pooler_outputr@   )
ValueErrorrp   re   r+   r   rs   r   rK   rP   rv   r   r   r{   pre_layrnormencoderpost_layernormr	   )rY   rm   r   rG   rH   r`   embr   r   encoder_outputsr   pooled_outputs               rB   ra   zMLCDVisionTransformer.forward[  sP   
 ?@@)//3t{{7M7MM(..r2dkk6L6LL556HJ[\'**4+=+=+D+DED$6$6#GQOii8bA"wwy#'')45))-8&$,, 
' 3
 
 ,A.)!Q'2++M:)/'
 	
rA   rg   )r4   r5   r6   r   ri   rP   r|   r   r   r=   r	   ra   r}   r~   s   @rB   r   r   U  sO    q/ q 26 
''$. 
 +, 
 
+	+	 
rA   r   c                   J    e Zd Z	 ddej                  dz  dee   deez  fdZ	y)MLCDVisionModelNrm   r   rI   c                 *     | j                   dd|i|S )a  
        Example:

        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> from transformers import AutoProcessor, MLCDVisionModel
        >>> model = MLCDVisionModel.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")
        >>> processor = AutoProcessor.from_pretrained("DeepGlint-AI/mlcd-vit-bigG-patch14-448")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.no_grad():
        ...     outputs = model(**inputs, output_attentions=True)

        >>> features = outputs.last_hidden_state
        >>> print(f"Extracted features shape: {features.shape}")
        >>> print(f"Number of attention layers: {len(outputs.attentions)}")
        >>> print(f"Attention shape: {outputs.attentions[0].shape}")
        ```rm   r@   )vision_model)rY   rm   r   s      rB   ra   zMLCDVisionModel.forward  s)    : !t   
%

 	
rA   rg   )
r4   r5   r6   rP   r|   r   r   r=   r	   ra   r@   rA   rB   r   r   ~  s?     26 
''$. 
 +, 
 
+	+	 
rA   r   )r   r   r   )3collections.abcr   rP   torch.nnr   huggingface_hub.dataclassesr    r   r   configuration_utilsr   modeling_outputsr   r	   modeling_utilsr
   r   processing_utilsr   utilsr   r   r   clip.modeling_clipr   r   r   r   r   r   r   llama.modeling_llamar   qwen2_vl.modeling_qwen2_vlr   r   
get_loggerr4   loggerr   rD   rF   rd   r   r   r   r   r   r   __all__r@   rA   rB   <module>r      s   %   . & 3 K F & @ @   ; [ 
		H	% CD#$' #$  E#$L	g 	/ D/ $9)M 9)x'' 'T.
+ .
b 02/ 02 02f&
1 &
R!
o !
HrA   