
    io                        d dl Z d dlmZ d dlmZ d dlZd dlmZ d dlm	Z	 d dlm
Z
mZmZ ddlmZ ddlmZ dd	lmZmZmZmZ dd
lmZ ddlmZ ddlmZmZmZ ddlm Z  ddl!m"Z"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)  ejT                  e+      Z, ed      e	 G d de                    Z-e G d de             Z.e G d de             Z/ G d dej`                        Z1e G d de)             Z2 ed       G d  d!e(             Z3 G d" d#e&      Z4e G d$ d%e2             Z5 ed&       G d' d(e2             Z6 ed)       G d* d+e2             Z7g d,Z8y)-    N)	dataclass)Literal)strict)BCEWithLogitsLossCrossEntropyLossMSELoss   )initialization)PreTrainedConfig)BaseModelOutputMaskedLMOutputSequenceClassifierOutputTokenClassifierOutput)PreTrainedModel)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuple   )CONFIG_MAPPING
AutoConfig	AutoModel)ModernBertPredictionHead)SmolVLMModelSmolVLMPreTrainedModelModernVBERT/modernvbert)
checkpointc                        e Zd ZU dZdZeedZdZee	z  dz  e
d<   dZee	z  dz  e
d<   dZee
d<   d	Zee
d
<   dZee
d<   dZee
d<   dZed   e
d<   dZeez  e
d<   dZee
d<   dZee
d<    fdZ xZS )ModernVBertConfiga?  
    pixel_shuffle_factor (`int | None`, *optional*, defaults to 4):
        Scale factor used by any pixel-shuffle / upsampling operations in the vision head.
    initializer_cutoff_factor (`float | None`, *optional*, defaults to 2.0):
        The cutoff factor for the truncated_normal_initializer for initializing all weight matrices.
    classifier_pooling (`Literal["cls", "mean"]`, *optional*, defaults to `"cls"`):
        The pooling strategy to use for classification tasks.
    classifier_bias (`bool | None`, *optional*, defaults to `False`):
        Whether to add a bias term to the classification head

    Example:
    ```python
    >>> from transformers import ModernVBertConfig

    >>> # Initializing configuration
    >>> configuration = ModernVBertConfig()

    >>> # Initializing a model from the configuration (model class is implemented in
    >>> # `modernvbert.modeling_modernvbert`)

    >>> from transformers import ModernVBertModel
    >>> model = ModernVBertModel(configuration)

    >>> # Accessing the model configuration
    >>> cfg = model.config
    ```modernvbert)text_configvision_configNr"   r#   i  image_token_id   pixel_shuffle_factorg{Gz?initializer_range       @initializer_cutoff_factorcls)r*   meanclassifier_pooling        classifier_dropoutFclassifier_biastie_word_embeddingsc                 |   | j                   t        d          | _         n7t        | j                   t              rt        d   di | j                   | _         | j                  t        d          | _        n7t        | j                  t              rt        d   di | j                  | _        t        |   di | y )N
modernbertsiglip_vision_model )r"   r   
isinstancedictr#   super__post_init__)selfkwargs	__class__s     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/modernvbert/modular_modernvbert.pyr8   zModernVBertConfig.__post_init__X   s    #-l;=D(($/-l;Od>N>NOD%!/0E!F!HD**D1!/0E!F!\I[I[!\D''    )__name__
__module____qualname____doc__
model_typer   sub_configsr"   r   r6   __annotations__r#   r$   intr&   r'   floatr)   r,   r   r.   r/   boolr0   r8   __classcell__r;   s   @r<   r    r    ,   s    6 J",zJK26K!D(4/648M#d*T18NC !#!#u#'*u*16.6&))!OT! %%( (r=   r    c                       e Zd ZU dZdZej                  ed<   dZe	ej                     dz  ed<   dZ
e	ej                     dz  ed<   dZe	ej                     dz  ed<   y)ModernVBertBaseModelOutputaY  
    Base class for ModernVBERT model's outputs.
    Args:
        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
            Sequence of hidden-states at the output of the last layer of the model.
            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
            hidden_size)` is output.
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder
    Nlast_hidden_statehidden_states
attentionsimage_hidden_states)r>   r?   r@   rA   rL   torchFloatTensorrD   rM   tuplerN   rO   r4   r=   r<   rK   rK   f   si    , ,0u((/59M5**+d2926Je''(4/6;?u001D8?r=   rK   c                       e Zd ZU dZdZej                  dz  ed<   dZej                  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   dZej                  dz  ed<   y)	ModernVBertMaskedLMOutputaG  
    Base class for ModernVBERT model's outputs with masked language modeling loss.
    Args:
        loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided):
            Masked language modeling (MLM) loss.
        logits (`torch.FloatTensor`):
            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
            Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
            one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
            sequence_length)`.
            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
            heads.
        image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
            Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
            sequence_length, hidden_size)`.
            image_hidden_states of the model produced by the vision encoder
    Nlosslogits.rM   rN   rO   )r>   r?   r@   rA   rU   rP   rQ   rD   rV   rM   rR   rN   rO   r4   r=   r<   rT   rT      s    , &*D%

d
") $FE$:>M5**C/047>7;Je'',-4;48**T18r=   rT   c                   .     e Zd ZdZ fdZd Zd Z xZS )ModernVBertConnectorz
    Connector module for ModernVBERT. It performs a pixel shuffle operation followed by a linear projection to match the text model's hidden size.
    Based on https://pytorch.org/docs/stable/generated/torch.nn.PixelShuffle.html
    c                     t         |           |j                  | _        t        j                  |j
                  j                  |j                  dz  z  |j                  j                  d      | _        y )Nr   Fbias)	r7   __init__r&   nnLinearr#   hidden_sizer"   modality_projectionr9   configr;   s     r<   r\   zModernVBertConnector.__init__   s^    $*$?$?!#%99  ,,0K0KQ0NO**$
 r=   c                    |j                         \  }}}t        |dz        x}}|j                  ||||      }|j                  ||t        ||z        ||z        }|j                  dddd      }|j	                  |t        ||z        t        ||z        ||dz  z        }|j                  dddd      }|j	                  |t        ||dz  z        ||dz  z        S )Ng      ?r   r      r	   )sizerE   viewpermutereshape)r9   rO   r&   
batch_size
seq_length	embed_dimheightwidths           r<   pixel_shufflez"ModernVBertConnector.pixel_shuffle   s   ,?,D,D,F)
J	Z_--166z65R[\166E,@$@ A9OcCc
 299!Q1E199,,---.-q01	
 299!Q1E"**J*>*ABCYRfhiRiEj
 	
r=   c                 \    | j                  || j                        }| j                  |      S N)rn   r&   r`   )r9   rO   s     r<   forwardzModernVBertConnector.forward   s.    "001DdF_F_`''(;<<r=   )r>   r?   r@   rA   r\   rn   rq   rH   rI   s   @r<   rX   rX      s    


&=r=   rX   c                   B    e Zd ZeZg Z ej                         d        Zy)ModernVBertPreTrainedModelc                     t        j                   |       dt        j                  dt        f fd}t        |t              ra j                  j                  t        j                  d j                  j                  j                  z        z  } ||j                  |       y t        |t              ra j                  j                  t        j                  d j                  j                  j                  z        z  } ||j                  |       y t        |t         t"        f      r^ j                  j                  t        j                   j                  j                  j$                        z  } ||j&                  |       y y )Nmodulestdc                 8   t        j                  dd      }t        j                  | j                  d|| |z  ||z         t        | t        j                  t        j                  f      r-| j                   t        j                  | j                         y y y )Nr)   r(   r-   )r+   rv   ab)getattrrb   inittrunc_normal_weightr5   r]   r^   Conv2dr[   zeros_)ru   rv   cutoff_factorr9   s      r<   init_weightz=ModernVBertPreTrainedModel._init_weights.<locals>.init_weight   s    #DKK1LcRM .3&#% &299bii"89;;*KK, + :r=   r(   )r   _init_weightsr]   ModulerF   r5   rX   rb   r'   mathsqrtr"   num_hidden_layersr`   ModernVBertForMaskedLMlm_head$ModernVBertForSequenceClassification!ModernVBertForTokenClassificationr_   
classifier)r9   ru   r   out_stdfinal_out_stds   `    r<   r   z(ModernVBertPreTrainedModel._init_weights   s   %%dF3	-		 	- 	- f23kk33diidkkF]F]FoFo@o6ppG22G< 67kk33diidkkF]F]FoFo@o6ppG041
 !KK99DIIdkkF]F]FiFi<jjM))=9
r=   N)	r>   r?   r@   r    config_class_no_split_modulesrP   no_gradr   r4   r=   r<   rs   rs      s'    $LU]]_: :r=   rs   aF  
    ModernVBertModel is a model that combines a vision encoder (SigLIP) and a text encoder (ModernBert).

    ModernVBert is the base model of the visual retriver ColModernVBert, and was introduced in the following paper:
    [*ModernVBERT: Towards Smaller Visual Document Retrievers*](https://arxiv.org/abs/2510.01149).
    )custom_introc                   @    e Zd Zdef fdZe edd      	 	 	 	 	 	 	 ddej                  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dee   deez  fd              Z xZS )ModernVBertModelrb   c                    t         |   |       t        |      | _        t	        j
                  |j                        | _        t	        j
                  |j                        | _	        t        |j                  j                  |j                  j                  z  dz  |j                  dz  z        | _        | j                          y )Nr   )r7   r\   rX   	connectorr   from_configr"   
text_modelr#   vision_modelrE   
image_size
patch_sizer&   image_seq_len	post_initra   s     r<   r\   zModernVBertModel.__init__   s      .f5#//0B0BC%11&2F2FG ""--1E1E1P1PPUVV**A-/
 	r=     
        Inputs fed to the model can have an arbitrary number of images. To account for this, pixel_values fed to
        the model have image padding -> (batch_size, max_num_images, 3, max_heights, max_widths) where
        max_num_images is the maximum number of images among the batch_size samples in the batch.
        Padding images are not needed beyond padding the pixel_values at the entrance of the model.
        For efficiency, we only pass through the vision_model's forward the real images by
        discarding the padding images i.e. pixel_values of size (image_batch_size, 3, height, width) where
        image_batch_size would be 7 when num_images_per_sample=[1, 3, 1, 2] and max_num_images would be 3.
        r   r   r   N	input_idsattention_maskposition_idsinputs_embedspixel_valuespixel_attention_maskrO   r:   returnc                    |9 | j                   j                         |      j                  |j                        }|| j	                  ||      j
                  }|;|j                  |j                  |j                        }| j                  |||      } | j                   d|||d|}	t        |	j                  |	j                  |	j                  |      S )a|  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The hidden states of the image encoder after modality projection.
        )r   r   )dtypedevice)r   r   rO   )r   r   r   )rL   rM   rN   rO   r4   )r   get_input_embeddingstor   get_image_featurespooler_outputr   inputs_mergerrK   rL   rM   rN   )
r9   r   r   r   r   r   r   rO   r:   outputss
             r<   rq   zModernVBertModel.forward  s    >  BDOO@@B9MPPQZQaQabM #"&"9"9)@T #: #m  
 *"5"8"8}?R?R[h[o[o"8"p ..#=Vi / M
 "$// 
')%
 	
 *%77!//)) 3	
 	
r=   )NNNNNNN)r>   r?   r@   r    r\   r   r   rP   
LongTensorTensorrQ   
BoolTensorr   r   rR   rK   rq   rH   rI   s   @r<   r   r      s    0    - '+.20426158<8</
##/
 t+/
 &&-	/

 ((4//
 ''$./
 $..5/
 #..5/
 +,/
 
+	+/
 /
r=   r   c                       e Zd Zy)ModernVBertPredictionHeadN)r>   r?   r@   r4   r=   r<   r   r   J  s    r=   r   c                   t    e Zd ZddiZdef fdZd Zd Ze e	dd	      	 	 	 	 	 	 	 	 dde
j                  de
j                  d
z  de
j                  d
z  de
j                  d
z  de
j                  d
z  de
j                  d
z  de
j                  d
z  de
j                  d
z  dee   deez  fd              Z xZS )r   zlm_head.weightz1model.text_model.embeddings.tok_embeddings.weightrb   c                 l   t         |   |       |j                  j                  | _        t	        |      | _        t        |j                        | _        t        j                  |j                  j                  | j                  |j                  j                        | _        | j                          y )NrZ   )r7   r\   r"   
vocab_sizer   modelr   projection_headr]   r^   r_   decoder_biasr   r   ra   s     r<   r\   zModernVBertForMaskedLM.__init__R  s      ,,77%f-
89K9KLyy!3!3!?!?W]WiWiWvWvw 	r=   c                     | j                   S rp   r   )r9   s    r<   get_output_embeddingsz,ModernVBertForMaskedLM.get_output_embeddings^  s    ||r=   c                     || _         y rp   r   )r9   new_embeddingss     r<   set_output_embeddingsz,ModernVBertForMaskedLM.set_output_embeddingsa  s	    %r=   r   r   r   Nr   r   r   r   r   r   rO   labelsr:   r   c	                 \    | j                   d|||||||d|	}
|
d   }| j                  | j                  |            }d}|<t               } ||j	                  d| j
                        |j	                  d            }t        |||
j                  |
j                  |
j                        S )  
        pixel_attention_mask (`torch.Tensor` of shape `(batch_size, image_size, image_size)`, *optional*):
            Mask to avoid performing attention on padding pixel indices.
        image_hidden_states (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)`):
            The hidden states of the image encoder after modality projection.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            text_config.]` or `model.image_token_id`. Tokens with indices set to `model.image_token_id` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., text_config.]`.
        r   r   r   r   r   r   rO   r   N)rU   rV   rM   rN   rO   r4   )
r   r   r   r   rf   r   rT   rM   rN   rO   )r9   r   r   r   r   r   r   rO   r   r:   r   rM   rV   rU   	criterions                  r<   rq   zModernVBertForMaskedLM.forwardd  s    H $** 	
)%'%!5 3	
 	
  
d22=AB(*IV[[T__=v{{2OD(!//)) ' ; ;
 	
r=   NNNNNNNN)r>   r?   r@   _tied_weights_keysr    r\   r   r   r   r   rP   r   r   rQ   r   r   r   rR   rT   rq   rH   rI   s   @r<   r   r   N  s*   *,_`
0 
&  - '+.20426158<8<*.0
##0
 t+0
 &&-	0

 ((4/0
 ''$.0
 $..50
 #..50
   4'0
 +,0
 
*	*0
 0
r=   r   za
    The ModernVBert Model with a sequence classification head on top that performs pooling.
    c                   `    e Zd Zdef fdZe edd      	 	 	 	 	 	 	 	 ddej                  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dee   deez  fd              Z xZS )r   rb   c                    t         |   |       |j                  | _        || _        t	        |      | _        t        |j                        | _        t        j                  |j                        | _        t        j                  |j                  j                  |j                        | _        | j!                          y rp   )r7   r\   
num_labelsrb   r   r   r   r"   headr]   Dropoutr.   dropr^   r_   r   r   ra   s     r<   r\   z-ModernVBertForSequenceClassification.__init__  s      ++%f-
-f.@.@A	JJv889	))F$6$6$B$BFDUDUV 	r=   r   r   r   Nr   r   r   r   r   r   rO   r   r:   r   c	                     | j                   d|||||||d|	}
|
d   }| j                  j                  dk(  r
|dddf   }n| j                  j                  dk(  r||j                  dd \  }}n|j                  dd \  }}||j                  n|j                  }|(t        j                  ||f|t
        j                        }||j                  d      z  j                  d	
      |j                  d	d      z  }| j                  |      }| j                  |      }| j                  |      }d}|| j                  j                  | j                  d	k(  rd| j                  _        nl| j                  d	kD  rL|j                  t
        j                   k(  s|j                  t
        j"                  k(  rd| j                  _        nd| j                  _        | j                  j                  dk(  rIt%               }| j                  d	k(  r& ||j'                         |j'                               }n |||      }n| j                  j                  dk(  r=t)               } ||j+                  d| j                        |j+                  d            }n,| j                  j                  dk(  rt-               } |||      }t/        |||
j0                  |
j2                        S )r   r   r   r*   Nr+   r   )r   r   r   rd   )dimT)r   keepdim
regressionsingle_label_classificationmulti_label_classificationrU   rV   rM   rN   r4   )r   rb   r,   shaper   rP   onesrG   	unsqueezesumr   r   r   problem_typer   r   longrE   r   squeezer   rf   r   r   rM   rN   )r9   r   r   r   r   r   r   rO   r   r:   r   rL   ri   seq_lenr   pooled_outputrV   rU   loss_fcts                      r<   rq   z,ModernVBertForSequenceClassification.forward  s   F $** 	
)%'%!5 3	
 	
 $AJ;;))U2 1!Q$ 7[[++v5(&3&9&9"1&=#
G&/oobq&9#
G)2)>Y%%MDXDXF%!&Z,A&X]XbXb!c!2^5M5Mb5Q!Q V V[\ V ]`n`r`rt as a ! 		"34		-0/{{''/??a'/;DKK,__q(fllejj.HFLL\a\e\eLe/LDKK,/KDKK,{{''<7"9??a'#FNN$4fnn6FGD#FF3D))-JJ+-B @&++b/R))-II,./'!//))	
 	
r=   r   )r>   r?   r@   r    r\   r   r   rP   r   r   rQ   r   r   r   rR   r   rq   rH   rI   s   @r<   r   r     s    0   - '+.20426158<8<*.Q
##Q
 t+Q
 &&-	Q

 ((4/Q
 ''$.Q
 $..5Q
 #..5Q
   4'Q
 +,Q
 
)	)Q
 Q
r=   r   zw
    The ModernVBert Model with a token classification head on top, e.g. for Named Entity Recognition (NER) tasks.
    c                   `    e Zd Zdef fdZe edd      	 	 	 	 	 	 	 	 ddej                  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dee   deez  fd              Z xZS )r   rb   c                 t   t         |   |       |j                  | _        t        |      | _        t        |j                        | _        t        j                  |j                        | _        t        j                  |j                  j                  |j                        | _        | j                          y rp   )r7   r\   r   r   r   r   r"   r   r]   r   r.   r   r^   r_   r   r   ra   s     r<   r\   z*ModernVBertForTokenClassification.__init__  s      ++%f-
-f.@.@A	JJv889	))F$6$6$B$BFDUDUV 	r=   r   r   r   Nr   r   r   r   r   r   rO   r   r:   r   c	                 l    | j                   d|||||||d|	}
|
d   }| j                  |      }| j                  |      }| j                  |      }d}|<t	               } ||j                  d| j                        |j                  d            }t        |||
j                  |
j                        S )r   r   r   Nr   r   r4   )
r   r   r   r   r   rf   r   r   rM   rN   )r9   r   r   r   r   r   r   rO   r   r:   r   rL   rV   rU   r   s                  r<   rq   z)ModernVBertForTokenClassification.forward*  s    H $** 	
)%'%!5 3	
 	
 $AJ II&78 II&78!23')HFKKDOO<fkk"oND$!//))	
 	
r=   r   )r>   r?   r@   r    r\   r   r   rP   r   r   rQ   r   r   r   rR   r   rq   rH   rI   s   @r<   r   r     s   
0 
  - '+.20426158<8<*.1
##1
 t+1
 &&-	1

 ((4/1
 ''$.1
 $..51
 #..51
   4'1
 +,1
 
&	&1
 1
r=   r   )r    rs   r   r   r   r   )9r   dataclassesr   typingr   rP   torch.nnr]   huggingface_hub.dataclassesr   r   r   r    r
   r{   configuration_utilsr   modeling_outputsr   r   r   r   modeling_utilsr   processing_utilsr   utilsr   r   r   utils.genericr   autor   r   r   modernbert.modeling_modernbertr   smolvlm.modeling_smolvlmr   r   
get_loggerr>   loggerr    rK   rT   r   rX   rs   r   r   r   r   r   __all__r4   r=   r<   <module>r      s    !    . A A & 3  . & @ @ - 8 8 E K 
		H	% 455(( 5(  65(p @ @ @: 9 9 9<$=299 $=N $:!7 $: $:N M
| M
M
`	 8 	 R
7 R
 R
j 
l
+E l

l
^ 
K
(B K

K
\r=   