
    iA                        d Z ddlZddlmZ ddlmZ ddlZddl	Z	ddl	m
Z
 ddlmZ ddlmZ dd	lmZ dd
lmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZm Z m!Z! ddl"m#Z# ddl$m%Z% ddl&m'Z'm(Z(m)Z)  e jT                  e+      Z,de	jZ                  de	jZ                  fdZ.de	jZ                  de	jZ                  fdZ/de	jZ                  de0fdZ1dHde	jZ                  de2de3de0de	jZ                  f
dZ4dIdZ5d Z6 G d d e
jn                        Z8 G d! d"e
jn                        Z9 G d# d$e
jn                        Z:ee G d% d&e                    Z; G d' d(e
jn                        Z< G d) d*e
jn                        Z= G d+ d,e
jn                        Z> G d- d.e
jn                        Z? G d/ d0e
jn                        Z@ G d1 d2e@      ZA G d3 d4e
jn                        ZB G d5 d6e      ZCe G d7 d8e             ZD G d9 d:e
jn                        ZE G d; d<e
jn                        ZF G d= d>eD      ZG G d? d@eD      ZH G dA dBe
jn                        ZI G dC dDeD      ZJe G dE dFeD             ZKg dGZLy)JzPyTorch GroupViT model.    N)	dataclass)Any)nn   )initialization)ACT2FN)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)merge_with_config_defaults)capture_outputs   )GroupViTConfigGroupViTTextConfigGroupViTVisionConfiglogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)r   
functionalcross_entropytorcharangelenr   )r   s    /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/groupvit/modeling_groupvit.pycontrastive_lossr&   *   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r&   t)r(   caption_loss
image_losss      r%   groupvit_lossr-   /   s,    #J/L!*,,.1J:%,,r'   dimc                     | j                  |      }|j                  |d      d   }t        j                  | t        j                        j                  ||d      }||j                         z
  |z   }|S )NTkeepdimr   memory_format      ?)softmaxmaxr"   
zeros_likelegacy_contiguous_formatscatter_detach)r   r.   y_softindexy_hardrets         r%   hard_softmaxr?   5   sk    ^^C FJJsDJ)!,EfE4R4RS\\]`bgilmF
6==?
"V
+CJr'   tauhardc                 :   t         j                  j                  j                  t        j                  d| j
                  | j                        t        j                  d| j
                  | j                              }|j                  | j                        }| |z   |z  }|j                  |      }|rd|j                  |d      d   }t        j                  | t         j                        j                  ||d      }||j                         z
  |z   }	|	S |}	|	S )N        )r   dtyper4   Tr0   r   r2   )r"   distributionsgumbelGumbeltensorr   rD   sampleshaper5   r6   r7   r8   r9   r:   )
r   r@   rA   r.   gumbel_distgumbelsr;   r<   r=   r>   s
             r%   gumbel_softmaxrM   ?   s    %%,,33SfllCSfllCK   .G3&G__S!F

3
-a0!!&8V8VW``adfkmpqv}}&/ J Jr'   c                    ||z  | j                   d   z  dz  }||kD  r4t        t        j                  ||z              }| j                   d   |z  }n3t        t        j                  ||z              }| j                   d   |z  }| j                   d   }| j                   d   }| j	                  ||||      } t
        j                  j                  | ||fd|      } | S )a  
    Args:
        attentions (`torch.Tensor`): attention map of shape [batch_size, groups, feat_height*feat_width]
        height (`int`): height of the output attention map
        width (`int`): width of the output attention map
        align_corners (`bool`, *optional*): the `align_corner` argument for `nn.functional.interpolate`.

    Returns:
        `torch.Tensor`: resized attention map of shape [batch_size, groups, height, width]
             ?r   r   bilinearsizemodealign_corners)rJ   intnproundreshaper   r    interpolate)	
attentionsheightwidthrU   scale
feat_widthfeat_height
batch_sizegroupss	            r%   resize_attention_maprc   U   s     e^z//22s:E~%%-01
 &&q)Z7"((6E>23%%a(K7
!!!$Ja F##JZPJ**&%z + J r'   c           	      H   g }t        j                         5  d}| D ]i  }|j                  ddd      j                         }||}n||z  }t	        |j                  ddd      j                         g| }|j                  |       k 	 ddd       |d   }|S # 1 sw Y   xY w)a1  
    Args:
        attentions (`tuple(torch.FloatTensor)`: tuple of attention maps returned by `GroupViTVisionTransformer`
        hw_shape (`tuple(int)`): height and width of the output attention map
    Returns:
        `torch.Tensor`: the attention map of shape [batch_size, groups, height, width]
    Nr   rO   r   )r"   no_gradpermute
contiguousrc   append)r[   hw_shape	attn_mapsprev_attn_masks
attn_maskscur_attn_mapfinal_groupings          r%   get_grouping_from_attentionsrp   s   s     I	 +$ 		+J#++Aq!4??AJ&","1J">/0G0G1a0P0[0[0]i`hiL\*		++ r]N!+ +s   A1BB!c                   *     e Zd Zdef fdZd Z xZS )GroupViTCrossAttentionLayerconfigc                 "   t         |           t        |      | _        t	        j
                  |j                  |j                        | _        t        |      | _
        t	        j
                  |j                  |j                        | _        y Neps)super__init__GroupViTAttentionattnr   	LayerNormhidden_sizelayer_norm_epsnorm2GroupViTMLPmlp	norm_postselfrs   	__class__s     r%   ry   z$GroupViTCrossAttentionLayer.__init__   sb    %f-	\\&"4"4&:O:OP
v&f&8&8f>S>STr'   c                     |}|| j                  ||      d   z   }|| j                  | j                  |            z   }| j                  |      }|S )N)encoder_hidden_statesr   )r{   r   r   r   )r   querykeyxs       r%   forwardz#GroupViTCrossAttentionLayer.forward   sQ    		%s	;A>>A''NN1r'   )__name__
__module____qualname__r   ry   r   __classcell__r   s   @r%   rr   rr      s    U3 Ur'   rr   c                   2     e Zd Zdef fdZddZd Z xZS )GroupViTAssignAttentionrs   c                    t         |           |j                  dz  | _        t	        j
                  |j                  |j                        | _        t	        j
                  |j                  |j                        | _        t	        j
                  |j                  |j                        | _        t	        j
                  |j                  |j                        | _	        |j                  | _
        y )N      )rx   ry   r}   r^   r   Linearq_projk_projv_projproj
assign_epsr   s     r%   ry   z GroupViTAssignAttention.__init__   s    ''-
ii 2 2F4F4FGii 2 2F4F4FGii 2 2F4F4FGIIf00&2D2DE	 ++r'   c                     |r| j                   rt        |d|      }|S |rt        |d      }|S t        j                  j                  |d      }|S )N)r.   rA   r.   )trainingrM   r?   r   r    r5   )r   r{   rF   rA   s       r%   get_attnz GroupViTAssignAttention.get_attn   sX    dmm!$BT:D  #Db1  }},,Tr,:r'   c                 t   |}| j                  |      }| j                  |      }| j                  |      }||j                  dd      z  | j                  z  }| j                  |      }| j                  |dd      }||j                  dd      | j                  z   z  }||z  }| j                  |      }||fS )Nr   re   F)rF   rA   Tr.   r1   )	r   r   r   	transposer^   r   sumr   r   )r   r   r   valueraw_attnr{   	soft_attnouts           r%   r   zGroupViTAssignAttention.forward   s    E" kk# E" CMM"b11TZZ?}}X&MM(5uME	txxBx5GHUliinI~r'   )TT)r   r   r   r   ry   r   r   r   r   s   @r%   r   r      s    ,3 ,	r'   r   c                   0     e Zd Zdef fdZd Zd Z xZS )GroupViTTokenAssignrs   c                 d   t         |           || _        t        j                  |j
                  |j                        | _        t        |j                  t        j                  j                        r|j                  n|j                  |j                  f}|D cg c]  }t        ||j
                  z         c}\  }}t        ||||      | _        t        j                  |j
                  |j                        | _        t        j                  |j
                  |j                        | _        t%        |      | _        t)        |      | _        t        j                  |j
                  |j                        | _        t/        ||j
                  ||j
                        | _        y c c}w ru   )rx   ry   num_output_groupr   r|   r}   r~   norm_tokens
isinstanceassign_mlp_ratiocollectionsabcIterablerV   GroupViTMixerMLP	mlp_internorm_post_tokensnorm_xrr   pre_assign_attnr   assign
norm_new_xr   mlp_channels)	r   rs   num_group_tokenr   r   r   
tokens_dimchannels_dimr   s	           r%   ry   zGroupViTTokenAssign.__init__   sK    0<<(:(:@U@UV &11;??3K3KL ##))6+B+BC 	
 JZ#ZACF,>,>(>$?#Z 
L)&/:O_` "V-?-?VEZEZ [ll6#5#56;P;PQ:6B-f5,,v'9'9v?T?TU'0B0BLRXRdRde $[s   F-c                 J    | j                  |      }| j                  |      }|S )z
        Args:
            group_tokens (torch.Tensor): group tokens, [batch_size, num_group_tokens, channels]

        Returns:
            projected_group_tokens (torch.Tensor): [batch_size, num_output_groups, channels]
        )r   r   )r   group_tokensprojected_group_tokenss      r%   project_group_tokenz'GroupViTTokenAssign.project_group_token   s+     "&!=!%!6!67M!N%%r'   c                    | j                  |      }| j                  |      }| j                  |      }| j                  ||      }| j	                  ||      \  }}||z  }|| j                  | j                  |            z   }||fS )z
        Args:
            image_tokens (`torch.Tensor`): image tokens, of shape [batch_size, input_length, channels]
            group_tokens (`torch.Tensor`): group tokens, [batch_size, num_group_tokens, channels]
        )r   r   r   r   r   r   r   )r   image_tokensr   r   new_image_tokens	attentions         r%   r   zGroupViTTokenAssign.forward   s     ''5{{<0!%!9!9,!G!%!5!56Ll![&*kk2H,&W#)22+d.?.?P`@a.bb**r'   )r   r   r   r   ry   r   r   r   r   s   @r%   r   r      s    f3 f*&+r'   r   c                   0   e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dZeed	<   dZeed
<   dee   fdZy)GroupViTModelOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    segmentation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels, logits_height, logits_width)`):
        Classification scores for each pixel.

        <Tip warning={true}>

        The logits returned do not necessarily have the same size as the `pixel_values` passed as inputs. This is
        to avoid doing two interpolations and lose some quality when a user needs to resize the logits to the
        original image size as post-processing. You should always check your logits shape and resize as needed.

        </Tip>
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of
        [`GroupViTTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of
        [`GroupViTVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`GroupViTTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`GroupViTVisionModel`].
    Nlosslogits_per_imagelogits_per_textsegmentation_logitstext_embedsimage_embedstext_model_outputvision_model_outputr   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw))r   r   N)getattrto_tuple).0kr   s     r%   	<genexpr>z/GroupViTModelOutput.to_tuple.<locals>.<genexpr>3  s=      
  LLDGRYZ^`aRbRkRkRmm
s   -0)tuplekeysr   s   `r%   r   zGroupViTModelOutput.to_tuple2  s#     
YY[
 
 	
r'   )r   r   r   __doc__r   r"   FloatTensor__annotations__r   r   r   r   r   r   r   r   r   r   r    r'   r%   r   r     s    > &*D%

d
")15e''$.504OU&&-448**T18,0K""T)0-1L%##d*148186:3:
%* 
r'   r   c            	            e Zd ZdZ	 	 	 	 ddeee   z  eeef   z  deeeef   z  dedef fdZddej                  de
d	ej                  fd
Z xZS )GroupViTPatchEmbeddingsz#
    Image to Patch Embedding.
    
image_size
patch_sizenum_channels	embed_dimc                 ^   t         |           t        |t        j                  j
                        r|n||f}t        |t        j                  j
                        r|n||f}|d   |d   z  |d   |d   z  z  }|| _        || _        || _        t        j                  ||||      | _        y )Nr   r   )kernel_sizestride)rx   ry   r   r   r   r   r   r   num_patchesr   Conv2d
projection)r   r   r   r   r   r   r   s         r%   ry   z GroupViTPatchEmbeddings.__init__>  s     	#-j+//:R:R#SZZdfpYq
#-j+//:R:R#SZZdfpYq
!!}
15*Q-:VW=:XY$$&))L)\fgr'   pixel_valuesinterpolate_pos_encodingr   c                 8   |j                   \  }}}}|sV|| j                  d   k7  s|| j                  d   k7  r2t        d| d| d| j                  d    d| j                  d    d	      | j                  |      j	                  d      j                  dd      }|S )Nr   r   zInput image size (*z) doesn't match model ().rO   )rJ   r   
ValueErrorr   flattenr   )r   r   r   ra   r   r\   r]   r   s           r%   r   zGroupViTPatchEmbeddings.forwardO  s    2>2D2D/
L&%'++u8J/J (% 9+,Adooa.@-AE  OOL)11!4>>q!Dr'   )      r   i   F)r   r   r   r   rV   listr   ry   r"   Tensorboolr   r   r   s   @r%   r   r   9  s     9<,.h$s)OeCHo5h %S/)h 	h
 h"	ELL 	D 	]b]i]i 	r'   r   c                        e Zd Zdef fdZdej                  dededej                  fdZddej                  d	e	dej                  fd
Z
 xZS )GroupViTVisionEmbeddingsrs   c                    t         |           t        |j                  |j                  |j
                  |j                        | _        | j                  j                  }t        j                  t        j                  d||j                              | _        t        j                  |j                        | _        t        j                   |j                  |j"                        | _        |j                  | _        || _        y )N)r   r   r   r   r   rv   )rx   ry   r   r   r   r   r}   patch_embeddingsr   r   	Parameterr"   zerosposition_embeddingsDropoutdropoutr|   r~   	layernormrs   )r   rs   r   r   s      r%   ry   z!GroupViTVisionEmbeddings.__init__\  s     7((((,,((	!
 ++77#%<<A{FL^L^0_#` zz&..1f&8&8f>S>ST ++r'   
embeddingsr\   r]   r   c                 0   |j                   d   }| j                  j                   d   }t        j                  j	                         s||k(  r||k(  r| j                  S | j                  }|j                   d   }|| j
                  z  }|| j
                  z  }	t        |dz        }
|j                  d|
|
|      }|j                  dddd      }t        j                  j                  |||	fdd	      }|j                  dddd      j                  dd|      }|S )
a  
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing and no class embeddings.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   re   rP   r   r   rO   bicubicFrR   )rJ   r   r"   jit
is_tracingr   r   rY   rg   r   r    rZ   view)r   r   r\   r]   r   num_positionspatch_pos_embedr.   
new_height	new_widthsqrt_num_positionss              r%   r   z1GroupViTVisionEmbeddings.interpolate_pos_encodingl  s#    !&&q)0066q9 yy##%+*F6UZ?+++22r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nr'   r   r   c                 
   |j                   \  }}}}| j                  ||      }| j                  |      }|j                         \  }}}	|r|| j	                  |||      z   }n|| j
                  z   }| j                  |      }|S )N)r   )rJ   r   r   rS   r   r   r   )
r   r   r   ra   r   r\   r]   r   seq_len_s
             r%   r   z GroupViTVisionEmbeddings.forward  s    2>2D2D/
L&%**<Rj*k
^^J/
!+!2
GQ $#d&C&CJPVX]&^^J#d&>&>>J\\*-
r'   r   )r   r   r   r   ry   r"   r   rV   r   r   r   r   r   s   @r%   r   r   [  sc    3  $5<< $ $UX $]b]i]i $LELL D ]b]i]i r'   r   c            	            e Zd Zdef fdZ	 	 	 d	dej                  dz  dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )
GroupViTTextEmbeddingsrs   c                 N   t         |           |j                  }t        j                  |j
                  |      | _        t        j                  |j                  |      | _        | j                  dt        j                  |j                        j                  d      d       y )Nposition_idsr   re   F)
persistent)rx   ry   r}   r   	Embedding
vocab_sizetoken_embeddingmax_position_embeddingsposition_embeddingregister_bufferr"   r#   expandr   rs   r   r   s      r%   ry   zGroupViTTextEmbeddings.__init__  s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r'   N	input_idsr  inputs_embedsr   c                 8   ||j                   d   n|j                   d   }| j                  j                  j                   d   }||kD  rt        d| d|       || j                  d d d |f   }|| j                  |      }| j                  |      }||z   }|S )Nre   r   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )rJ   r  weightr   r  r  )r   r  r  r  
seq_lengthmax_position_embeddingr   r   s           r%   r   zGroupViTTextEmbeddings.forward  s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H..d,<=S<TV 
 ,,Q^<L  00;M"55lC"%88
r'   NNN)r   r   r   r   ry   r"   
LongTensorr   r   r   r   r   s   @r%   r  r    sk    

1 

 .20426	##d* &&- ((4/	
 
r'   r  c            
           e Zd ZdZdededededef
 fdZed        Zd	 Z	dde
j                  de
j                  d
z  de
j                  fdZ	 	 dde
j                  de
j                  d
z  ded
z  dee
j                     fdZ xZS )GroupViTStagezMThis corresponds to the `GroupingLayer` class in the GroupViT implementation.rs   depthnum_prev_group_tokenr   r   c           	      f   t         |           || _        || _        |dkD  r:t	        j
                  t        j                  d||j                              | _	        nd | _	        t	        j                  t        |      D cg c]  }t        |       c}      | _        |dkD  rt        |||      | _        nd | _        |dkD  rc|dkD  r^t	        j                   t	        j"                  |j                  |j$                        t'        |||j                  dz  |            | _        y d | _        y c c}w )Nr   r   )rs   r   r   rv   rO   )rx   ry   r&  r   r   r   r"   r   r}   group_token
ModuleListrangeGroupViTEncoderLayerlayersr   
downsample
Sequentialr|   r~   r   group_projector)r   rs   r&  r'  r   r   r  r   s          r%   ry   zGroupViTStage.__init__  s    	
.Q!||EKK?FL^L^,_`D#Dmm5QV<$Xa%9&%A$XYQ1 /!1DO #DO!#!(;#%==V//V5J5JK )=v?Q?QUV?VXgh$D 
 $(D # %Ys    D.c                     | j                   d uS N)r)  r   s    r%   with_group_tokenzGroupViTStage.with_group_token  s    t++r'   c                 z    | j                   r,|d d d | j                   f   |d d | j                   d f   fS |d fS r2  )r3  r   )r   r   s     r%   split_xzGroupViTStage.split_x  sN      Q/4/////0!A8L8L7L7N4N2OOOd7Nr'   Nr   r)  r   c                 <    ||S t        j                  ||gd      S )Nr   r   )r"   cat)r   r   r)  s      r%   concat_xzGroupViTStage.concat_x  s#    Hyy![)q11r'   hidden_statesprev_group_tokenoutput_attentionsc                    | j                   rM| j                  j                  |j                  d      dd      }| j                  || j	                  |      z   }nd}|}| j                  ||      }| j                  D ]  } ||d      } | j                  |      \  }}d}| j                  | j                  ||      \  }}||f}	|r|	|fz   }	|	S )a  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the grouping tensors of Grouping block.
        r   re   N)attention_mask)	r3  r)  r  rS   r0  r8  r-  r5  r.  )
r   r9  r:  r;  r)  r   cat_xlayerr   outputss
             r%   r   zGroupViTStage.forward  s       **11-2D2DQ2GRPK##/)D,@,@AQ,RRKa-[[ 	6E%5E	6 e,;	??&??1k:LAyk",Gr'   r2  NF)r   r   r   r   r   rV   ry   propertyr3  r5  r"   r   r8  r   r   r   r   r   r   s   @r%   r%  r%    s    W ($ (  ( "	 (
  (  (D , ,2%,, 2U\\D5H 2TYT`T` 2 15).	&||&  ,,-&  $;	&
 
u  	!&r'   r%  c            
            e Zd Z	 	 	 d
dededz  dedz  dedz  f fdZdej                  dej                  fd	Z xZ	S )r   Nrs   r}   intermediate_sizeoutput_sizec                    t         |           || _        t        |j                     | _        ||n|j                  }||n|j                  }||n|}t        j                  ||      | _
        t        j                  ||      | _        y r2  )rx   ry   rs   r   
hidden_actactivation_fnr}   rD  r   r   fc1fc2)r   rs   r}   rD  rE  r   s        r%   ry   zGroupViTMLP.__init__,  s     	#F$5$56%0%<k&BTBT1B1N-TZTlTl%0%<k+99[*;<99.<r'   r9  r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r2  )rI  rH  rJ  )r   r9  s     r%   r   zGroupViTMLP.forward<  s4    /**=9/r'   r"  )
r   r   r   r   rV   ry   r"   r   r   r   r   s   @r%   r   r   +  s`     #'(,"&=$= 4Z= :	=
 4Z= U\\ ell r'   r   c                        e Zd Z fdZ xZS )r   c                 f    t         |   |j                  dd            }|j                  dd      S Nr   rO   )rx   r   r   )r   r   r   s     r%   r   zGroupViTMixerMLP.forwardD  s-    GOAKK1-.{{1a  r'   )r   r   r   r   r   r   s   @r%   r   r   C  s    ! !r'   r   c                        e Zd ZdZ fdZdej                  dedefdZ	 	 ddej                  d	ej                  dz  d
ej                  dz  de
ej                  ej                  dz  f   fdZ xZS )rz   z=Multi-headed attention from 'Attention Is All You Need' paperc                 
   t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r   r   )rx   ry   rs   r}   r   num_attention_heads	num_headshead_dimr   r^   attention_dropoutr   r   r   r   r   r   out_projr   s     r%   ry   zGroupViTAttention.__init__L  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar'   rH   r  bszc                     |j                  ||| j                  | j                        j                  dd      j	                         S rN  )r  rR  rS  r   rh   )r   rH   r  rV  s       r%   _shapezGroupViTAttention._shape_  s7    {{3GQQRSUVWbbddr'   Nr9  r=  r   r   c                 &   |j                         \  }}}|du}| j                  |      | j                  z  }	|rE| j                  | j	                  |      d|      }
| j                  | j                  |      d|      }nD| j                  | j	                  |      d|      }
| j                  | j                  |      d|      }|| j                  z  d| j                  f} | j                  |	||      j                  | }	 |
j                  | }
 |j                  | }|
j                  d      }t        j                  |	|
j                  dd            }|j                         || j                  z  ||fk7  r/t        d|| j                  z  ||f d|j                                |{|j                         |d||fk7  r#t        d|d||f d|j                                |j                  || j                  ||      |z   }|j                  || j                  z  ||      }t        j                  j                  |d      }|j                  || j                  ||      }|j                  || j                  z  ||      }t        j                  j!                  || j                   | j"                  	      }t        j                  ||      }|j                         || j                  z  || j                  fk7  r7t        d
|| j                  || j                  f d|j                                |j                  || j                  || j                        }|j                  dd      }|j%                  |||      }| j'                  |      }||fS )z#Input shape: Batch x Time x ChannelNre   r   rO   z$Attention weights should be of size z	, but is z!Attention mask should be of size r   )pr   z `attn_output` should be of size )rS   r   r^   rX  r   r   rR  rS  r  r"   bmmr   r   r   r    r5   r   r   rY   rU  )r   r9  r=  r   kwargsrV  tgt_lenr   is_cross_attentionquery_states
key_statesvalue_states
proj_shapesrc_lenattn_weightsattn_weights_reshaped
attn_probsattn_outputs                     r%   r   zGroupViTAttention.forwardb  s{    #0"4"4"6Wi2$> {{=1DJJ>T[[1F%GSQJ;;t{{3H'I2sSLT[[%?SIJ;;t{{='A2sKLDNN*B>
Ct{{<#>CCZP$Z__j1
(|((*5//!$yyz/C/CAq/IJ3#7'"JJ6dnn8LgW^7_6` a %%'(* 
 %""$a'(BB 7a'8R7SS\]k]p]p]r\st  (,,S$..'7SVddL',,S4>>-A7GTL}},,\r,B !- 1 1#t~~wPW X,11#2FQXY]]**<4<<RVR_R_*`
ii
L9#"6!OO2CRVR_R_3`2a b$$&') 
 "&&sDNNGT]]S!++Aq1!))#w	BmmK0111r'   NN)r   r   r   r   ry   r"   r   rV   rX  r   r   r   r   r   s   @r%   rz   rz   I  s    GB&eU\\ eC ec e /3:>	D2||D2 t+D2  %0047	D2 
u||U\\D00	1D2r'   rz   c                        e Zd Zdef fdZdej                  dej                  dee   de	ej                  ej                  dz  f   fdZ xZS )	r,  rs   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y ru   )rx   ry   r}   r   rz   	self_attnr   r|   r~   layer_norm1r   r   layer_norm2r   s     r%   ry   zGroupViTEncoderLayer.__init__  sm    ++*62<<F<Q<QRv&<<F<Q<QRr'   r9  r=  r\  r   Nc                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r9  r=  r   )rl  rk  rm  r   )r   r9  r=  r\  residualr  s         r%   r   zGroupViTEncoderLayer.forward  s     !((7)4>> 
')
 
q
 !=0 ((7/ =0r'   )r   r   r   r   ry   r"   r   r   r   r   r   r   r   r   s   @r%   r,  r,    sc    S~ S||  +,	
 
u  %,,"55	6r'   r,  c                   \    e Zd ZU eed<   dZdZdZee	dZ
 ej                         d        Zy)GroupViTPreTrainedModelrs   groupvit)imagetextT)r9  r[   c                 T   | j                   j                  }t        |t        j                  t        j
                  f      rNt        j                  |j                  d|       |j                  t        j                  |j                         nt        |t        j                  t        j                  f      rt        j                  |j                         t        j                  |j                         t        |dd      ]t        j                  |j                         t        j                  |j                          t        j                  |j"                         | j                   j$                  }t        |t&              rt        j                  |j(                  j                  d|dz         t        j                  |j*                  j                  d|dz         t        j,                  |j.                  t1        j2                  |j.                  j4                  d         j7                  d             yt        |t8              r| j                   j$                  }|j:                  dz  d	|j                   j<                  z  dz  z  |z  }|j:                  dz  |z  }t        j                  |j>                  j                  |
       t        j                  |j@                  j                  |
       t        j                  |jB                  j                  |
       t        j                  |jD                  j                  |
       yt        |tF              r| j                   j$                  }|j                   jH                  dz  d	|j                   j<                  z  dz  z  |z  }d	|j                   jH                  z  dz  |z  }t        j                  |jJ                  j                  |
       t        j                  |jL                  j                  |
       yy)zInitialize the weightsrC   )meanstdNrunning_meang{Gz?re   r  r   rO   )rw  )'rs   initializer_ranger   r   r   r   initnormal_r  biaszeros_r|   BatchNorm1dones_r   rx  running_varnum_batches_trackedinitializer_factorr  r  r  copy_r  r"   r#   rJ   r  rz   r   num_hidden_layersr   r   r   rU  r   r}   rI  rJ  )r   module
init_rangefactorin_proj_stdout_proj_stdfc_stds          r%   _init_weightsz%GroupViTPreTrainedModel._init_weights  s    [[22
fryy"))45LLSjA{{&FKK(r~~ >?KK$JJv}}%v~t4@F//0

6--.F667//f45LL//66SftmTLL2299RVWJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 12[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B,[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**< -r'   N)r   r   r   r   r   base_model_prefixinput_modalitiessupports_gradient_checkpointingr,  rz   _can_record_outputsr"   rf   r  r   r'   r%   rq  rq    sD    "(&*#-'
 U]]_"= "=r'   rq  c                   x     e Zd Zdeddf fdZ	 	 	 d
dej                  dedz  dedz  dedz  dee	z  f
d	Z
 xZS )GroupViTVisionEncoderrs   r   Nc                 h   t         |           || _        t        j                  t        t        |j                              D cg c]P  }t        ||j                  |   |j                  |   |j                  |   |dkD  r|j                  |dz
     nd      R c}      | _        d| _        y c c}w )Nr   r   )rs   r&  r   r   r'  F)rx   ry   rs   r   r*  r+  r$   depthsr%  num_group_tokensnum_output_groupsstagesgradient_checkpointing)r   rs   ir   s      r%   ry   zGroupViTVisionEncoder.__init__  s    mm s6==12	  ! --*$*$;$;A$>%+%=%=a%@LMPQE)A)A!a%)HWX	
 ',#	s   AB/r9  output_hidden_statesr;  return_dictc                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|rdnd }|rdnd }d }t	        | j
                        D ]3  \  }}	|r||fz   } |	|||      }
|
d   }|
d   }|s%|
d   +||
d   fz   }5 |r||fz   }|st        d |||fD              S t        |||      S )Nr   r   r   rO   c              3   &   K   | ]	  }||  y wr2  r   )r   vs     r%   r   z0GroupViTVisionEncoder.forward.<locals>.<genexpr>0  s     gqYZYfgs   )last_hidden_stater9  r[   )rs   r;  r  r  	enumerater  r   r   )r   r9  r  r;  r  all_hidden_statesall_groupingsr   r  stagelayer_outputss              r%   r   zGroupViTVisionEncoder.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY"6BD/T!$++. 
	DHAu#$58H$H!!-?PQM)!,M(+L ]1%5%A -q1A0C C
	D   1]4D Dg]4E}$Uggg+;LYf
 	
r'   r"  )r   r   r   r   ry   r"   r   r   r   r   r   r   r   s   @r%   r  r    sl    ,3 , ,( -1)-#'%
||%
 #Tk%
  $;	%

 D[%
 
	 %
r'   r  c                   f     e Zd ZdZdef fdZ	 d	dej                  dz  dee	   de
ez  fdZ xZS )
GroupViTTextEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self-attention layers. Each layer is a
    [`GroupViTEncoderLayer`].

    Args:
        config: GroupViTTextConfig
    rs   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w rA  )
rx   ry   rs   r   r*  r+  r  r,  r-  r  )r   rs   r  r   s      r%   ry   zGroupViTTextEncoder.__init__?  sP    mm5QWQiQiKj$ka%9&%A$kl&+# %ls   A#Nr=  r\  r   c                 T    |}| j                   D ]  } |||fi |} t        |      S )a7  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
        )r  )r-  r   )r   r  r=  r\  r9  encoder_layers         r%   r   zGroupViTTextEncoder.forwardE  sH    ( &![[ 	M) M	 +
 	
r'   r2  )r   r   r   r   r   ry   r"   r   r   r   r   r   r   r   r   s   @r%   r  r  6  sQ    ,1 , /3
 t+
 +,	

 
	 
r'   r  c                        e Zd Zdef fdZe ed      e	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	e
e   d
ef
d                     Z xZS )GroupViTTextTransformerrs   c                 
   t         |   |       |j                  }t        |      | _        t        |      | _        t        j                  ||j                        | _
        |j                  | _        | j                          y ru   )rx   ry   r}   r  r   r  encoderr   r|   r~   final_layer_normeos_token_id	post_initr  s      r%   ry   z GroupViTTextTransformer.__init__g  sf     &&	08*62 "YF<Q<Q R #//r'   F)tie_last_hidden_statesNr  r=  r  r\  r   c                 Z   |t        d      |j                         }|j                  d|d         }| j                  ||      }t	        | j
                  ||d       }|j                  dd         | j                  d||dd|}|d   }| j                  |      }| j                  d	k(  rm|t        j                  |j                  d   |j                  
      |j                  t        j                  |j                        j!                  d      f   }	n|t        j                  |j                  d   |j                  
      |j                  t        j                  |j                        | j                  k(  j                         j!                  d      f   }	t#        ||	      S )NzYou have to specify input_idsre   )r  r  )rs   r  r=  past_key_values	is_causalT)r  r=  r  r   rO   r   )rD   r   r   )r  pooler_outputr   )r   rS   r  r   r	   rs   popr  r  r  r"   r#   rJ   r   torV   argmaxr   )
r   r  r=  r  r\  input_shaper9  encoder_outputsr  pooled_outputs
             r%   r   zGroupViTTextTransformer.forwards  s    <==nn&NN2{27	),W+;;') 	
 	

;%+74<< ,
'),
 	,
 ,A. 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M */'
 	
r'   r"  )r   r   r   r   ry   r   r   r   r"   r   r   r   r   r   r   r   s   @r%   r  r  f  s    
1 
  E2 *..2,0	:
<<$&:
 t+:
 llT)	:

 +,:
 
$:
  3  :
r'   r  c                        e Zd ZU eed<   dZdef fdZdej                  fdZ	d Z
e	 	 	 ddej                  dz  d	ej                  dz  d
ej                  dz  dee   deez  f
d       Z xZS )GroupViTTextModelrs   )rt  c                 d    t         |   |       t        |      | _        | j	                          y r2  )rx   ry   r  
text_modelr  r   s     r%   ry   zGroupViTTextModel.__init__  s&     1&9r'   r   c                 B    | j                   j                  j                  S r2  r  r   r  r   s    r%   get_input_embeddingsz&GroupViTTextModel.get_input_embeddings  s    ))999r'   c                 :    || j                   j                  _        y r2  r  )r   r   s     r%   set_input_embeddingsz&GroupViTTextModel.set_input_embeddings  s    5:""2r'   Nr  r=  r  r\  c                 .     | j                   d|||d|S )a9  
        Examples:

        ```python
        >>> from transformers import CLIPTokenizer, GroupViTTextModel

        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTTextModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r  r=  r  r   )r  )r   r  r=  r  r\  s        r%   r   zGroupViTTextModel.forward  s/    . t 
)%
 	
 	
r'   r"  )r   r   r   r   r   r  ry   r   Moduler  r  r   r"   r   r   r   r   r   r   r   r   s   @r%   r  r    s     1 :bii :;  *..2,0	
<<$&
 t+
 llT)	

 +,
 
+	+
 
r'   r  c                        e Zd Zdef fdZe	 	 	 	 d
dej                  dz  dedz  dedz  dedz  de	e
z  f
d	       Z xZS )GroupViTVisionTransformerrs   c                     t         |           || _        |j                  }t	        |      | _        t        |      | _        t        j                  ||j                        | _        y ru   )rx   ry   rs   r}   r   r   r  r  r   r|   r~   r   r  s      r%   ry   z"GroupViTVisionTransformer.__init__  sP    &&	26:,V4iV5J5JKr'   Nr   r  r;  r  r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      | j                  |      }| j                  ||||      }|d   }| j                  |      }|j                  d      }|s
||f|dd  z   S t        |||j                  |j                        S )Nz You have to specify pixel_values)r9  r  r;  r  r   r   r   )r  r  r9  r[   )rs   r;  r  r  r   r   r  r   rv  r   r9  r[   )	r   r   r  r;  r  r9  r  r  r  s	            r%   r   z!GroupViTVisionTransformer.forward  s    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY?@@5,,'!5/#	 ' 
 ,A. !NN+<=)..1.5%}58KKK)/')77&11	
 	
r'   NNNN)r   r   r   r   ry   r   r"   r   r   r   r   r   r   r   s   @r%   r  r    s    L3 L  26,0)-#''
''$.'
 #Tk'
  $;	'

 D['
 
+	+'
 '
r'   r  c                        e Zd ZU eed<   dZdZi Zdef fdZde	fdZ
e	 	 	 	 ddej                  dz  dedz  d	edz  d
edz  deez  f
d       Z xZS )GroupViTVisionModelrs   r   )rs  c                 d    t         |   |       t        |      | _        | j	                          y r2  )rx   ry   r  vision_modelr  r   s     r%   ry   zGroupViTVisionModel.__init__  s'     5f=r'   r   c                 B    | j                   j                  j                  S r2  )r  r   r   r   s    r%   r  z(GroupViTVisionModel.get_input_embeddings#  s      ++<<<r'   Nr;  r  r  c                 ,    | j                  ||||      S )a)  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, GroupViTVisionModel

        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> model = GroupViTVisionModel.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r;  r  r  )r  )r   r   r;  r  r  r\  s         r%   r   zGroupViTVisionModel.forward&  s(    >   %/!5#	 ! 
 	
r'   r  )r   r   r   r   r   main_input_namer  r  ry   r   r  r   r"   r   r   r   r   r   r   r   s   @r%   r  r    s      $O!3 =&= =  26)-,0#'#
''$.#
  $;#
 #Tk	#

 D[#
 
+	+#
 #
r'   r  c                       e Zd ZU eed<   def fdZee	 	 ddej                  dej                  dz  dej                  dz  de
e   deez  f
d	              Zeed
ej                  de
e   deez  fd              Zee	 	 	 	 	 	 	 	 ddej                   dz  d
ej"                  dz  dej                  dz  dej                   dz  dedz  dedz  dedz  dedz  de
e   deez  fd              Z xZS )GroupViTModelrs   c           
      6   t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  | _	        |j                  | _
        |j                  | _        |j                  | _        t        |      | _        t!        |      | _        t%        j&                  t%        j(                  | j                  | j                  d      t%        j*                  | j                        t%        j,                  d      t%        j(                  | j                  | j                  d            | _        t%        j&                  t%        j(                  | j                  | j                  d      t%        j*                  | j                        t%        j,                  d      t%        j(                  | j                  | j                  d            | _        t%        j2                  t5        j6                  | j8                  j:                              | _        | j?                          y )NzOconfig.text_config is expected to be of type GroupViTTextConfig but is of type .zSconfig.vision_config is expected to be of type GroupViTVisionConfig but is of type T)r|  )inplace) rx   ry   r   text_configr   	TypeErrortypevision_configr   projection_dimprojection_intermediate_dimr}   text_embed_dimvision_embed_dimr  r  r  r  r   r/  r   r~  ReLUvisual_projectiontext_projectionr   r"   rH   rs   logit_scale_init_valuelogit_scaler  )r   rs   r  r  r   s       r%   ry   zGroupViTModel.__init__Q  s    &,,.@A++,-Q0 
 &..0DE--./q2 
 ((,,$33+1+M+M()55 - 9 91+>5mD!#IId++T-M-MTXYNN4;;<GGD!IId668K8KRVW	"
  "}}IId))4+K+KRVWNN4;;<GGD!IId668K8KRVW	 
 <<T[[5W5W(XY 	r'   Nr  r=  r  r\  r   c                 x     | j                   d|||dd|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import CLIPTokenizer, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> tokenizer = CLIPTokenizer.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```T)r  r=  r  r  r   )r  r  r  )r   r  r=  r  r\  text_outputsr  s          r%   get_text_featureszGroupViTModel.get_text_features|  sV    . 4C4?? 4
)%	4

 4
 %22%)%9%9-%H"r'   r   c                 p     | j                   |fddi|}| j                  |j                        |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, GroupViTModel
        >>> from transformers.image_utils import load_image

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```r  T)r  r  r  )r   r   r\  vision_outputss       r%   get_image_featuresz GroupViTModel.get_image_features  sB    4 6GT5F5F|5pae5pio5p'+'='=n>Z>Z'[$r'   return_lossr;  r  output_segmentationc	           
         ||n| j                   j                  }||n| j                   j                  }|rd}||n| j                   j                  }| j	                  |||d      }
 | j
                  d|||d|	}|
j                  }| j                  |      }|j                  }| j                  |      }||j                  dd      z  }||j                  dd      z  }| j                  j                         }t        j                  ||j                               |z  }|j                         }d}|rh|
j                  }| j                  |j!                  d|j"                  d               }|
j$                  }t'        ||j"                  dd       }||j                  dd      z  }t        j                  ||j                               |z  }|j!                  |j"                  d   d|j"                  d         j)                  ddd	      }|j!                  |j"                  d   |j"                  d	   d      }t        j                  ||      |z  }|j!                  |j"                  d   |j"                  d	   |j"                  d   |j"                  d
         }d}|rt+        |      }t-        ||||||||
      S )a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        output_segmentation (`bool`, *optional*):
            Whether or not to return the segmentation logits.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, GroupViTModel

        >>> model = GroupViTModel.from_pretrained("nvidia/groupvit-gcc-yfcc")
        >>> processor = AutoProcessor.from_pretrained("nvidia/groupvit-gcc-yfcc")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```NTr  r  re   r   rO   r   r   r   )r   r   r   r   r   r   r   r   r   )rs   r;  r  r  r  r  r  r  r  normr  expr"   matmulr*   r  rY   rJ   r[   rp   rg   r-   r   )r   r  r   r=  r  r  r;  r  r  r\  r  r  r   r   r  r   r   
seg_logitsimage_group_embedsr[   groupinglogits_per_image_groupflatten_groupingr   s                           r%   r   zGroupViTModel.forward  s   X 2C1N-TXT_T_TqTq#6#BHgHg 	  $$8$D $++JjJj 	
 **%/!5	 + 
 4C4?? 4
)%4
 	4
 &33--l;"00**;7 $l&7&7B&7&MM!K$4$4T$4$JJ &&**,,,{LNN4DES*,,.
 "0!A!A!%!7!78J8R8RSUWiWoWoprWs8t!u'22J3J@R@RSTSU@VWH "46H6M6MRT^b6M6c!c%*\\2Dkmmo%VYd%d"%;%C%C""1%r;+<+<Q+?&gaA #
  (//q0A8>>RSCTVXY &<>NOR]]J#++  #Z%5%5a%8(..:KX^^\]M^J  1D"-+ *#%* .	
 		
r'   rh  )NNNNNNNN)r   r   r   r   r   ry   r   r   r"   r   r   r   r   r   r  r  r#  r   r   r   r   r   r   s   @r%   r  r  M  s   )~ )V  /3,0	<< t+ llT)	
 +, 
+	+  B ll +, 
+	+	  :  .215.204#')-,0+/|
##d*|
 ''$.|
 t+	|

 &&-|
 D[|
  $;|
 #Tk|
 "D[|
 +,|
 
$	$|
  |
r'   r  )r  rq  r  r  )r   Fre   r   )Mr   collections.abcr   dataclassesr   typingr   numpyrW   r"   r    r   rz  activationsr   masking_utilsr	   modeling_layersr
   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   configuration_groupvitr   r   r   
get_loggerr   loggerr   r&   r-   rV   r?   floatr   rM   rc   rp   r  rr   r   r   r   r   r   r  r%  r   r   rz   r,  rq  r  r  r  r  r  r  r  __all__r   r'   r%   <module>r
     sc     !     & ! / 9 K - & j j 7 5 \ \ 
		H	%
`U\\ `ell `
-ell -u|| - C 5<< e t RU _d_k_k ,<:"))  -bii -`4+")) 4+n -
+ -
  -
`bii DGryy GV%RYY %PZBII Zz")) 0!{ !]2		 ]2B5 B -=o -= -=`7
BII 7
t-
")) -
`J
5 J
Z,
/ ,
^2
		 2
j3
1 3
l n
+ n
 n
b cr'   