
    i                        d Z ddlZddlmZ ddlmZ ddlZddlmZ ddlm	Z
 ddlmZ dd	lmZmZ dd
lmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlmZmZ ddl m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z) ddl*m+Z+ ddl,m-Z- ddl.m/Z/ ddl0m1Z1m2Z2  e(jf                  e4      Z5e e'd       G d de%                    Z6dejn                  defdZ8 e+dd d!"      	 	 	 	 d^d#ed!ejn                  d$ejn                  dz  d%edz  d&ejn                  dz  d'ejn                  dz  d(ejr                  dz  d)e:d*e:dz  de;fd+       Z< G d, d-ejz                        Z> G d. d/ejz                        Z? G d0 d1ejz                        Z@d2e?iZA G d3 d4ejz                        ZB G d5 d6ejz                        ZC G d7 d8ejz                        ZD G d9 d:e      ZE G d; d<ejz                        ZFe' G d= d>e             ZG G d? d@ejz                        ZH G dA dBejz                        ZI	 d_dCejz                  dDejn                  dEejn                  dFejn                  d$ejn                  dz  dGeJdHeJfdIZK G dJ dKejz                        ZL G dL dMe      ZM G dN dOejz                        ZN G dP dQejz                        ZO e'dR       G dS dTeG             ZP G dU dVejz                        ZQ e'dW       G dX dYeG             ZR e'dZ       G d[ d\eGe             ZSg d]ZTy)`zPyTorch GIT model.    N)Callable)	dataclass)nn   )initialization)ACT2FN)CacheDynamicCache)PreTrainedConfig)GenerationMixin)create_masks_for_generate)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPastBaseModelOutputWithPoolingCausalLMOutputWithPast)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)apply_chunking_to_forward)ModelOutputTransformersKwargsauto_docstringlogging	torch_int)deprecate_kwarg)merge_with_config_defaults)capture_outputs   )	GitConfigGitVisionConfigz}
    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
ej                  df   dz  ed<   dZe
ej                  df   dz  ed<   y)GitVisionModelOutputz
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
        The image embeddings obtained by applying the projection layer to the pooler_output.
    Nimage_embedslast_hidden_state.hidden_states
attentions)__name__
__module____qualname____doc__r%   torchFloatTensor__annotations__r&   r'   tupler(        u/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/git/modeling_git.pyr$   r$   8   sr    
 .2L%##d*126u((4/6:>M5**C/047>7;Je'',-4;r2   r$   	group_idsreturnc           
      P     dt         dt         dt         dt         dt        f
 fd}|S )au  
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    Args:
        group_ids (`torch.Tensor`):
            A tensor of shape `(bs, len)` assigning each token to a vision group. Tokens with the same group
            come from the same input image. Text is denoted by `-1`.
    	batch_idxhead_idxq_idxkv_idxr5   c                    	j                   d   }|j                  |dz
        }|j                  |dz
        }	| |f   }	| |f   }t        j                  ||k  |d      }t        j                  ||k  |d      }||k(  |dk\  z  S )Nr   )maxr   )shapeclampr-   where)
r7   r8   r9   r:   
seq_lengthq_idx_clampedkv_idx_clampedq_groupkv_groupr4   s
            r3   
inner_maskz0token_type_ids_mask_function.<locals>.inner_maskV   s    __R(
 
Q7*q.9 I}45Y67++ej0'2>;;v
2HbA8#155r2   )intbool)r4   rF   s   ` r3   token_type_ids_mask_functionrI   L   s3    6c 6S 6 6c 6d 6 r2   input_embedsz5.6.0inputs_embeds)versionnew_nameconfigattention_maskpast_key_valuesposition_idstoken_type_idspixel_valuesis_trainingis_first_iterationc	                    |r|t        d      | j                         ||||d}
||n|du xs |j                   xs |du}||r|dk(  j                  |j                        }t
        j                  j                  |dd      ddddf   }|| z  }t        j                  |j                         d	      dz
  }t        j                  ||d      }t        |      |
d
<   t        di |
S )a  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Gemma3 uses a bidirectional mask for images.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz;`token_type_ids` is required as a model input when training)rN   rK   rO   rP   rQ   r   )r   r   r   )valuer<   dimor_mask_functionr1   )
ValueErrorget_text_configis_initializedtodevicer   
functionalpadr-   cumsumrG   r@   rI   r   )rN   rK   rO   rP   rQ   rR   rS   rT   rU   kwargsmask_kwargsis_imageis_previous_imagenew_image_startr4   s                  r3   create_causal_mask_mappingrh   g   s   ( ~-VWW ((*&(*$K ) 	%g_-K-K)Kg|cgOg 
 !&8 #a'++M,@,@AMM--ha-HCRCP"&7%77LL!4!4!6A>B	KK)R8	*Fy*Q&'$3{33r2   c                        e Zd ZdZ fdZ	 	 	 	 d
dej                  dz  dej                  dz  dej                  dz  dedej                  f
d	Z
 xZS )GitEmbeddingsz;Construct the embeddings from word and position embeddings.c                    t         |           t        j                  |j                  |j
                  |j                        | _        t        j                  |j                  |j
                        | _	        t        j                  |j
                  |j                        | _
        t        j                  |j                        | _        | j                  dt!        j"                  |j                        j%                  d      d       y )N)padding_idxepsrQ   r   r<   F
persistent)super__init__r   	Embedding
vocab_sizehidden_sizepad_token_idword_embeddingsmax_position_embeddingsposition_embeddings	LayerNormlayer_norm_epsDropouthidden_dropout_probdropoutregister_bufferr-   arangeexpandselfrN   	__class__s     r3   rs   zGitEmbeddings.__init__   s    !||F,=,=v?Q?Q_e_r_rs#%<<0N0NPVPbPb#c f&8&8f>S>STzz&"<"<=ELL)G)GHOOPWXej 	 	
r2   N	input_idsrQ   rK   past_key_values_lengthr5   c                 ,   ||j                         }n|j                         d d }|d   }|| j                  d d |||z   f   }|| j                  |      }n|}| j                  |      }||z  }| j	                  |      }| j                  |      }|S )Nr<   r   )sizerQ   rx   rz   r{   r   )	r   r   rQ   rK   r   input_shaperA   
embeddingsrz   s	            r3   forwardzGitEmbeddings.forward   s      #..*K',,.s3K ^
,,Q0FVlIl0l-lmL --i8J&J"66|D))
^^J/
\\*-
r2   )NNNr   )r)   r*   r+   r,   rs   r-   
LongTensorr.   rG   Tensorr   __classcell__r   s   @r3   rj   rj      ss    E

 .20426&'##d* &&- ((4/	
 !$ 
r2   rj   c                        e Zd Zd	 fd	Z	 	 d
dej
                  dej                  dz  dedz  dee	   de
ej
                     f
dZ xZS )GitSelfAttentionNc                    t         |           |j                  |j                  z  dk7  r2t	        |d      s&t        d|j                   d|j                   d      || _        |-t        j                  d| j                  j                   d       |j                  | _        t        |j                  |j                  z        | _        | j                  | j                  z  | _        t        |j                  j                  |j                  j                   z  dz  d	z         | _        |j$                  | xj"                  |j$                  z  c_        t'        j(                  |j                  | j                        | _        t'        j(                  |j                  | j                        | _        t'        j(                  |j                  | j                        | _        t'        j0                  |j2                        | _        y )
Nr   embedding_sizezThe hidden size (z6) is not a multiple of the number of attention heads ()zInstantiating z without passing a `layer_idx` is not recommended and will lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` when creating this class.   r   )rr   rs   rv   num_attention_headshasattrr[   	layer_idxloggerwarning_oncer   r)   rG   attention_head_sizeall_head_sizevision_config
image_size
patch_sizeimage_patch_tokensnum_image_with_embeddingr   LinearquerykeyrW   r}   attention_probs_dropout_probr   r   rN   r   r   s      r3   rs   zGitSelfAttention.__init__   s    : ::a?PVXhHi#F$6$6#7 8 445Q8  # !8!8 9 :, , $*#=#= #&v'9'9F<V<V'V#W !558P8PP"%v';';'F'FI]I]IhIh'hmn&nqr&r"s**6##v'F'FF#YYv1143E3EF
99V//1C1CDYYv1143E3EF
zz&"E"EFr2   r'   rO   rP   rc   r5   c                    |j                   d d }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }	| |j                  ||	| j                        \  }}	t        j                  ||j	                  dd            }
|
t        j                  | j                        z  }
||
|z   }
t        j                  j                  |
d      }| j!                  |      }t        j                  ||	      }|j#                  dddd      j%                         }|j'                         d d | j(                  fz   }|j                  |      }||fS )Nr<   r   r   rX   r   r   )r>   r   r   view	transposer   rW   updater   r-   matmulmathsqrtr   r`   softmaxr   permute
contiguousr   r   )r   r'   rO   rP   rc   r   hidden_shapequery_layer	key_layervalue_layerattention_scoresattention_probscontext_layernew_context_layer_shapes                 r3   r   zGitSelfAttention.forward   s    $))#2.CCbC$*B*BCjj/44\BLLQPQRHH]+00>HHAN	jj/44\BLLQPQR&%4%;%;I{TXTbTb%c"I{ !<<Y5H5HR5PQ+dii8P8P.QQ%/.@ --//0@b/I ,,7_kB%--aAq9DDF"/"4"4"6s";t?Q?Q>S"S%**+BCo--r2   NNNr)   r*   r+   rs   r-   r   r.   r	   r   r   r0   r   r   r   s   @r3   r   r      sh    G> 48(,	%.||%. ))D0%. 	%.
 +,%. 
u||	%.r2   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )GitSelfOutputc                 (   t         |           t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                        | _
        y Nrm   )rr   rs   r   r   rv   denser{   r|   r}   r~   r   r   s     r3   rs   zGitSelfOutput.__init__  s`    YYv1163E3EF
f&8&8f>S>STzz&"<"<=r2   r'   input_tensorr5   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   r{   r   r'   r   s      r3   r   zGitSelfOutput.forward  7    

=1]3}|'CDr2   r)   r*   r+   rs   r-   r   r   r   r   s   @r3   r   r     1    >U\\  RWR^R^ r2   r   eagerc                        e Zd Zd	 fd	Z	 	 d
dej
                  dej                  dz  dedz  dee	   de
ej
                     f
dZ xZS )GitAttentionNc                     t         |           t        |j                     ||      | _        t        |      | _        y )Nr   )rr   rs   GIT_SELF_ATTENTION_CLASSES_attn_implementationr   r   outputr   s      r3   rs   zGitAttention.__init__$  s4    .v/J/JKF^gh	#F+r2   r'   rO   rP   rc   r5   c                 Z     | j                   |||fi |\  }}| j                  ||      }|S r   )r   r   )r   r'   rO   rP   rc   attn_output_attention_outputs           r3   r   zGitAttention.forward)  sD     #
 	
Q  ;;{MBr2   r   r   r   r   s   @r3   r   r   #  sg    , 48(,	 ||  ))D0  	 
 +,  
u||	 r2   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )GitIntermediatec                    t         |           t        j                  |j                  |j
                        | _        t        |j                  t              rt        |j                     | _        y |j                  | _        y r   )rr   rs   r   r   rv   intermediate_sizer   
isinstance
hidden_actstrr   intermediate_act_fnr   s     r3   rs   zGitIntermediate.__init__<  s]    YYv1163K3KL
f''-'-f.?.?'@D$'-'8'8D$r2   r'   r5   c                 J    | j                  |      }| j                  |      }|S r   )r   r   r   r'   s     r3   r   zGitIntermediate.forwardD  s&    

=100?r2   r   r   s   @r3   r   r   ;  s#    9U\\ ell r2   r   c                   n     e Zd Z fdZdej
                  dej
                  dej
                  fdZ xZS )	GitOutputc                 (   t         |           t        j                  |j                  |j
                        | _        t        j                  |j
                  |j                        | _        t        j                  |j                        | _        y r   )rr   rs   r   r   r   rv   r   r{   r|   r}   r~   r   r   s     r3   rs   zGitOutput.__init__L  s`    YYv779K9KL
f&8&8f>S>STzz&"<"<=r2   r'   r   r5   c                 r    | j                  |      }| j                  |      }| j                  ||z         }|S r   r   r   s      r3   r   zGitOutput.forwardR  r   r2   r   r   s   @r3   r   r   K  r   r2   r   c                        e Zd Zd
 fd	Z	 	 ddej
                  dej                  dz  dedz  dee	   de
ej
                     f
dZd	 Z xZS )GitLayerNc                     t         |           |j                  | _        d| _        t	        ||      | _        t        |      | _        t        |      | _	        y )Nr   r   )
rr   rs   chunk_size_feed_forwardseq_len_dimr   	attentionr   intermediater   r   r   s      r3   rs   zGitLayer.__init__Z  sK    '-'E'E$%f	B+F3'r2   r'   rO   rP   rc   r5   c                      | j                   ||fd|i|}t        | j                  | j                  | j                  |      }|S )NrP   )r   r   feed_forward_chunkr   r   )r   r'   rO   rP   rc   r   layer_outputs          r3   r   zGitLayer.forwardb  s_     *4>>
 ,
 	
 1##T%A%A4CSCSUe
 r2   c                 L    | j                  |      }| j                  ||      }|S r   )r   r   )r   r   intermediate_outputr   s       r3   r   zGitLayer.feed_forward_chunku  s,    "//0@A{{#68HIr2   r   r   )r)   r*   r+   rs   r-   r   r.   r	   r   r   r0   r   r   r   r   s   @r3   r   r   Y  sl    ( 48(,	|| ))D0 	
 +, 
u||	&r2   r   c                        e Zd Z fdZ	 	 	 d
dej
                  dej                  dz  dedz  dedz  de	e
   defd	Z xZS )
GitEncoderc           	          t         |           || _        t        j                  t        |j                        D cg c]  }t        ||       c}      | _        d| _	        y c c}w NF)
rr   rs   rN   r   
ModuleListrangenum_hidden_layersr   layergradient_checkpointing)r   rN   ir   s      r3   rs   zGitEncoder.__init__|  sP    ]]vG_G_A`#aAHVQ$7#ab
&+# $bs   A$Nr'   rO   rP   	use_cacherc   r5   c                 T    | j                   D ]  } ||||fi |} t        ||      S )Nr&   rP   )r   r   )r   r'   rO   rP   r   rc   layer_modules          r3   r   zGitEncoder.forward  sI     !JJ 	L( 	M	 '++
 	
r2   )NNN)r)   r*   r+   rs   r-   r   r.   r	   rH   r   r   r   r   r   r   s   @r3   r   r   {  so    , 48(,!%
||
 ))D0
 	

 $;
 +,
 
!
r2   r   c                   R    e Zd ZU eed<   dZdZdZ ej                         d        Z
y)GitPreTrainedModelrN   git)imagetextTc                    t        |t              rt        j                  |j                  d| j
                  j                         t        j                  |j                  j                  | j
                  j                         t        j                  |j                  j                  | j
                  j                         t        j                  |j                  t        j                  |j                  j                  d         j                  d             t        |t         j"                        rct        j                  |j                  d| j
                  j                         |j$                   t        j&                  |j$                         yyt        |t         j(                        rt        j                  |j                  d| j
                  j                         |j*                  Et-        |j                  dd      s-t        j&                  |j                  |j*                            yyyt        |t         j.                        r?t        j&                  |j$                         t        j0                  |j                         yt        |t2              rZt        j                  |j                  t        j                  |j                  j                  d         j                  d             yy)	zInitialize the weights        )meanstd)r  r<   ro   N_is_hf_initializedF)r   GitVisionEmbeddingsinitnormal_class_embeddingrN   initializer_rangepatch_embeddingweightposition_embeddingcopy_rQ   r-   r   r>   r   r   r   biaszeros_rt   rl   getattrr{   ones_rj   )r   modules     r3   _init_weightsz GitPreTrainedModel._init_weights  s    f12LL//ct{{?\?\]LL//66DKK<Y<YZLL2299t{{?\?\]JJv**ELL9L9L9R9RSU9V,W,^,^_f,ghfbii(LLSdkk6S6ST{{&FKK( '-LLSdkk6S6ST!!-gfmmMach6iFMM&*<*<=> 7j--KK$JJv}}%.JJv**ELL9L9L9R9RSU9V,W,^,^_f,gh /r2   N)r)   r*   r+   r    r/   base_model_prefixinput_modalitiessupports_gradient_checkpointingr-   no_gradr  r1   r2   r3   r   r     s6    (&*#U]]_i ir2   r   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )r  rN   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestrider  r   r   rQ   ro   rp   )rr   rs   rN   rv   	embed_dimr   r   r   	Parameterr-   randnr  Conv2dnum_channelsr  num_patchesnum_positionsrt   r  r   r   r   r   s     r3   rs   zGitVisionEmbeddings.__init__  s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr2   r   heightwidthr5   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr<   g      ?r   r   bicubicF)r   modealign_cornersrX   )r>   r  r  	unsqueezer-   jit
is_tracingrQ   r   r   reshaper   r   r`   interpolater   cat)r   r   r(  r)  r&  r  r'  class_pos_embedpatch_pos_embedrY   
new_height	new_widthsqrt_num_positionss                r3   interpolate_pos_encodingz,GitVisionEmbeddings.interpolate_pos_encoding  sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr2   rS   c                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model ().dtyper   r   r<   rX   )r>   r   r[   r  r  r>  r^   flattenr   r  r   r-   r3  r9  r  rQ   )r   rS   r9  
batch_sizer   r(  r)  target_dtypepatch_embedsclass_embedsr   s              r3   r   zGitVisionEmbeddings.forward  s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr2   )F)r)   r*   r+   r!   rs   r-   r   rG   r9  r.   r   r   r   s   @r3   r  r    sd    q q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf r2   r  c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )GitVisionMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r   )rr   rs   rN   r   r   activation_fnr   r   rv   r   fc1fc2r   s     r3   rs   zGitVisionMLP.__init__  sd    #F$5$5699V//1I1IJ99V55v7I7IJr2   r'   r5   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r   )rH  rG  rI  r   s     r3   r   zGitVisionMLP.forward  s4    /**=9/r2   r   r   s   @r3   rE  rE    s$    KU\\ ell r2   rE  r  r   r   rW   scalingr   c                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr<   r   )rY   r>  )ptrainingr   r   )r-   r   r   r   r`   r   float32r^   r>  r   rN  r   )
r  r   r   rW   rO   rK  r   rc   attn_weightsr   s
             r3   eager_attention_forwardrQ    s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r2   c                        e Zd ZdZ fdZ	 d	dej                  dej                  dz  dee   de	ej                  ej                  dz  f   fdZ
 xZS )
GitVisionAttentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r<  g      F)rr   rs   rN   rv   r!  r   	num_headshead_dimr[   scaleattention_dropoutr   	is_causalr   r   k_projv_projq_projout_projr   s     r3   rs   zGitVisionAttention.__init__5  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar2   Nr'   rO   rc   r5   c                    |j                   dd }g |d| j                  }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }| j                  |      j                  |      j	                  dd      }t        j                  | j                  j                  t              }	 |	| ||||f| j                  | j                  | j                  sdn| j                  d|\  }
} |
j                   g |d j#                         }
| j%                  |
      }
|
|fS )z#Input shape: Batch x Time x ChannelNr<   r   r   r  )rY  rK  r   )r>   rV  r\  r   r   rZ  r[  r   get_interfacerN   r   rQ  rY  rW  rN  r   r1  r   r]  )r   r'   rO   rc   r   r   querieskeysvaluesattention_interfacer   rP  s               r3   r   zGitVisionAttention.forwardI  sM    $))#2.88b8$--8++m,11,?II!QO{{=)..|<FFq!L]+00>HHAN(?(M(MKK,,.E)
 %8
%
 nnJJ#}}C$,,
%
 
%
!\ *k));;;;FFHmmK0L((r2   r   )r)   r*   r+   r,   rs   r-   r   r   r   r0   r   r   r   s   @r3   rS  rS  2  sf    GB. /3)||) t+) +,	)
 
u||U\\D00	1)r2   rS  c                        e Zd Zdef fdZdej                  dej                  dee   de	ej                  ej                  dz  f   fdZ xZS )	GitVisionEncoderLayerrN   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y r   )rr   rs   rv   r!  rS  	self_attnr   r{   r|   layer_norm1rE  mlplayer_norm2r   s     r3   rs   zGitVisionEncoderLayer.__init__m  sm    +++F3<<F<Q<QR'<<F<Q<QRr2   r'   rO   rc   r5   Nc                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r'   rO   r1   )rh  rg  rj  ri  )r   r'   rO   rc   residualr   s         r3   r   zGitVisionEncoderLayer.forwardu  s     !((7)4>> 
')
 
q
 !=0 ((7/ =0r2   )r)   r*   r+   r!   rs   r-   r   r   r   r0   r.   r   r   r   s   @r3   re  re  l  sc    S S||  +,	
 
u  %,,"55	6r2   re  c                   f     e Zd ZdZdef fdZ	 d	dej                  dz  dee	   de
ez  fdZ xZS )
GitVisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`GitVisionEncoderLayer`].

    Args:
        config: GitVisionConfig
    rN   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w r   )
rr   rs   rN   r   r   r   r   re  layersr   )r   rN   r   r   s      r3   rs   zGitVisionEncoder.__init__  sP    mmERXRjRjLk$lq%:6%B$lm&+# %ms   A#NrO   rc   r5   c                 T    |}| j                   D ]  } |||fi |} t        |      S )a8  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)

        r&   )rp  r   )r   rK   rO   rc   r'   encoder_layers         r3   r   zGitVisionEncoder.forward  sH    * &![[ 	M) M	 +
 	
r2   r   )r)   r*   r+   r,   r!   rs   r-   r   r   r   r0   r   r   r   r   s   @r3   rn  rn    sP    , , /3
 t+
 +,	

 
	 
r2   rn  c            
       r     e Zd Zdef fdZe	 	 d	dej                  dz  dedz  de	e
   defd       Z xZS )
GitVisionTransformerrN   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )rr   rs   rN   rv   r  r   r   r{   r|   pre_layrnormrn  encoderpost_layernorm)r   rN   r!  r   s      r3   rs   zGitVisionTransformer.__init__  sj    &&	-f5LL8M8MN'/ ll9&:O:OPr2   NrS   r9  rc   r5   c                     |t        d      | j                  ||      }| j                  |      } | j                  dd|i|}|j                  }| j                  |      }t        |      S )Nz You have to specify pixel_valuesr9  rK   rr  r1   )r[   r   rw  rx  r&   ry  r   )r   rS   r9  rc   r'   encoder_outputsr&   s          r3   r   zGitVisionTransformer.forward  s     ?@@Ogh))-8&$,, 
'


 ,== //0AB/
 	
r2   r   )r)   r*   r+   r!   rs   r   r-   r.   rH   r   r   r   r   r   r   s   @r3   ru  ru    sh    Q Q  2605
''$.
 #'+
 +,	

 

 
r2   ru  zY
    The vision model from CLIP, used in GIT, without any head or projection on top.
    c                        e Zd ZU eed<   dZdZeedZ	def fdZ
dej                  fdZe ed	      e	 	 ddej$                  d
z  dedee   deez  fd                     Z xZS )GitVisionModelrN   rS   )r  r'   r(   c                 d    t         |   |       t        |      | _        | j	                          y r   )rr   rs   ru  vision_model	post_initr   s     r3   rs   zGitVisionModel.__init__  s'     08r2   r5   c                 B    | j                   j                  j                  S r   )r  r   r  r   s    r3   get_input_embeddingsz#GitVisionModel.get_input_embeddings  s      ++;;;r2   F)tie_last_hidden_statesNr9  rc   c                 ,     | j                   d||d|S )a  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, GitVisionModel

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = GitVisionModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```)rS   r9  r1   )r  )r   rS   r9  rc   s       r3   r   zGitVisionModel.forward  s.    < !t   
%%=
 
 	
r2   r   )r)   r*   r+   r!   r/   main_input_namer  re  rS  _can_record_outputsrs   r   Moduler  r   r   r   r-   r.   rH   r   r   r0   r   r   r   r   s   @r3   r~  r~    s     $O!.( <bii <  E2 26).
''$.
 #'
 +,	

 
	 
  3  
r2   r~  c                   \     e Zd Zdef fdZdej                  dej                  fdZ xZS )GitProjectionrN   c                 0   t         |           || _        t        j                  t        j
                  |j                  j                  |j                        t        j                  |j                  |j                  j                              | _
        y r   )rr   rs   rN   r   
Sequentialr   r   rv   r{   r|   visual_projectionr   s     r3   rs   zGitProjection.__init__$  sf    !#IIf**668J8JKLL++1E1E1T1TU"
r2   r   r5   c                 $    | j                  |      S r   )r  )r   r   s     r3   r   zGitProjection.forward,  s    %%j11r2   )	r)   r*   r+   r    rs   r-   r   r   r   r   s   @r3   r  r  #  s*    
y 
2%,, 25<< 2r2   r  zy
    The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states
    c                   H    e Zd ZeedZ fdZd Zd Ze	e
e	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dedz  dedz  dedee   deej                     ez  fd                     Z xZS )GitModelr  c                 l   t         |          | _        t              | _        t        j                        | _        t              | _	        t              | _        j                  6t        j                  fdt        j                        D              | _        | j#                          y )Nc              3      K   | ]B  }t        j                  t        j                  d d j                  j
                               D yw)r   N)r   r"  r-   zerosr   rv   ).0r   rN   s     r3   	<genexpr>z$GitModel.__init__.<locals>.<genexpr>F  s;      ; U[[Av/C/C/O/OPQ;s   AA)rr   rs   rN   rj   r   r~  r   image_encoderr   rx  r  r  r   r   ParameterListr   img_temporal_embeddingr  r   s    `r3   rs   zGitModel.__init__;  s     '/+F,@,@A!&)!.v!6**6*,*:*: ;v>>?; +D' 	r2   c                 .    | j                   j                  S r   r   rx   r  s    r3   r  zGitModel.get_input_embeddingsN  s    ...r2   c                 &    || j                   _        y r   r  )r   rW   s     r3   set_input_embeddingszGitModel.set_input_embeddingsQ  s    */'r2   Nr   rO   rQ   rS   rK   rP   r   r9  rc   r5   c	           	      N   |du |duz  rt        d      |r|t        | j                        }d}
|0t        |t              s|j                         n|j                         }
|||j                  d   dk(  r||
z   }| j                  ||||
      }t        j                  |t        j                        d   }||j                  d	k(  r| j                  ||
      j                  }n|j                  dk(  rg }t        |j                  d         D ]O  }| j                  |dd|ddddf   |
      j                  }|| j                  |   z  }|j!                  |       Q t        j"                  |d      }nt        d      | j%                  |      }|j'                  |j)                  d      |j)                  d      z  dd      }t        j"                  ||fd      }t        j*                  |t        j                        d   }t        j"                  ||gd      }|t        j"                  t        j*                  |      |gd      }n{|y|j                  d   dk(  rgt        j,                  |j                  d   |
|j                  d   z
  dz   f|j.                  |j0                        }t        j"                  ||gd      }t3        | j                  |||d||      }|} | j4                  |f|||d|	}t7        |j                  |j8                        S )a   
        Examples:

        ```python
        >>> from transformers import AutoProcessor, AutoModel
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
        >>> model = AutoModel.from_pretrained("microsoft/git-base")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> text = "this is an image of two cats"

        >>> inputs = processor(images=image, text=text, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        ```Nz:You must specify exactly one of input_ids or inputs_embeds)rN   r   r   )r   rQ   rK   r   r=  ).r      r{     rX   z#pixel_values must be of rank 4 or 5r<   )r>  r_   )rO   rP   r   r   )r[   r
   rN   r   r	   get_seq_lengthr>   r   r-   
zeros_likerG   ndimr  r&   r   r  appendr3  r  repeatr   	ones_likeonesr>  r_   rh   rx  r   rP   )r   r   rO   rQ   rS   rK   rP   r   r9  rc   r   embedding_outputrR   visual_features	frame_idxvisual_features_frameprojected_visual_featuresimage_token_type_idsextended_attention_maskcausal_maskr'   r|  s                         r3   r   zGitModel.forwardT  sw   L -t";<YZZ0*$++>O "#& "/59  ..0$335 # O$?IOOTUDVZ[D['*@@L??%'#9	 + 
 ))*:%))LVT#  A%"&"4"4 ;S #5 ###   ""a'"$!&|'9'9!'<!= BI,0,>,>$Q	1a%78Sk -? -'' * *T-H-H-SS)#**+@AB #())O"C !!FGG(,(>(>(O% )B(H(H %%a(,E,J,J1,MMqRS)%
  %yy*CEU)V\]^#(??3LTYT]T]#^_e#f "YY(<n'MSUVN)!&EOO<P,QSa+bhj!k(Y__Q-?1-D ',jj%%a(*@>CWCWXYCZ*Z]^*^_$**%,,'#
 #YY(?'PVXYN 1KK
 )3?4<<4
&+	4

 4
 '-??+;;
 	
r2   )NNNNNNNF)r)   r*   r+   r   r   r  rs   r  r  r   r   r   r-   r   r	   rH   r   r   r0   r   r   r   r   s   @r3   r  r  0  s    "&
&/0   *..2,0,0-1(,!%).F
<<$&F
 t+F
 llT)	F

 llT)F
 ||d*F
 F
 $;F
 #'F
 +,F
 
u||	9	9F
    F
r2   r  z`
    GIT Model with a `language modeling` head on top for autoregressive language modeling.
    c                       e Zd ZddiZ fdZd Zd Zeee		 	 	 	 	 	 	 	 	 	 dde
j                  dz  de
j                  dz  d	e
j                  dz  d
e
j                  dz  de
j                  dz  de
j                  dz  dedz  dedz  dedee
j                  z  dee   dee
j                     ez  fd                     Z	 	 	 	 	 d fd	Z xZS )GitForCausalLMzoutput.weightz%git.embeddings.word_embeddings.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        | j                          y r   )
rr   rs   r  r   r   r   rv   ru   r   r  r   s     r3   rs   zGitForCausalLM.__init__  sF     F#ii 2 2F4E4EF 	r2   c                     | j                   S r   r   r  s    r3   get_output_embeddingsz$GitForCausalLM.get_output_embeddings  s    {{r2   c                     || _         y r   r  )r   new_embeddingss     r3   set_output_embeddingsz$GitForCausalLM.set_output_embeddings  s	    $r2   Nr   rO   rQ   rS   rK   labelsrP   r   r9  logits_to_keeprc   r5   c                    |d} | j                   |f|||||||	d|}|j                  }t        |
t              rt	        |
 d      n|
}| j                  |dd|ddf         }d}|| j                   j                  j                  d   j                  j                  j                  }|dd|dddf   j                         }|ddddf   j                         } | j                  |j                  d| j                  j                        |j                  d      fd| j                  j                  i|}t!        |||j"                  |j$                  |j&                        S )	a0  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`

        Examples:

        Image captioning example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
        >>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> print(generated_caption)
        two cats sleeping on a pink blanket next to remotes.
        ```

        Visual question answering (VQA) example:

        ```python
        >>> from transformers import AutoProcessor, AutoModelForCausalLM
        >>> from huggingface_hub import hf_hub_download
        >>> from PIL import Image

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")

        >>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
        >>> image = Image.open(file_path).convert("RGB")

        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values

        >>> question = "what does the front of the bus say at the top?"

        >>> input_ids = processor(text=question, add_special_tokens=False).input_ids
        >>> input_ids = [processor.tokenizer.cls_token_id] + input_ids
        >>> input_ids = torch.tensor(input_ids).unsqueeze(0)

        >>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
        >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True))
        ['what does the front of the bus say at the top? special']
        ```

        Video captioning example:

        ```python
        >>> import av
        >>> import numpy as np
        >>> from PIL import Image
        >>> from huggingface_hub import hf_hub_download
        >>> from transformers import AutoProcessor, AutoModelForCausalLM

        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")

        >>> # set seed for reproducibility
        >>> np.random.seed(45)


        >>> def read_video_pyav(container, indices):
        ...     '''
        ...     Decode the video with PyAV decoder.
        ...     Args:
        ...         container (`av.container.input.InputContainer`): PyAV container.
        ...         indices (`list[int]`): List of frame indices to decode.
        ...     Returns:
        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
        ...     '''
        ...     frames = []
        ...     container.seek(0)
        ...     start_index = indices[0]
        ...     end_index = indices[-1]
        ...     for i, frame in enumerate(container.decode(video=0)):
        ...         if i > end_index:
        ...             break
        ...         if i >= start_index and i in indices:
        ...             frames.append(frame)
        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])


        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
        ...     '''
        ...     Sample a given number of frame indices from the video.
        ...     Args:
        ...         clip_len (`int`): Total number of frames to sample.
        ...         frame_sample_rate (`int`): Sample every n-th frame.
        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
        ...     Returns:
        ...         indices (`list[int]`): List of sampled frame indices
        ...     '''
        ...     converted_len = int(clip_len * frame_sample_rate)
        ...     end_idx = np.random.randint(converted_len, seg_len)
        ...     start_idx = end_idx - converted_len
        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
        ...     return indices


        >>> # load video
        >>> file_path = hf_hub_download(
        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
        ... )
        >>> container = av.open(file_path)

        >>> # sample frames
        >>> num_frames = model.config.num_image_with_embedding
        >>> indices = sample_frame_indices(
        ...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
        ... )
        >>> frames = read_video_pyav(container, indices)

        >>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values

        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)

        >>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))
        Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.']
        ```
        NF)rO   rQ   rS   rK   rP   r   r9  r   r<   r   ru   )losslogitsrP   r'   r(   )r   r&   r   rG   slicer   rx  r   r   r   r   r   loss_functionr   rN   ru   r   rP   r'   r(   )r   r   rO   rQ   rS   rK   r  rP   r   r9  r  rc   outputsr'   slice_indicesr  r  num_image_tokensshifted_logitss                      r3   r   zGitForCausalLM.forward  s~   l I+3488
,
)%%'+%=
,
 
,
  118B>SV8W~ot4]k]1mQ+>?@#xx//55a8BBGGZZ#A'7':A$=>IIKNAqrE]--/F%4%%##B(>(>?B  ;;11 	D &#33!//))
 	
r2   c                 D    t        	|   |f||||d|}|s|s||d<   |S )N)rP   rO   r   rU   rS   )rr   prepare_inputs_for_generation)
r   r   rP   rS   rO   r   rU   rc   model_inputsr   s
            r3   r  z,GitForCausalLM.prepare_inputs_for_generation  sI     w<
+)1
 
 Y+7L(r2   )
NNNNNNNNFr   )NNNNF)r)   r*   r+   _tied_weights_keysrs   r  r  r   r   r   r-   r   r	   rH   rG   r   r   r0   r   r   r  r   r   s   @r3   r  r    s`    *+RS%   *..2,0,0-1&*(,!%).-.z
<<$&z
 t+z
 llT)	z

 llT)z
 ||d*z
 t#z
 z
 $;z
 #'z
 ell*z
 +,z
 
u||	5	5z
    z
~   r2   r  )r  r  r   r~  )NNFN)r  )Ur,   r   collections.abcr   dataclassesr   r-   r    r   r	  activationsr   cache_utilsr	   r
   configuration_utilsr   
generationr   masking_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   pytorch_utilsr   utilsr   r   r   r   r   utils.deprecationr   utils.genericr   utils.output_capturingr   configuration_gitr    r!   
get_loggerr)   r   r$   r   rI   r.   rH   dictrh   r  rj   r   r   r   r   r   r   r   r   r   r  rE  floatrQ  rS  re  rn  ru  r~  r  r  r  __all__r1   r2   r3   <module>r     s     $ !   & ! . 3 ) 6 9  G & 6  1 7 5 9 
		H	% 	<; 	< 	<ELL X 6 ?K +/-1&*1414<<14 LL4'14 T\	14
 ,,%14 LL4'14 ##d*14 14 t14 
14 L14h*BII *ZB.ryy B.LBII   
 299  0bii  		 ) D
 
: i i i>P")) Pf299 . %II%<<% 
% <<	%
 LL4'% % %.6) 6)t6 D.
ryy .
b$
299 $
N 
5
' 5

5
p
2BII 
2 
h
! h

h
V 
i' i
iX Qr2   