
    i"                       d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z	ddl	m
Z
 ddlmZ dd	lmZ dd
lmZmZmZ ddlmZ ddlmZ ddlmZ ddlmZmZmZmZ ddlm Z m!Z! ddl"m#Z# ddl$m%Z%m&Z&m'Z'm(Z(m)Z)m*Z* ddl+m,Z, ddl-m.Z.m/Z/ ddl0m1Z1m2Z2m3Z3  e)jh                  e5      Z6e' G d de!             Z7dWde	jp                  de	jr                  de:dz  fdZ;	 dXde	jx                  de	jr                  de	jz                  de:fdZ>ee' G d  d!e                    Z?e e'd"#       G d$ d%e%                    Z@e e'd&#       G d' d(e%                    ZA G d) d*e
j                        ZC	 dYd+e
j                  d,e	jp                  d-e	jp                  d.e	jp                  d/e	jp                  dz  d0eDd1eDfd2ZE G d3 d4e
j                        ZF G d5 d6e
j                        ZG G d7 d8e      ZH G d9 d:e
j                        ZI G d; d<e7      ZJ G d= d>e
j                        ZK G d? d@e
j                        ZL G dA dBe
j                        ZM G dC dDe      ZN G dE dFe7      ZO G dG dHe7      ZP G dI dJe7      ZQ e'dK#       G dL dMe7e             ZR G dN dOe
j                        ZS e'dP#       G dQ dRe7             ZT e'dS#       G dT dUe7e             ZUg dVZVy)ZzPyTorch KOSMOS-2 model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)CacheDynamicCacheEncoderDecoderCache)GenerationMixin)FlashAttentionKwargs)GradientCheckpointingLayer)BaseModelOutput)BaseModelOutputWithPastAndCrossAttentionsBaseModelOutputWithPooling!CausalLMOutputWithCrossAttentions)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tuplelogging	torch_int)merge_with_config_defaults)OutputRecordercapture_outputs   )Kosmos2ConfigKosmos2TextConfigKosmos2VisionConfigc                   |    e Zd ZU eed<   dZdZddgZdZdZ	dZ
 ej                         dej                  fd       Zy	)
Kosmos2PreTrainedModelconfig)imagetextTKosmos2VisionEncoderLayerKosmos2TextBlockFmodulec                    t        | j                  d      r| j                  j                  }n6t        | j                  d      r | j                  j                  j                  }t        | j                  d      r| j                  j                  }n6t        | j                  d      r | j                  j
                  j                  }t        |t              rt        j                  |j                  d|j                  dz  z         t        j                  |j                  j                  |j                  j                  |z         t        j                  |j                  j                  |j                  j                  |z         t        j                   |j"                  t%        j&                  |j"                  j(                  d	         j+                  d
             nt        |t,              r|j                  dz  d|j                  j.                  z  dz  z  z  }|j                  dz  |z  }t        j                  |j0                  j                  |       t        j                  |j2                  j                  |       t        j                  |j4                  j                  |       t        j                  |j6                  j                  |       nt        |t8              r|j                  j:                  dz  d|j                  j.                  z  dz  z  z  }d|j                  j:                  z  dz  |z  }t        j                  |j<                  j                  |       t        j                  |j>                  j                  |       nt        |t@              rt        j                  |j0                  j                         t        j                  |j2                  j                  |       t        j                  |j4                  j                  |       t        j                  |j6                  j                  |       n\t        |tB              rXt        j                  |j<                  j                         t        j                  |j>                  j                  |       nt        |tD              r-t        j                  |jF                  j                         nt        |tH              rLt        j                  |jJ                  j                         t        j                  |jL                         n[t        |tN              rt        j                  |jP                  j                  d       |jP                  jR                  t        jT                  |jP                  j                  |jP                  jR                            nt        |tV        jX                        r?t        jZ                  |j                         t        jT                  |j\                         nnt        |t^              r^|ja                  |jb                  |jd                  z   |jf                  |jR                        }t        j                   |jh                  |       t        |tV        jj                        r-|j\                   t        jT                  |j\                         yyy)zInitialize the weightsinitializer_factorvision_configinit_stdtext_config              )meanstd)r4   r    r5      N)6hasattrr&   r-   r.   r/   r0   
isinstanceKosmos2VisionEmbeddingsinitnormal_class_embedding	embed_dimpatch_embeddingweightinitializer_rangeposition_embeddingcopy_position_idstorcharangeshapeexpandKosmos2VisionAttentionnum_hidden_layersq_projk_projv_projout_projKosmos2VisionMLPhidden_sizefc1fc2KosmosTextAttentionKosmos2TextFFNKosmos2TextForCausalLMlm_headKosmos2ImageToTextProjectiondenselatent_queryKosmos2TextTransformerembed_tokenspadding_idxzeros_r   	LayerNormones_bias(Kosmos2TextSinusoidalPositionalEmbeddingget_embeddingnum_positionsoffsetembedding_dimweightsLinear)selfr+   factorr4   in_proj_stdout_proj_stdfc_stdemb_weightss           }/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/kosmos2/modeling_kosmos2.py_init_weightsz$Kosmos2PreTrainedModel._init_weights:   s    4;; 45[[33FT[[/2[[..AAF4;;
+++&&CT[[-0++))22Cf56LL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 67!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B 01!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**< 34LL--37LL--37LL--37LL//S9/LL**4LL**4 67LL..C8 <=LL,,#6LL,,- 67LL,,33#3G""..:F//66v7J7J7V7VWX-JJv}}%KK$ HI ..$$v}}4f6J6JFL^L^K JJv~~{3fbii(V[[-DKK$ .E(    N)__name__
__module____qualname__r!   __annotations__input_modalitiessupports_gradient_checkpointing_no_split_modules_supports_attention_backend_supports_flash_attn_supports_sdparE   no_gradr   Modulero    rp   rn   r%   r%   0   sV    (&*#46HI"& NU]]_8%BII 8% 8%rp   r%   maskdtypetgt_lenc                 2   | j                         \  }}||n|}| ddddddf   j                  |d||      j                  |      }d|z
  }|j                  |j                  t        j
                        t	        j                  |      j                        S )z_
    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
    Nr          ?)sizerH   tomasked_fillrE   boolfinfomin)r~   r   r   bszsrc_lenexpanded_maskinverted_masks          rn   _expand_maskr   v   s     99;LC ,g'GD$)*11#q'7KNNuUM-'M$$]%5%5ejj%A5;;uCUCYCYZZrp   input_ids_shapedevicepast_key_values_lengthc                    | \  }}t        j                  ||ft        j                  |      j                  |      }t        j                  |j                  d      |      }|j                  ||dz   j                  |j                  d      d      k  d       |j                  |      }|dkD  r0t        j                  t        j                  ||||      |gd      }|ddddddf   j                  |d|||z         S )zB
    Make causal mask used for bi-directional self-attention.
    )r   r5   r    r   r   r   dimN)rE   fullr   r   rF   r   masked_fill_viewr   catzerosrH   )r   r   r   r   r   r   r~   	mask_conds           rn   _make_causal_maskr      s     #LC::w(%++e*<*@*@PDTYYr]6:Ii9q="6"6tyy}a"HH!L775>D!yy%++g/EU[abdhioqrdAq !((a'DZ:Z[[rp   c                   @    e Zd ZU dZdZeej                     dz  ed<   y)'BaseModelOutputWithProjectionAttentionsaq  
    projection_attentions (`tuple(torch.FloatTensor)`):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    Nprojection_attentions)	rq   rr   rs   __doc__r   tuplerE   FloatTensorrt   r}   rp   rn   r   r      s%     >B5!2!23d:Arp   r   ze
    Base class for text model's outputs that also contains a pooling of the last hidden states.
    )custom_introc                       e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
eej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   dZeej                     dz  ed<   dZeed	<   d
ee   fdZy)Kosmos2ModelOutputa  
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    Nlast_hidden_statepast_key_valueshidden_states
attentionsimage_embedsr   vision_model_outputreturnc                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw)text_model_outputr   Ngetattrto_tuple.0krh   s     rn   	<genexpr>z.Kosmos2ModelOutput.to_tuple.<locals>.<genexpr>   =      
  LLDGRYZ^`aRbRkRkRmm
   -0r   keysrh   s   `rn   r   zKosmos2ModelOutput.to_tuple   #     
YY[
 
 	
rp   )rq   rr   rs   r   r   rE   r   rt   r   r
   r   r   r   r   r   r   r   r   r   r}   rp   rn   r   r      s    & 37u((4/6$(OUT\(59M5**+d2926Je''(4/6-1L%##d*1=A5!2!23d:A6:3:
%* 
rp   r   zC
    Model output class for `Kosmos2ForConditionalGeneration`.
    c                   H   e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   dZeej                     dz  ed	<   dZeed
<   dee   fdZy)*Kosmos2ForConditionalGenerationModelOutputa*  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
        input) to speed up sequential decoding.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
        Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
    projection_attentions (`tuple(torch.FloatTensor)`, *optional*):
        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
        sequence_length)`.

        Attentions weights given by `Kosmos2ImageToTextProjection`, after the attention softmax, used to compute
        the weighted average in the self-attention heads.
    vision_model_output (`BaseModelOutputWithPooling`, *optional*):
        The output of the [`Kosmos2VisionModel`].
    Nlosslogitsr   r   r   r   r   r   r   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) ywr   r   r   s     rn   r   zFKosmos2ForConditionalGenerationModelOutput.to_tuple.<locals>.<genexpr>   r   r   r   r   s   `rn   r   z3Kosmos2ForConditionalGenerationModelOutput.to_tuple   r   rp   )rq   rr   rs   r   r   rE   r   rt   r   r   r
   r   r   r   r   r   r   r   r   r   r}   rp   rn   r   r      s    . &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/6-1L%##d*1=A5!2!23d:A6:3:
%* 
rp   r   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )r:   r&   c                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestrider`   r7   r    rD   r6   
persistent)super__init__r&   rP   r>   
image_size
patch_sizer   	ParameterrE   randnr=   Conv2dnum_channelsr?   num_patchesrc   	EmbeddingrB   register_bufferrF   rH   rh   r&   	__class__s     rn   r   z Kosmos2VisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]joprp   
embeddingsheightwidthr   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r    r   Nr5   g      ?r   r7   bicubicF)r   modealign_cornersr   )rG   rB   r@   	unsqueezerE   jit
is_tracingrD   r   r   reshapepermuter   
functionalinterpolater   r   )rh   r   r   r   r   rB   rc   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                rn   interpolate_pos_encodingz0Kosmos2VisionEmbeddings.interpolate_pos_encoding  sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCrp   pixel_valuesc                 `   |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  j                  j
                  }| j                  |j                  |            }|j                  d      j                  dd      }| j                  j                  |dd      }	t        j                  |	|gd	      }
|r|
| j                  |
||      z   }
|
S |
| j                  | j                        z   }
|
S )
NzInput image size (*z) doesn't match model ().r   r7   r    r5   r   )rG   r   
ValueErrorr?   r@   r   r   flatten	transposer=   rH   rE   r   r   rB   rD   )rh   r   r   
batch_size_r   r   target_dtypepatch_embedsclass_embedsr   s              rn   forwardzKosmos2VisionEmbeddings.forward<  s6   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++2288++LOO,O,OP#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJrp   F)rq   rr   rs   r#   r   rE   Tensorintr   r   r   __classcell__r   s   @rn   r:   r:      se    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Z_ZfZf rp   r:   r+   querykeyvalueattention_maskscalingdropoutc                 p   t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |d      }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr5   r   ptrainingr    r7   )	rE   matmulr   r   r   softmaxr  r	  
contiguous)
r+   r   r   r  r  r  r  kwargsattn_weightsattn_outputs
             rn   eager_attention_forwardr  P  s     <<s}}R'<=GL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$rp   c                        e Zd ZdZ fdZ	 d	dej                  dej                  dz  dee   de	ej                  ej                  dz  f   fdZ
 xZS )
rI   =Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )N;embed_dim must be divisible by num_heads (got `embed_dim`:  and `num_heads`: r   r2   F)r   r   r&   rP   r>   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr  	is_causalr   rg   rL   rM   rK   rN   r   s     rn   r   zKosmos2VisionAttention.__init__i  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Arp   Nr   r  r  r   c                    |j                   dd }g |d| j                  }| j                  |      }| j                  |      }| j	                  |      }|j                  |      j                  dd      }|j                  |      j                  dd      }|j                  |      j                  dd      }t        j                  | j                  j                  t              }	 |	| ||||f| j                  | j                  | j                  sdn| j                  d|\  }
} |
j                   g |d j#                         }
| j%                  |
      }
|
|fS )#Input shape: Batch x Time x ChannelNr5   r    r7   r1   )r  r  r  )rG   r  rK   rL   rM   r   r   r   get_interfacer&   _attn_implementationr  r  r  r	  r  r   r  rN   )rh   r   r  r  input_shapehidden_shapequeriesr   valuesattention_interfacer  r  s               rn   r   zKosmos2VisionAttention.forward}  sV    $))#2.88b8$--8++m,{{=)]+,,|,66q!<yy&00A6\*44Q:(?(M(MKK,,.E)
 %8
%
 nnJJ#}}C$,,
%
 
%
!\ *k));;;;FFHmmK0L((rp   N)rq   rr   rs   r   r   rE   r   r   r   r   r   r   r   s   @rn   rI   rI   f  sf    GB. /3%)||%) t+%) +,	%)
 
u||U\\D00	1%)rp   rI   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )rO   c                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r%  )r   r   r&   r	   
hidden_actactivation_fnr   rg   rP   intermediate_sizerQ   rR   r   s     rn   r   zKosmos2VisionMLP.__init__  sd    #F$5$5699V//1I1IJ99V55v7I7IJrp   r   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r%  )rQ   r)  rR   rh   r   s     rn   r   zKosmos2VisionMLP.forward  s4    /**=9/rp   )rq   rr   rs   r   rE   r   r   r   r   s   @rn   rO   rO     s$    KU\\ ell rp   rO   c                        e Zd Zdef fdZdej                  dej                  dee   de	ej                  ej                  dz  f   fdZ xZS )	r)   r&   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y Neps)r   r   rP   r>   rI   	self_attnr   r^   layer_norm_epslayer_norm1rO   mlplayer_norm2r   s     rn   r   z"Kosmos2VisionEncoderLayer.__init__  sm    ++/7<<F<Q<QR#F+<<F<Q<QRrp   r   r  r  r   Nc                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r   r  r}   )r4  r2  r6  r5  )rh   r   r  r  residualr   s         rn   r   z!Kosmos2VisionEncoderLayer.forward  s     !((7)4>> 
')
 
q
 !=0 ((7/ =0rp   )rq   rr   rs   r#   r   rE   r   r   r   r   r   r   r   r   s   @rn   r)   r)     sd    S2 S||  +,	
 
u  %,,"55	6rp   r)   c                   f     e Zd ZdZdef fdZ	 d	dej                  dz  dee	   de
ez  fdZ xZS )
Kosmos2VisionEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Kosmos2VisionEncoderLayer`].

    Args:
        config: Kosmos2VisionConfig
    r&   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
r   r   r&   r   
ModuleListrangerJ   r)   layersgradient_checkpointing)rh   r&   r   r   s      rn   r   zKosmos2VisionEncoder.__init__  sQ    mmPUV\VnVnPo$p1%>v%F$pq&+# %qs   A#Nr  r  r   c                 T    |}| j                   D ]  } |||fi |} t        |      S )a7  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
        )r   )r?  r   )rh   inputs_embedsr  r  r   encoder_layers         rn   r   zKosmos2VisionEncoder.forward  sH    ( &![[ 	M) M	 7+
 	
rp   r%  )rq   rr   rs   r   r#   r   rE   r   r   r   r   r   r   r   r   s   @rn   r:  r:    sQ    ,2 , /3
 t+
 +,	

 
	 
rp   r:  c                        e Zd ZeedZdef fdZe e	d      e
	 	 ddej                  dz  ded	ee   d
efd                     Z xZS )Kosmos2VisionTransformer)r   r   r&   c                 4   t         |   |       |j                  }t        |      | _        t        j                  ||j                        | _        t        |      | _
        t        j                  ||j                        | _        | j                          y r/  )r   r   rP   r:   r   r   r^   r3  pre_layrnormr:  encoderpost_layernorm	post_init)rh   r&   r>   r   s      rn   r   z!Kosmos2VisionTransformer.__init__  so     &&	1&9LL8M8MN+F3 ll9&:O:OPrp   F)tie_last_hidden_statesNr   r   r  r   c                     |t        d      | j                  ||      }| j                  |      } | j                  dd|i|}|d   }|d d dd d f   }| j	                  |      }t        ||      S )Nz You have to specify pixel_values)r   rB  r   )r   pooler_outputr}   )r   r   rG  rH  rI  r   )rh   r   r   r  r   encoder_outputsr   pooled_outputs           rn   r   z Kosmos2VisionTransformer.forward  s     ?@@Ogh))-8&$,, 
'


 ,A.)!Q'2++M:)/'
 	
rp   r<  )rq   rr   rs   r)   rI   _can_record_outputsr#   r   r   r   r   rE   r   r   r   r   r   r   r   r   s   @rn   rE  rE    s    2,
	2 	  E2 26).
''$.
 #'
 +,	

 
$
  3  
rp   rE  c                   4    e Zd ZdZddedededz  f fdZddedededz  fdZeddedededz  fd	       Z e	j                         	 	 	 	 dd
e	j                  dz  de	j                  dz  dede	j                  dz  fd       Zed        Zedd       Z xZS )ra   zDThis module produces sinusoidal positional embeddings of any length.Nrc   re   r\   c                     t         |           d| _        || _        || _        || _        | j                  || j                  z   ||       y )Nr7   )r   r   rd   rc   re   r\   make_weights)rh   rc   re   r\   r   s       rn   r   z1Kosmos2TextSinusoidalPositionalEmbedding.__init__<  sH    **&-$++5}kRrp   num_embeddingsc                     | j                  |||      }t        | d      r;|j                  | j                  j                  | j                  j
                        }| j                  d|d       y )Nrf   r   Fr   )rb   r8   r   rf   r   r   r   )rh   rT  re   r\   rm   s        rn   rS  z5Kosmos2TextSinusoidalPositionalEmbedding.make_weightsE  s[    ((T4#%..t||/A/A$,,J]J].^KYFrp   c                    |dz  }t        j                  d      |dz
  z  }t        j                  t        j                  |t        j
                        j                         | z        }t        j                  | t        j
                        j                         j                  d      |j                  d      z  }t        j                  t        j                  |      t        j                  |      gd      j                  | d      }|dz  dk(  r-t        j                  |t        j                  | d      gd      }|	d||ddf<   |j                  t        j                               S )	z
        Build sinusoidal embeddings.

        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
        "Attention Is All You Need".
        r7   i'  r    r   r   r   r5   N)mathlogrE   exprF   int64floatr   r   sincosr   r   r   get_default_dtype)rT  re   r\   half_dimembs        rn   rb   z6Kosmos2TextSinusoidalPositionalEmbedding.get_embeddingM  s    !A%hhuoA.iiXU[[AGGISDPQll>=CCEOOPQRUXUbUbcdUeeii338a@EEnVXY1!))S%++na"@AqIC""#CQvve--/00rp   	input_idsrB  r   rD   c                    |L|j                         \  }}|l| j                  || j                  |      j                  |j                        }n5|j                         d d \  }}|| j                  ||| j                        }| j                  dz   |z   |z   }|| j                  j                  d      kD  r4| j                  || j                  z   | j                  | j                         | j                  j                  d|j                  d            j                  ||| j                  j                  d         j                         S )Nr5   r    r   )r   "create_position_ids_from_input_idsr\   r   r   &create_position_ids_from_inputs_embedsrf   rS  rd   re   index_selectr   rG   detach)rh   ra  rB  r   rD   r   seq_lenmax_poss           rn   r   z0Kosmos2TextSinusoidalPositionalEmbedding.forwardc  s8     $>>+LC##FFt//1G "Y%%&  )--/4LC##JJ!#94;K;K 
 ""Q&03IIT\\&&q))g3T5G5GIYIYZ||((L,=,=b,ABGGWVZVbVbVhVhikVlmttvvrp   c                    | j                         dd }|d   }t        j                  |dz   ||z   dz   t        j                  | j                        }|j                  d      j                  |      j                         |z   S )z
        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.

        Args:
            inputs_embeds: torch.Tensor

        Returns: torch.Tensor
        Nr5   r    r   r   )r   rE   rF   longr   r   rH   r  )rB  r   r\   r   sequence_lengthrD   s         rn   rd  zOKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_inputs_embeds  s     $((*3B/%a.||!O_{:Q>ejjYfYmYm
 %%a(//<GGILbbbrp   c                     | j                  |      j                         }t        j                  |d      j	                  |      |z   |z  }|j                         |z   S )a  
        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
        are ignored. This is modified from fairseq's `utils.make_positions`.

        Args:
            x: torch.Tensor x:

        Returns: torch.Tensor
        r    r   )ner   rE   cumsumtype_asrj  )ra  r\   r   r~   incremental_indicess        rn   rc  zKKosmos2TextSinusoidalPositionalEmbedding.create_position_ids_from_input_ids  sW     ||K(,,.$||Da8@@FI__cgg"'')K77rp   r%  )NNr   Nr   )rq   rr   rs   r   r   r   rS  staticmethodrb   rE   r{   r   r   rd  rc  r   r   s   @rn   ra   ra   8  s   NSc S# SCRVJ SG3 Gs GQTW[Q[ G 1c 1# 1CRVJ 1 1( U]]_ *.-1&',0w<<$&w ||d*w !$	w
 llT)w w8 c c" 8 8rp   ra   c                       e Zd ZdZ	 	 	 	 	 ddededededz  dedz  dedz  d	edz  f fd
Z	 	 	 ddej                  dej                  dz  de
dz  dej                  dz  deej                  ej                  dz  e
dz  f   f
dZ xZS )rS   r  Nr>   r  r  
is_decoderadd_inner_attn_layernormr`   	layer_idxc	                 x   t         	|           || _        || _        || _        || _        ||z  | _        d| _        | j                  |z  | j                  k7  rt        d| j                   d| d      | j                  dz  | _	        || _
        || _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        t        j                  |||      | _        d | _        |r't        j&                  ||j(                        | _        y y )NTr  r  r   r2   )r`   r0  )r   r   r&   r>   r  r  r  r  r   r  rt  rv  r   rg   rL   rM   rK   rN   inner_attn_lnr^   r3  )
rh   r&   r>   r  r  rt  ru  r`   rv  r   s
            rn   r   zKosmosTextAttention.__init__  s    	""!Y.MMI%$..8MdnnM]$YKr3  }}d*$"ii	94@ii	94@ii	94@		)YTB "#!#iV=R=R!SD $rp   r   encoder_hidden_statesr   r  r   c                    |du}|j                   dd }g |d| j                  }| j                  |      }	|	j                  |      j	                  dd      }	d}
|St        |t              rA|j                  j                  | j                        }
|r|j                  }n|j                  }n|}|r|n|}|rK|I|
rGj                  | j                     j                  }|j                  | j                     j                  }ng |j                   dd d| j                  }| j                  |      j                  |      j	                  dd      }| j!                  |      j                  |      j	                  dd      }|Kj#                  ||| j                        \  }}|r)t        |t              rd|j                  | j                  <   t%        j&                  | j(                  j*                  t,              } || |	|||f| j.                  sdn| j0                  | j2                  d|\  }} |j4                  g |d j7                         }| j8                  | j9                  |      }| j;                  |      }||fS )	r  Nr5   r    r7   FTr1   )r  r  )rG   r  rK   r   r   r9   r   
is_updatedgetrv  cross_attention_cacheself_attention_cacher?  r   r#  rL   rM   updater   r  r&   r  r  r	  r  r  r   r  rx  rN   )rh   r   ry  r   r  r  is_cross_attentionr   r!  query_statesr{  curr_past_key_valuescurrent_states
key_statesvalue_stateskv_shaper$  r  r  s                      rn   r   zKosmosTextAttention.forward  s{    3$>#))#2.88b8$--8{{=1#((6@@AF
&/+>?,77;;DNNK
%+:+P+P(+:+O+O('6$2D.-/"=*-44T^^DIIJ/66t~~FMMLF--cr2FBFFH^499(CMMaQRSJ;;~6;;HEOOPQSTUL*+?+F+FzS_aeaoao+p(
L%*_FY*ZAEO..t~~>(?(M(MKK,,.E)
 %8	%
  $}}C$,,LL	%
 	%
!\ *k));;;;FFH),,[9KmmK0L((rp   )r1   FFTN)NNN)rq   rr   rs   r   r   r[  r   r   rE   r   r
   r   r   r   r   s   @rn   rS   rS     s    G "'05 !%$T $T 	$T
 $T 4K$T #'+$T Tk$T $;$TR 6:(,.2E)||E)  %||d2E) 	E)
 t+E) 
u||U\\D0%$,>	?E)rp   rS   c                   *     e Zd Zdef fdZd Z xZS )rT   r&   c                    t         |           |j                  | _        t        |j                     | _        |j                  | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        t        j                  |j                  |j                        | _        y r/  )r   r   r  r	   activation_functionr)  activation_dropoutr   rg   r>   ffn_dimrQ   rR   r^   r3  ffn_layernormr   s     rn   r   zKosmos2TextFFN.__init__  s    ~~#F$>$>?"(";";99V--v~~>99V^^V-=-=>\\&..f>S>STrp   c                 b   | j                  | j                  |            }t        j                  j	                  || j
                  | j                        }| j                  |      }| j                  |      }t        j                  j	                  || j                  | j                        }|S )Nr  )	r)  rQ   r   r   r  r  r	  r  rR   r,  s     rn   r   zKosmos2TextFFN.forward$  s    **488M+BC--mt?V?Vaeanan-o**=9/--mt||VZVcVc-drp   )rq   rr   rs   r"   r   r   r   r   s   @rn   rT   rT     s    
U0 
Urp   rT   c                       e Zd Zddef fdZ	 	 	 	 	 ddej                  dej                  dz  dej                  dz  dej                  dz  dedz  d	edz  d
e	ej                  e	ej                  ej                  f   dz  f   fdZ xZS )r*   Nr&   c           	         t         |           |j                  | _        t        || j                  |j                  |j
                  dd|      | _        |j                  | _        t        j                  | j                  |j                        | _        |j                  ret        || j                  |j                  |j
                  dd|      | _        t        j                  | j                  |j                        | _        t        |      | _        t        j                  | j                  |j                        | _        y )NT)r>   r  r  rt  ru  rv  r0  F)r   r   r>   rS   attention_headsr  r2  r  r   r^   r3  self_attn_layer_normadd_cross_attentionencoder_attnencoder_attn_layer_normrT   ffnfinal_layer_norm)rh   r&   rv  r   s      rn   r   zKosmos2TextBlock.__init__/  s    )),nn,,,,%)
 ~~$&LLVEZEZ$[!%% 3.. 0000).#!D ,.<<FLaLa+bD(!&) "T^^AVAV Wrp   r   r  ry  encoder_attention_maskr   output_attentionsr   c           	      4   |}| j                  |      } | j                  d||||d|\  }}	t        j                  j	                  || j                  | j
                        }||z   }|t        | d      st        d|  d      |}| j                  |      } | j                  d|||||d|\  }}	t        j                  j	                  || j                  | j
                        }||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r   r   r  r  r  r  z'If `encoder_hidden_states` are passed, z` has to be instantiated with cross-attention layers by setting `config.add_cross_attention=True`)r   ry  r  r   r  r}   )r  r2  r   r   r  r	  r8   r   r  r  r  r  )
rh   r   r  ry  r  r   r  r  r8  r   s
             rn   r   zKosmos2TextBlock.forwardN  sc    !11-@)4>> 
'+)/	

 
q --mt||VZVcVc-d =0 !,40 =dV DD D 
 %H 88GM0t00  +&;5 /"3   M1 MM11-4<<Z^ZgZg1hM$}4M !--m< / =0rp   r%  )NNNNF)rq   rr   rs   r"   r   rE   r   r
   r   r   r   r   r   r   s   @rn   r*   r*   .  s    X0 XD /3596:(,).6||6 t+6  %||d2	6
 !&t 36 6  $;6 
u  %(9(95;L;L(L"MPT"TT	U6rp   r*   c            #       r    e Zd ZU eed<   dZe eedd       eedd      dZ	def fdZ
d	 Z	 	 	 	 	 ddej                  d
z  dej                  d
z  dej                  d
z  dedej                  d
z  f
dZeee	 	 	 	 	 	 	 	 	 	 	 	 	 ddej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  dej                  d
z  ded
z  dej                  d
z  dej                  d
z  ded
z  ded
z  ded
z  ded
z  dee   deez  fd                     Z xZS ) rZ   r&   r(   r    r2  )index
layer_namer  )r   r   cross_attentionsc           	         t         |   |       |j                  | _        |j                  | _        |j                  rt        j                  |j                        nd| _        t        j                  |j                  |j                  |j                        | _        t        |j                  |j                  |j                        | _        t        j"                  t%        |j&                        D cg c]  }t)        ||       c}      | _        t        j*                  |j                  |j,                        | _        d| _        | j3                          y c c}w )Nr   )r\   )rc   re   r\   )rv  F)r   r   r  	layerdropscale_embeddingrW  sqrtr>   embed_scaler   r   
vocab_sizepad_token_idr[   ra   max_position_embeddingsembed_positionsr=  r>  r?  r*   r^   r3  
layer_normr@  rJ  )rh   r&   ir   s      rn   r   zKosmos2TextTransformer.__init__  s    ~~)):@:P:P499V%5%56VYLL):):F<L<LZ`ZmZmnG 88 **++ 
 mmTYZ`ZgZgTh$iq%5f%J$ij,,v'7'79N9NO&+# %js   7Ec                     d }|d   dkD  r#t        ||j                  |j                  |      }|=t        ||j                  |d         j	                  |j                        }||n||z   }|S )Nr5   r    )r   r   r   )r   r   r   r   r   )rh   r  r   rB  r   combined_attention_maskexpanded_attn_masks          rn   _prepare_decoder_attention_maskz6Kosmos2TextTransformer._prepare_decoder_attention_mask  s     #'r?Q&7##$++'=	'# %!-nm>Q>Q[fgi[j!k!n!n$$" '>&E"K]`wKw $ '&rp   NrB  r   img_input_maskr   rD   c                    || j                  |      }|[|j                  |j                        j                  d|j	                  d            ||j                  t
        j                        <   || j                  z  }| j                  ||||      }|j                  |j                        }||z   }t        j                  j                  || j                  | j                        }|S )Nr5   r   )ra  rB  r   rD   r  )r[   r   r   r   r   rE   r   r  r  r   r   r  r	  )	rh   ra  rB  r   r  r   rD   	positionsr   s	            rn   forward_embeddingz(Kosmos2TextTransformer.forward_embedding  s       --i8M#AMQ^QeQeAfAkAkL%%b)BM.++%**+=> &(8(88 (('#9%	 ) 
	 LL!5!56	%	1--mt||VZVcVc-drp   ra  r  image_embeds_position_maskry  r  r   	use_cacher  output_hidden_statesreturn_dictr  r   c           	         ||t        d      |"|j                  }|j                  d|d         }n!||j                         dd }nt        d      |
rd|b|| j                  j
                  r4t        t        | j                        t        | j                              nt        | j                        }||j                         nd}|dkD  rd}d}| j                  ||||||	      }| j                  ||||      }||t        ||j                  |d         }t        j                  j                  || j                  | j                   	      }| j"                  D ]C  }| j                   r%t%        j&                  g       }|| j(                  k  r4 ||||f||||
d
|}E | j+                  |      }t-        ||      S )N  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        NzDYou cannot specify both input_ids and inputs_embeds at the same timer5   z5You have to specify either input_ids or inputs_embeds)r&   r   )ra  rB  r   r  r   rD   r  r  )r  r   r  r  )r   r   )r   rG   r   r   r&   is_encoder_decoderr   r   get_seq_lengthr  r  r   r   r   r   r  r	  r?  rE   randr  r  r   )rh   ra  r  r   r  ry  r  r   rB  rD   r  r  r  r  r  r   r   r   decoder_layerdropout_probabilitys                       rn   r   zKosmos2TextTransformer.forward  s   <  ]%>cdd"#//K!r;r?;I&',,.s3KTUU0 )48V8V $L$DlZ^ZeZeFfg!5  FUE`!?!?!Afg "A%L)-&..'%5#9% / 
 ==K8N

 !,1G1S%12H-J]J]grsugv%w"--mt||VZVcVc-d![[ 	M}}&+jjn#&7)%	 (> /"3#	 	M	& 68++
 	
rp   )NNNr   N)NNNNNNNNNNNNN)rq   rr   rs   r"   rt   ru   r*   r   rS   rP  r   r  rE   r   r   r  r   r   r   r
   r   r   r   r   r   r   r   r   s   @rn   rZ   rZ     s    )$%8kZ*+>aTbc0 *'4 .2,0.2&',0! ||d*! llT)	!
 t+! !$! llT)!F   *..2,0:>596:(,-1,0!%)-,0#'_
<<$&_
 t+_
 llT)	_

 %*LL4$7_
  %||d2_
 !&t 3_
 _
 ||d*_
 llT)_
 $;_
  $;_
 #Tk_
 D[_
 -._
  
:	:!_
    _
rp   rZ   c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
ee	 	 ddej                  dz  ded	ee   deez  fd
              Z xZS )Kosmos2VisionModelr&   r   )r'   c                 d    t         |   |       t        |      | _        | j	                          y r%  )r   r   rE  modelrJ  r   s     rn   r   zKosmos2VisionModel.__init__J  s&     -f5
rp   r   c                 B    | j                   j                  j                  S r%  )r  r   r?   r   s    rn   get_input_embeddingsz'Kosmos2VisionModel.get_input_embeddingsQ  s    zz$$444rp   Nr   r  c                 ,     | j                   d||d|S )N)r   r   r}   r  )rh   r   r   r  s       rn   r   zKosmos2VisionModel.forwardT  s,     tzz 
%%=
 
 	
rp   r<  )rq   rr   rs   r#   rt   main_input_nameru   r   r   r|   r  r   r   rE   r   r   r   r   r   r   r   r   r   s   @rn   r  r  D  s    $O!2 5bii 5  26).

''$.

 #'

 +,	


 
8	8

  

rp   r  c                       e Zd ZU eed<   dZdef fdZdej                  fdZ	e
e	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej                  dz  d	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  dedz  dee   deez  fd              Z xZS )Kosmos2TextModelr&   r  c                 d    t         |   |       t        |      | _        | j	                          y r%  )r   r   rZ   r  rJ  r   s     rn   r   zKosmos2TextModel.__init__g  s&     +F3
rp   r   c                 .    | j                   j                  S r%  r  r[   r   s    rn   r  z%Kosmos2TextModel.get_input_embeddingsm      zz&&&rp   Nra  r  r   r  ry  r  r   rB  rD   r  r  c                 <     | j                   d|||||||||	|
d
|S )r  
ra  r  r   r  ry  r  r   rB  rD   r  r}   r  )rh   ra  r  r   r  ry  r  r   rB  rD   r  r  s               rn   r   zKosmos2TextModel.forwardp  sD    4 tzz 
)%'A"7#9+'%
 
 	
rp   )
NNNNNNNNNN)rq   rr   rs   r"   rt   ru   r   r   r|   r  r   r   rE   r   r
   r   r   r   r   r   r   r   r   s   @rn   r  r  c  s:    0 'bii '  *..2,0:>596:(,-1,0!%$
<<$&$
 t+$
 llT)	$

 %*LL4$7$
  %||d2$
 !&t 3$
 $
 ||d*$
 llT)$
 $;$
 +,$
 
:	:$
  $
rp   r  z
    The text model from KOSMOS-2 with a language modeling head on top (linear layer with weights tied to the input
    embeddings).
    c                        e Zd ZU eed<   ddiZdef fdZdej                  fdZ	dej                  fdZ
ee	 	 	 	 	 	 	 	 	 	 	 	 dd	ej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedz  dej                  dz  dej                  dz  dej                   dz  dedz  deej                  z  dee   deez  fd              Z	 	 	 	 	 	 	 d fd	Z xZS )rU   r&   zlm_head.weightzmodel.embed_tokens.weightc                     t         |   |       t        |      | _        t	        j
                  |j                  |j                  d      | _        | j                          y )NF)in_featuresout_featuresr`   )
r   r   rZ   r  r   rg   r>   r  rV   rJ  r   s     rn   r   zKosmos2TextForCausalLM.__init__  sI     +F3
yyV-=-=FL]L]dij 	rp   r   c                 .    | j                   j                  S r%  r  r   s    rn   r  z+Kosmos2TextForCausalLM.get_input_embeddings  r  rp   c                     | j                   S r%  )rV   r   s    rn   get_output_embeddingsz,Kosmos2TextForCausalLM.get_output_embeddings  s    ||rp   Nra  r  r   r  ry  r  r   rB  rD   labelsr  logits_to_keepr  c                    |
|rt         j                  d       d} | j                  d|||||||||	|d
|}|j                  }t	        |t
              rt        | d      n|}| j                  |dd|ddf         }d}|
* | j                  d||
| j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )a  
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
        NzJThe `use_cache` argument is changed to `False` since `labels` is provided.Fr  )r   r  r  )r   r   r   r   r   r  r}   )loggerwarningr  r   r9   r   slicerV   loss_functionr&   r  r   r   r   r   r  )rh   ra  r  r   r  ry  r  r   rB  rD   r  r  r  r  outputsr   slice_indicesr   r   s                      rn   r   zKosmos2TextForCausalLM.forward  s
   @ klI=GTZZ >
)%'A"7#9+'%>
 >
  118B>SV8W~ot4]kmA}a,?@A%4%%pVFt{{OeOepiopD0#33!//))$55
 	
rp   c	                    |s|rd }d }n|||j                         d d n|j                         \  }
}|j                         d   }t        j                  |t        j                  |
||z
  ft        j                  |j
                        fd      }t        |   |f|||||||d|	}|j                  dd        |S )Nr5   )r   r   r   r    r   )r   r  r   r  rB  r  is_first_iterationrD   )	r   rE   r   r   r   r   r   prepare_inputs_for_generationpop)rh   ra  r   r  r   r  rB  r  r  model_kwargsr   rg  mask_lenmodel_inputsr   s                 rn   r  z4Kosmos2TextForCausalLM.prepare_inputs_for_generation  s    $ "iL)-& (3?L?X-"4"4"6s";^l^q^q^sJ1668<H)..KKj'H2D%EUZZ`i`p`pq *& w<

+)%'A'1

 

 	.rp   )NNNNNNNNNNNr   )NNNNNNF)rq   rr   rs   r"   rt   _tied_weights_keysr   r   r|   r  r  r   r   rE   r   r
   
LongTensorr   r   r   r   r   r   r   r  r   r   s   @rn   rU   rU     s    *,GH0 'bii 'ryy   *..2,0:>596:(,-1,0*.!%-.A
<<$&A
 t+A
 llT)	A

 %*LL4$7A
  %||d2A
 !&t 3A
 A
 ||d*A
 llT)A
   4'A
 $;A
 ell*A
 +,A
 
2	2A
  A
L #' 0 0rp   rU   c                   .     e Zd ZdZdef fdZd Z xZS )rW   zmThe layer that transforms the image model's output to part of the text model's input (namely, image features)r&   c                    t         |           t        j                  |j                  j
                  |j                  j                        | _        t        j                  t        j                  |j                  |j                  j                              | _        t        |j                  |j                  j                  |j                  j                  |j                  j                   dd      | _        y )NF)r  rt  ru  )r   r   r   rg   r.   rP   r0   r>   rX   r   rE   r   latent_query_numrY   rS   r  r  x_attnr   s     rn   r   z%Kosmos2ImageToTextProjection.__init__-  s    YYv33??ASASA]A]^
LLV5L5LfN`N`NjNj)kl)((..&&88%*
rp   c                    | j                  |      }| j                  j                  d      j                  |j	                  d      dd      }t        j                  ||gd      }| j                  ||d d d       \  }}||fS )Nr   r5   r    r   )r   ry  r   r  r  )rX   rY   r   rH   r   rE   r   r  )rh   featuresr   rY   key_value_statesr  s         rn   r   z$Kosmos2ImageToTextProjection.forward;  s    

8, ((2215<<]=O=OPQ=RTVXZ[ 99m\%BJ&*kk&"2 " '2 '
#| l**rp   )rq   rr   rs   r   r!   r   r   r   r   s   @rn   rW   rW   *  s    w
} 
+rp   rW   z}
    KOSMOS-2 Model for generating text and image features. The model consists of a vision encoder and a language model.
    c                       e Zd ZU eed<   dZdef fdZdej                  fdZ	d Z
ee	 ddej                  dedz  d	ee   deez  fd
              Zee	 	 	 	 	 	 	 	 	 	 ddej*                  dz  dej*                  dz  dej*                  dz  dej*                  dz  dedz  dej*                  dz  dej*                  dz  dej*                  dz  dedz  ded	ee   deez  fd              Z xZS )Kosmos2Modelr&   r   c                     t         |   |       t        |j                        | _        t        |j                        | _        t        |      | _	        | j                          y r%  )r   r   r  r0   
text_modelr  r.   vision_modelrW   image_to_text_projectionrJ  r   s     rn   r   zKosmos2Model.__init__V  sN     *6+=+=>.v/C/CD(DV(L% 	rp   r   c                 B    | j                   j                  j                  S r%  r  r  r[   r   s    rn   r  z!Kosmos2Model.get_input_embeddings`      $$111rp   c                 :    || j                   j                  _        y r%  r  rh   r  s     rn   set_input_embeddingsz!Kosmos2Model.set_input_embeddingsc      -2*rp   r   Nr  c                 h   d|v r,t        j                  dt               |j                  dd         | j                  d||dd|}| j                  j
                  j                  |d         }t        j                  j                  |d      }| j                  |      \  }}||_        ||_        |S )	Nreturn_attentionsz`return_attentions` is deprecated and will be removed in a future version. Please use `return_dict` and access `projection_attentions` from the returned `ModelOutput` instead.T)r   r   r  r   r5   r   r}   )warningswarnFutureWarningr  r  r  rI  r   r   	normalizer  rM  r   )rh   r   r   r  vision_outputr   r   s          rn   get_image_featureszKosmos2Model.get_image_featuresf  s     &(MM_
 JJ*D1ARARAR B
%%=B
 	B
 ((..==mA>NO}}..|.D.2.K.KL.Y++&2#.C+rp   ra  r  r  r   r   rB  rD   r  c                 0   d}d}|;|t        d       | j                  |f|
dd|}|j                  }|j                  } | j                  d||||||||	dd	|}t        |j                  |j                  |j                  |j                  |||      S )a  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Kosmos2Model

        >>> model = Kosmos2Model.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> text = (
        ...     "<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863>"
        ...     "</object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911>"
        ...     "</object>"
        ... )

        >>> inputs = processor(text=text, images=image, return_tensors="pt", add_eos_token=True)

        >>> last_hidden_state = model(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ... ).last_hidden_state
        >>> list(last_hidden_state.shape)
        [1, 91, 2048]
        ```N<You have to specify either `pixel_values` or `image_embeds`.T)r   r  )	ra  r  r   r  r   rB  rD   r  r  )r   r   r   r   r   r   r   r}   )
r   r  rM  r   r  r   r   r   r   r   )rh   r   ra  r  r  r   r   rB  rD   r  r   r  r   r   image_featuresr  s                   rn   r   zKosmos2Model.forward  s    t # $# !_``4T447O]aekN *77L$2$H$H!!$// 
)%'A+'%
 
 "%77#33!//))%"7 3
 	
rp   r   )
NNNNNNNNNF)rq   rr   rs   r!   rt   r  r   r   r|   r  r  r   r   rE   r   r   r   r   r   r   r  r   r
   r   r   r   r   s   @rn   r  r  M  s    $O} 2bii 23  16'' #'+ +,	
 
8	8  <  -1)-:>.2(,,0-1,0!%).X
llT)X
 <<$&X
 %*LL4$7	X

 t+X
 X
 llT)X
 ||d*X
 llT)X
 $;X
 #'X
 +,X
 
#	#X
  X
rp   r  z
    KOSMOS-2 Model for generating text and bounding boxes given an image. The model consists of a vision encoder and a
    language model.
    c                       e Zd ZU eed<   dZddiZdef fdZdej                  fdZ
d Zdej                  fd	Zd
 Zee	 	 	 	 	 	 	 	 	 	 	 ddej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  dedz  dej"                  dz  dej"                  dz  dej"                  dz  dej&                  dz  dedz  deej"                  z  dee   deez  fd              Z ej6                         	 	 	 	 	 	 ddej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  dej"                  dz  fd       Z xZS )Kosmos2ForConditionalGenerationr&   r   ztext_model.lm_head.weightz$text_model.model.embed_tokens.weightc                     t         |   |       t        |j                        | _        t        |j                        | _        t        |      | _	        | j                          y r%  )r   r   rU   r0   r  r  r.   r  rW   r  rJ  r   s     rn   r   z(Kosmos2ForConditionalGeneration.__init__  sN     01C1CD.v/C/CD(DV(L% 	rp   r   c                 B    | j                   j                  j                  S r%  r  r   s    rn   r  z4Kosmos2ForConditionalGeneration.get_input_embeddings  r  rp   c                 :    || j                   j                  _        y r%  r  r  s     rn   r  z4Kosmos2ForConditionalGeneration.set_input_embeddings  r  rp   c                 6    | j                   j                         S r%  )r  r  r   s    rn   r  z5Kosmos2ForConditionalGeneration.get_output_embeddings  s    4466rp   c                 :    | j                   j                  |       y r%  )r  set_output_embeddings)rh   new_embeddingss     rn   r  z5Kosmos2ForConditionalGeneration.set_output_embeddings  s    --n=rp   Nra  r  r  r   r   rB  rD   r  r  r  r  c                    d}d}|||t        d      | j                  |      }| j                  j                  j                  |d         }t        j
                  j                  |d      }| j                  |      \  }} | j                  d	||||||||	|
|d
|}t        |j                  |j                  |j                  |j                  |j                  |||      S )
a  
        image_embeds_position_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
            Mask to indicate the location in a sequence to insert the image features . Mask values selected in `[0,
            1]`:

            - 1 for places where to put the image features,
            - 0 for places that are not for image features (i.e. for text tokens).
        image_embeds (`torch.FloatTensor` of shape `(batch_size, latent_query_num, hidden_size)`, *optional*):
            Sequence of hidden-states at the output of `Kosmos2ImageToTextProjection`.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`

        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Kosmos2ForConditionalGeneration

        >>> model = Kosmos2ForConditionalGeneration.from_pretrained("microsoft/kosmos-2-patch14-224")
        >>> processor = AutoProcessor.from_pretrained("microsoft/kosmos-2-patch14-224")

        >>> url = "https://huggingface.co/microsoft/kosmos-2-patch14-224/resolve/main/snowman.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> prompt = "<grounding> An image of"

        >>> inputs = processor(text=prompt, images=image, return_tensors="pt")

        >>> generated_ids = model.generate(
        ...     pixel_values=inputs["pixel_values"],
        ...     input_ids=inputs["input_ids"],
        ...     attention_mask=inputs["attention_mask"],
        ...     image_embeds=None,
        ...     image_embeds_position_mask=inputs["image_embeds_position_mask"],
        ...     use_cache=True,
        ...     max_new_tokens=64,
        ... )
        >>> generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
        >>> processed_text = processor.post_process_generation(generated_text, cleanup_and_extract=False)
        >>> processed_text
        '<grounding> An image of<phrase> a snowman</phrase><object><patch_index_0044><patch_index_0863></object> warming himself by<phrase> a fire</phrase><object><patch_index_0005><patch_index_0911></object>.'

        >>> caption, entities = processor.post_process_generation(generated_text)
        >>> caption
        'An image of a snowman warming himself by a fire.'

        >>> entities
        [('a snowman', (12, 21), [(0.390625, 0.046875, 0.984375, 0.828125)]), ('a fire', (41, 47), [(0.171875, 0.015625, 0.484375, 0.890625)])]
        ```Nr  )r   r   r5   r   )
ra  r  r   r  r   rB  rD   r  r  r  )r   r   r   r   r   r   r   r   r}   )r   r  r  rI  r   r   r  r  r  r   r   r   r   r   r   )rh   r   ra  r  r  r   r   rB  rD   r  r  r  r  r   r   
lm_outputss                   rn   r   z'Kosmos2ForConditionalGeneration.forward  s   N # $# !_``"&"3"3) #4 #  ,,22AABUVWBXYL==22<R2HL262O2OP\2]/L/8G 9
)%'A+'%)9
 9

 :$$&66$22!,,%"7 3	
 		
rp   c           	         |j                  dd       }||t        d| d      |||}|n| j                  |      }	| j                  j                  j	                  |	d         }t
        j                  j                  |d      }| j                  |      \  }}
 | j                  j                  d|||||d|}|S )	Ninputsz
`inputs`: zp were passed alongside `pixel_values` which is not allowed.Make sure to either pass `inputs` or pixel_values=...r   r5   r   )ra  r  r   r  rB  r}   )r  r   r  r  rI  r   r   r  r  r  generate)rh   r   r  ra  r  r   rB  r  r  r   r   outputs               rn   r  z(Kosmos2ForConditionalGeneration.generatet  s     Hd+#(:VH %H I  F$6!L"&"3"3L"A,,22AABUVWBXYL==22<R2HL262O2OP\2]/L/))) 
)%'A'
 
 rp   )NNNNNNNNNNr   )NNNNNN)rq   rr   rs   r!   rt   r  r  r   r   r|   r  r  r  r  r   r   rE   r   r
   r  r   r   r   r   r   r   r   r{   r  r   r   s   @rn   r  r    s(    $O57]^	} 	2bii 237ryy 7>  -1)-:>.2(,,0-1,0*.!%-.k
llT)k
 <<$&k
 %*LL4$7	k

 t+k
 k
 llT)k
 ||d*k
 llT)k
   4'k
 $;k
 ell*k
 +,k
 
;	;k
  k
Z U]]_ -1:>)-.2,0-1%llT)% %*LL4$7% <<$&	%
 t+% llT)% ||d*% %rp   r  )r  r  r%   r%  rq  )r1   )Wr   rW  r  collections.abcr   dataclassesr   typingr   rE   r    r   r;   activationsr	   cache_utilsr
   r   r   
generationr   modeling_flash_attention_utilsr   modeling_layersr   modeling_outputsr   r   r   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   utils.output_capturingr   r   configuration_kosmos2r!   r"   r#   
get_loggerrq   r  r%   r   r   r   r   Sizer   r   r   r   r   r|   r:   r[  r  rI   rO   r)   r:  rE  ra   rS   rT   r*   rZ   r  r  rU   rW   r  r  __all__r}   rp   rn   <module>r)     sH      $ !    & ! C C ) B 9  G & j j 7 E X X 
		H	% B%_ B% B%J[u|| [EKK [#* [ jk\ZZ\(-\=B\\\cf\" 
B.H 
B  
B 
 
  
  
F 
%
 %
 %
RPbii Pv %II%<<% 
% <<	%
 LL4'% % %,<)RYY <)@ryy   : B-
299 -
b,
5 ,
`j8ryy j8Zo)")) o)dRYY .V1 Vrz
3 z
z
/ 
>3
- 3
l H3_ HHV +299  +F 
N
) N

N
b q&<o qqh Xrp   