
    i_                     (   d Z ddlmZ ddlmZ ddlZddlmZ ddlmZ ddl	m
Z
 dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZ ddlmZ ddlmZmZmZmZmZmZ ddlm Z  ddl!m"Z" ddl#m$Z$  ejJ                  e&      Z'e ed       G d de                    Z(e ed       G d de                    Z) G d dejT                        Z+dejX                  defd Z- e d!d"d#$      	 	 	 	 d6d%e
d#ejX                  d&ejX                  dz  d'edz  d(ejX                  dz  d)ejX                  dz  d*ej\                  dz  d+e/dz  d,e/dz  de0fd-       Z1e G d. d/e             Z2 ed0       G d1 d2e2             Z3 ed0       G d3 d4e2e             Z4g d5Z5y)7zPyTorch PaliGemmamodel.    )Callable)	dataclassN)nn   )Cache)PreTrainedConfig)GenerationMixin)create_masks_for_generate)FlashAttentionKwargs)BaseModelOutputWithPastBaseModelOutputWithPooling)PreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringcan_return_tupleloggingtorch_compilable_check)deprecate_kwarg   )	AutoModel   )PaliGemmaConfigzN
    Base class for Paligemma outputs, with hidden states and attentions.
    custom_introc                   :    e Zd ZU dZdZej                  dz  ed<   y)PaligemmaModelOutputWithPasta  
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
    Nimage_hidden_states)__name__
__module____qualname____doc__r   torchFloatTensor__annotations__     /var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/paligemma/modeling_paligemma.pyr   r   .   s     59**T18r(   r   zU
    Base class for PaliGemma causal language model (or autoregressive) outputs.
    c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	e
dz  ed<   dZeej                     dz  ed<   dZeej                     dz  ed<   dZej                  dz  ed<   y)	PaliGemmaCausalLMOutputWithPasta8  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Language modeling loss (for next-token prediction).
    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).

        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
        `past_key_values` input) to speed up sequential decoding.
    image_hidden_states (`torch.FloatTensor`, *optional*):
        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
    Nlosslogitspast_key_valueshidden_states
attentionsr   )r    r!   r"   r#   r,   r$   r%   r&   r-   r.   r   r/   tupler0   r   r'   r(   r)   r+   r+   >   s     &*D%

d
")'+FE$+$(OUT\(59M5**+d2926Je''(4/648**T18r(   r+   c                   *     e Zd Zdef fdZd Z xZS )PaliGemmaMultiModalProjectorconfigc                     t         |           t        j                  |j                  j
                  |j                  j                  d      | _        y )NTbias)super__init__r   Linearvision_confighidden_sizeprojection_dimlinearselfr4   	__class__s     r)   r9   z%PaliGemmaMultiModalProjector.__init__]   s;    ii 4 4 @ @&BVBVBeBelpqr(   c                 (    | j                  |      }|S N)r>   )r@   image_featuresr/   s      r)   forwardz$PaliGemmaMultiModalProjector.forwarda   s    N3r(   )r    r!   r"   r   r9   rE   __classcell__rA   s   @r)   r3   r3   \   s    r rr(   r3   	group_idsreturnc           
      P     dt         dt         dt         dt         dt        f
 fd}|S )au  
    This function adds the correct offsets to the `q_idx` and `kv_idx` as the torch API can only accept lengths,
    not start and end indices.
    Args:
        group_ids (`torch.Tensor`):
            A tensor of shape `(bs, len)` assigning each token to a vision group. Tokens with the same group
            come from the same input image. Text is denoted by `-1`.
    	batch_idxhead_idxq_idxkv_idxrI   c                    	j                   d   }|j                  |dz
        }|j                  |dz
        }	| |f   }	| |f   }t        j                  ||k  |d      }t        j                  ||k  |d      }||k(  |dk\  z  S )Nr   )maxr   )shapeclampr$   where)
rK   rL   rM   rN   
seq_lengthq_idx_clampedkv_idx_clampedq_groupkv_grouprH   s
            r)   
inner_maskz0token_type_ids_mask_function.<locals>.inner_maskq   s    __R(
 
Q7*q.9 I}45Y67++ej0'2>;;v
2HbA8#155r(   )intbool)rH   rZ   s   ` r)   token_type_ids_mask_functionr]   g   s3    6c 6S 6 6c 6d 6 r(   input_embeds5.6.0inputs_embedsversionnew_namer4   attention_maskr.   position_idstoken_type_idspixel_valuesis_trainingis_first_iterationc	                    |r|t        d      | j                         ||||d}
|r|n|du xs |j                   xs |du}|s|	j                  dd      s<|d|z
  }n4t        j                  d       t        j                  |      dddddf   }||r|dk(  j                  |j                        }t        j                  j                  |d	d
      ddddf   }|| z  }t        j                  |j                         d      dz
  }t        j                  ||t        j                   |d            }t#        |      |
d<   t%        di |
S )a"  
    Overwrites the base `create_masks_for_generate` with `token_type_ids` masking to create the causal mask mapping
    for all kinds of forward passes. Paligemma uses a bidirectional mask on the prompt tokens.

    Uses `pixel_values` as an optional input to disambiguate edge cases.
    Nz;`token_type_ids` is required as a model input when training)r4   r`   rd   r.   re   	use_cacheTr   zIt is a prefill stage but The `token_type_ids` is not provided. We recommend passing `token_type_ids` to the model to prevent bad attention masking.r   )r   r   )valuerP   )dimor_mask_functionr'   )
ValueErrorget_text_configis_initializedgetloggerwarning_oncer$   	ones_liketodevicer   
functionalpadcumsumr[   rT   	full_liker]   r
   )r4   r`   rd   r.   re   rf   rg   rh   ri   kwargsmask_kwargsis_imageis_previous_imagenew_image_startrH   s                  r)   create_causal_mask_mappingr      su   & ~-VWW ((*&(*$K  	%g_-K-K)Kg|cgOg  K!>% /NZ
 #__];Aq!GDN
 !&8 #a'++M,@,@AMM--ha-HCRCP"&7%77LL!4!4!6A>B	KK)U__^UW5XY	*Fy*Q&'$3{33r(   c                   B    e Zd ZU eed<   dZdZdZdgZdZ	dZ
dZdZdZdZy)	PaliGemmaPreTrainedModelr4   model)imagetextTr3   r.   FN)r    r!   r"   r   r&   base_model_prefixinput_modalitiessupports_gradient_checkpointing_no_split_modules_skip_keys_device_placement_can_compile_fullgraph_supports_flash_attn_supports_sdpa_supports_flex_attn_supports_attention_backendr'   r(   r)   r   r      sF    (&*#78"3"N"&r(   r   z|
    The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
    c                       e Zd ZdZdef fdZd Zd Ze e	d      de
j                  d	ee   d
eez  fd              Zde
j"                  de
j                  de
j                  fdZee		 	 	 	 	 	 	 	 	 dde
j"                  dz  de
j                  dz  de
j&                  dz  de
j"                  dz  dedz  de
j"                  dz  de
j                  dz  de
j"                  dz  dedz  d	ee   d
eez  fd              Z xZS )PaliGemmaModelFr4   c                    t         |   |       t        j                  |j                        | _        t        |      | _        |j                  j                  | _	        t        j                  |j                        }|| _
        | j                  j                         j                  xs | j                  | _        | j                          y )N)r4   )r8   r9   r   from_configr;   vision_towerr3   multi_modal_projectortext_config
vocab_sizelanguage_modelr4   rp   dtypetext_config_dtype	post_init)r@   r4   r   rA   s      r)   r9   zPaliGemmaModel.__init__   s     %119M9MN%A&%I" ,,77"..f6H6HI,!%!<!<!>!D!D!R

r(   c                 6    | j                   j                         S rC   )r   get_input_embeddingsr@   s    r)   r   z#PaliGemmaModel.get_input_embeddings   s    ""7799r(   c                 :    | j                   j                  |       y rC   )r   set_input_embeddingsr@   rl   s     r)   r   z#PaliGemmaModel.set_input_embeddings   s    007r(   zWObtains image last hidden states from the vision tower and apply multimodal projection.r   rg   r|   rI   c                 t     | j                   |fi |}|j                  }| j                  |      }||_        |S rC   )r   last_hidden_stater   pooler_output)r@   rg   r|   image_outputsselected_image_featurerD   s         r)   get_image_featuresz!PaliGemmaModel.get_image_features   sF     *)),A&A!.!@!@334JK&4#r(   	input_idsr`   rD   c                 N   |m| | j                         t        j                  | j                  j                  t        j
                  |j                              k(  }|j                  d      }n|| j                  j                  k(  }|j                         }|j                  d   |j                  d   z  }|j                  d      j                  |      j                  |j                        }t        ||   j                         |j                         k(  d| d|        |S )z
        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
        equal to the length of multimodal features. If the lengths are different, an error is raised.
        )r   rw   rP   r   r   z6Image features and image tokens do not match, tokens: z, features: )r   r$   tensorr4   image_token_idlongrw   allsumrR   	unsqueeze	expand_asrv   r   numel)r@   r   r`   rD   special_image_maskn_image_tokensn_image_featuress          r)   get_placeholder_maskz#PaliGemmaModel.get_placeholder_mask  s    !.2M$2K2K2MT[[77uzzR_RfRfg3 " "4!7!7!;!*dkk.H.H!H+//1)//2^5I5I!5LL/99"=GGVYYZgZnZno,-3359M9M9OOD^DTT`aq`rs	
 "!r(   Nrd   re   r.   rf   labelsrk   c
           
         |du |duz  rt        d      |R| j                  j                  | j                  k\  r/|| j                  j                  k(  }|j	                         }d||<   n|}| | j                         |      }|Y||j                         nd}t        j                  |j                  d   |j                        |z   }|j                  d      dz   }|g| j                  |      j                  }|j                  |j                  |j                        }| j!                  |||      }|j#                  ||      }t%        |x}t&              s't)        | j                  ||||||| j*                        } | j,                  d
|||||	d|
}t/        |j0                  |j2                  |j4                  |j6                  |	      S d	      S )  
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.

        Example:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration

        >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224")
        >>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224")

        >>> prompt = "Where is the cat standing?"
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")

        >>> # Generate
        >>> generate_ids = model.generate(**inputs,)
        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
        "Where is the cat standing?\nsnow"
        ```Nz:You must specify exactly one of input_ids or inputs_embedsr   r   )rw   )r`   rD   )rh   )rd   re   r.   r`   rk   )r   r.   r/   r0   r   r'   )ro   r4   r   r   cloner   get_seq_lengthr$   arangerR   rw   r   r   r   rv   r   r   masked_scatter
isinstancedictr   trainingr   r   r   r.   r/   r0   )r@   r   rg   rd   re   r.   rf   r`   r   rk   r|   r   llm_input_idspast_seen_tokensrD   causal_mask_mappingoutputss                    r)   rE   zPaliGemmaModel.forward  s   Z -t";<YZZ  T[[%?%?4??%R!*dkk.H.H!H%OO-M01M,-%M 7D557FMCRC^==?de <<(;(;A(>}G[G[\_ooL'11!4q8L #!44\BPPN+..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M ?-F"< MM	# &$%% 
.%+'
 
 ,%77#33!//))2>2J
 	

 QU
 	
r(   )	NNNNNNNNN)r    r!   r"   accepts_loss_kwargsr   r9   r   r   r   r   r$   r%   r   r   r1   r   r   
LongTensorr   Tensorr   r\   r   r   rE   rF   rG   s   @r)   r   r      s     
 
:8 n!--9?@R9S	+	+ "))":?:K:K"]b]n]n"0  .215.204(,2626*.!%c
##d*c
 ''$.c
 t+	c

 &&-c
 c
 ((4/c
 ((4/c
   4'c
 $;c
 -.c
 
-	-c
  c
r(   r   c                       e Zd ZddiZdef fdZd Zd Zede	j                  dee   fd	       Zee	 	 	 	 	 	 	 	 	 	 dde	j                  d
z  de	j                  d
z  de	j                   d
z  de	j                  d
z  ded
z  de	j                  d
z  de	j                  d
z  de	j                  d
z  ded
z  dee	j                   z  dee   deez  fd              Z	 	 	 	 	 	 	 	 	 	 d fd	Ze eddd      	 	 ddede	j                   de	j                   d
z  ded
z  de	j                   d
z  de	j                   d
z  ded
z  defd              Z xZS )!PaliGemmaForConditionalGenerationzlm_head.weightz(model.language_model.embed_tokens.weightr4   c                     t         |   |       t        |      | _        t	        j
                  |j                  j                  |j                  j                  d      | _	        | j                          y )NFr6   )r8   r9   r   r   r   r:   r   r<   r   lm_headr   r?   s     r)   r9   z*PaliGemmaForConditionalGeneration.__init__  sS     #F+
yy!3!3!?!?ASASA^A^ejkr(   c                 6    | j                   j                         S rC   )r   r   r   s    r)   r   z6PaliGemmaForConditionalGeneration.get_input_embeddings  s    zz..00r(   c                 :    | j                   j                  |       y rC   )r   r   r   s     r)   r   z6PaliGemmaForConditionalGeneration.set_input_embeddings  s    

''.r(   rg   r|   c                 <     | j                   j                  |fi |S rC   )r   r   )r@   rg   r|   s      r)   r   z4PaliGemmaForConditionalGeneration.get_image_features  s    ,tzz,,\DVDDr(   Nr   rd   re   r.   rf   r`   r   rk   logits_to_keeprI   c                     | j                   d||||||||	|d	|}|d   }t        |
t              rt        |
 d      n|
}| j	                  |dd|ddf         }d}|4 | j
                  d||| j                  j                  j                  d|}t        |||j                  |j                  |j                  |j                        S )r   )	r   rg   rf   rd   re   r.   r`   rk   r   r   N)r-   r   r   )r,   r-   r.   r/   r0   r   r'   )r   r   r[   slicer   loss_functionr4   r   r   r+   r.   r/   r0   r   )r@   r   rg   rd   re   r.   rf   r`   r   rk   r   r|   r   r/   slice_indicesr-   r,   s                    r)   rE   z)PaliGemmaForConditionalGeneration.forward  s    Z $** 
%))%+'
 
  
8B>SV8W~ot4]kmA}a,?@A%4%% f9P9P9[9[_eD /#33!//)) ' ; ;
 	
r(   c                     t        |   |f||||||	||d|}|j                  d      |d   dz   |d<   |s|s||d<   |S )N)r.   r`   rd   re   rk   r   rf   ri   re   r   rg   )r8   prepare_inputs_for_generationrr   )r@   r   r.   r`   re   rg   rd   rf   rk   r   r   ri   r|   model_inputsrA   s                 r)   r   z?PaliGemmaForConditionalGeneration.prepare_inputs_for_generation  s|      w<
+')%))1
 
 N+7+7+G!+KL( Y+7L(r(   r^   r_   ra   ri   c           
          t        | |||||fd|i|j                         D 	ci c]  \  }}	|dk7  s||	 c}	}S c c}	}w )Nri   rg   )r   items)
r4   r`   rd   r.   re   rf   ri   r|   kvs
             r)   r
   z;PaliGemmaForConditionalGeneration.create_masks_for_generate  s]     *	
  2	
 !'F1!~2Eq!tF	
 		
 Gs   <<)
NNNNNNNNNr   )
NNNNNNTNNF)NF)r    r!   r"   _tied_weights_keysr   r9   r   r   r   r$   r%   r   r   r   r   r   r   r   r\   r[   r1   r+   rE   r   staticmethodr   r   r   r
   rF   rG   s   @r)   r   r     sI    +,VW 1/ Eu/@/@ EFSeLf E E  .215.204(,2626*.!%-.J
##d*J
 ''$.J
 t+	J

 &&-J
 J
 ((4/J
 ((4/J
   4'J
 $;J
 ell*J
 +,J
 
0	0J
  J
^  )V ^WO /3*/
 
||
 t+
 	

 llT)
 t+
 !4K
 

 P 
r(   r   )r   r   r   )NNFN)6r#   collections.abcr   dataclassesr   r$   r   cache_utilsr   configuration_utilsr   
generationr	   masking_utilsr
   modeling_flash_attention_utilsr   modeling_outputsr   r   modeling_utilsr   processing_utilsr   utilsr   r   r   r   r   r   utils.deprecationr   autor   configuration_paligemmar   
get_loggerr    rs   r   r+   Moduler3   r   r]   r%   r\   r   r   r   r   r   __all__r'   r(   r)   <module>r      s!    $ !     3 ) 6 B S - &  1  4 
		H	% 
9#: 9 9 
9k 9 90299 ELL X 6 ?K +/-1$&*C4C4<<C4 LL4'C4 T\	C4
 ,,%C4 LL4'C4 ##d*C4 C4 tC4 
C4 LC4L ' ' ' 
c
- c

c
L 
b
(@/ b

b
J ^r(   