
    ih.                     X   d dl Z d dl mZ d dlmZmZmZmZmZ ddlm	Z	 ddl
mZ ddlmZmZ ddlmZ dd	lmZmZmZ dd
lmZ ddlmZ  ej2                  e      Z G d de      Z G d de      Z G d dej<                        Z G d de      Z  G d de      Z! G d de      Z"g dZ#y)    N)nn)LlavaCausalLMOutputWithPastLlavaForConditionalGeneration
LlavaModelLlavaModelOutputWithPastLlavaPreTrainedModel   )ACT2FN)Cache)BaseModelOutputWithPastBaseModelOutputWithPooling)Unpack)TransformersKwargsauto_docstringlogging)can_return_tuple   )VipLlavaConfigc                       e Zd Zy)VipLlavaModelOutputWithPastN__name__
__module____qualname__     ~/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/vipllava/modular_vipllava.pyr   r   &       r   r   c                       e Zd Zy)VipLlavaCausalLMOutputWithPastNr   r   r   r   r    r    *   r   r   r    c                   *     e Zd Zdef fdZd Z xZS )VipLlavaMultiModalProjectorconfigc                 H   t         |           t        |j                  t              rdnt        |j                        }t        j                  ||j                  j                  z  |j                        | _        t        j                  ||j                  j                  z  |j                  j                  d      | _        t        |j                      | _        t        j                  |j                  j                  |j                  j                  d      | _        y )Nr   )epsT)bias)super__init__
isinstancevision_feature_layersintlenr   	LayerNormvision_confighidden_sizeprojector_layernorm_epsprojector_layernormLineartext_configlinear_1r
   projector_hidden_actactlinear_2)selfr#   num_feature_layers	__class__s      r   r(   z$VipLlavaMultiModalProjector.__init__/   s    ",V-I-I3"OQUXY_YuYuUv#%<<!5!5!A!AAvGeGe$
  		!5!5!A!AA**

 &556		&"4"4"@"@&BTBTB`B`gklr   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }|S N)r1   r4   r6   r7   )r8   hidden_statess     r   forwardz#VipLlavaMultiModalProjector.forward>   sB    00?m4/m4r   )r   r   r   r   r(   r>   __classcell__)r:   s   @r   r"   r"   .   s    m~ mr   r"   c                       e Zd Zy)VipLlavaPreTrainedModelNr   r   r   r   rA   rA   F   r   r   rA   c                   ~   e Zd Ze ed      	 ddej                  deee   z  dz  de	e
   deez  fd              Zee	 	 	 	 	 	 	 	 dd	ej                  dz  dej                  dz  d
ej                  dz  dej                  dz  dedz  dej                  dz  deee   z  dz  dedz  de	e
   deez  fd              Zy)VipLlavaModelzWObtains image last hidden states from the vision tower and apply multimodal projection.)custom_introNpixel_valuesr*   kwargsreturnc                 n   ||n| j                   j                  }d|d<    | j                  |fi |}t        |t              r|j
                  |   ddddf   }n<|D cg c]  }|j
                  |   ddddf    }}t        j                  |d      }| j                  |      }||_	        |S c c}w )\  
        pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
            The tensors corresponding to the input images.
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        NToutput_hidden_statesr   )dim)
r#   r*   vision_towerr)   r+   r=   torchcatmulti_modal_projectorpooler_output)r8   rE   r*   rF   image_outputsimage_featuresindexs          r   get_image_featuresz VipLlavaModel.get_image_featuresK   s    $ &;%F!DKKLmLm 	 *.%&)))

 +S1*889NOPQSTSUPUVN VkkEm99%@ABGkNk"YY~2>N33NC&4# ls    B2	input_idsattention_maskposition_idspast_key_valuesinputs_embeds	use_cache	lm_kwargsc	           	         ||n| j                   j                  }|du |duz  rt        d      | | j                         |      }|i| j	                  ||      j
                  }
|
j                  |j                  |j                        }
| j                  |||
      }|j                  ||
      } | j                  d|||||d|	}t        |j                  |j                  |j                  |j                   |
nd      }|S )z
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        Nz:You must specify exactly one of input_ids or inputs_embedsrE   r*   )rZ   rS   )rW   rX   rY   rZ   r[   )last_hidden_staterY   r=   
attentionsimage_hidden_statesr   )r#   r*   
ValueErrorget_input_embeddingsrU   rQ   todevicedtypeget_placeholder_maskmasked_scatterlanguage_modelr   r_   rY   r=   r`   )r8   rV   rE   rW   rX   rY   rZ   r*   r[   r\   rS   special_image_maskoutputsoutputs                 r   r>   zVipLlavaModel.forwards   sF   ( &;%F!DKKLmLm 	 -t";<YZZ 7D557	BM#!44)AV 5 m  ,..}/C/C]EXEXYN!%!:!:~ "; " *889K^\M+>4+>+> ,
)%+',
 ,
 -%77#33!//))2>2JPT
 r   r<   )NNNNNNNN)r   r   r   r   r   rN   FloatTensorr+   listr   r   tupler   rU   
LongTensorTensorr   boolr   r>   r   r   r   rC   rC   J   s]   n 9="''"  #T#Y5" +,	"
 
+	+" "H  .215.204(,268<!%5##d*5 ''$.5 t+	5
 &&-5 5 ((4/5  #T#Y55 $;5 ./5 
,	,5  5r   rC   c                      e Zd Ze	 ddej
                  deee   z  dz  dee	   de
ez  fd       Zee	 	 	 	 	 	 	 	 	 	 ddej                  dz  dej
                  dz  dej                  dz  d	ej                  dz  d
edz  dej
                  dz  deee   z  dz  dej                  dz  dedz  deej                  z  dee	   de
ez  fd              Zy) VipLlavaForConditionalGenerationNrE   r*   rF   rG   c                 @     | j                   j                  d||d|S )rI   r^   r   )modelrU   )r8   rE   r*   rF   s       r   rU   z3VipLlavaForConditionalGeneration.get_image_features   s0     -tzz,, 
%=R
V\
 	
r   rV   rW   rX   rY   rZ   labelsr[   logits_to_keepr\   c                    ||n| j                   j                  } | j                  d|||||||	|d|}|j                  }t	        |
t
              rt        |
 d      n|
}| j                  |dd|ddf         }d}|2| j                  ||| j                   j                  j                        }t        |||j                  |j                  |j                  |j                        S )a  
        vision_feature_layers (`Union[int, list[int]]`, *optional*):
            The vision feature layer, or the list of indexes of the layers to select
            the vision feature.
        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.

        Example:

        ```python
        >>> import torch
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, VipLlavaForConditionalGeneration

        >>> model = VipLlavaForConditionalGeneration.from_pretrained("llava-hf/vip-llava-7b-hf", device_map="auto", dtype=torch.float16)
        >>> processor = AutoProcessor.from_pretrained("llava-hf/vip-llava-7b-hf")

        >>> prompt = "A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.###Human: <image>\n{}###Assistant:"
        >>> question = "Can you please describe this image?"
        >>> prompt = prompt.format(question)
        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/diffusers/compel-neg.png"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(text=text, images=image, return_tensors="pt").to(0, torch.float16)

        >>> # Generate
        >>> generate_ids = model.generate(**inputs, max_new_tokens=20)
        >>> processor.decode(generate_ids[0][len(inputs["input_ids"][0]):], skip_special_tokens=True)
        The image features a brown and white cat sitting on a green surface, with a red ball in its
        ```N)rV   rE   rW   rX   rY   rZ   r[   r*   )logitsrw   
vocab_size)lossrz   rY   r=   r`   ra   r   )r#   r*   rv   r_   r)   r+   slicelm_headloss_functionr3   r{   r    rY   r=   r`   ra   )r8   rV   rE   rW   rX   rY   rZ   r*   rw   r[   rx   r\   rk   r=   slice_indicesrz   r|   s                    r   r>   z(VipLlavaForConditionalGeneration.forward   s   j &;%F!DKKLmLm 	 0:tzz 
0
%)%+'"7
0
 
0
  118B>SV8W~ot4]kmA}a,?@A%%VFt{{OfOfOqOq%rD-#33!//)) ' ; ;
 	
r   r<   )
NNNNNNNNNr   )r   r   r   r   rN   rm   r+   rn   r   r   ro   r   rU   r   rp   rq   r   rr   r    r>   r   r   r   rt   rt      s    9=
''
  #T#Y5
 +,	

 
+	+
 
"  .215.204(,268<*.!%-.R
##d*R
 ''$.R
 t+	R

 &&-R
 R
 ((4/R
  #T#Y5R
   4'R
 $;R
 ell*R
 ./R
 
/	/R
  R
r   rt   )rC   rt   rA   )$rN   r   (transformers.models.llava.modeling_llavar   r   r   r   r   activationsr
   cache_utilsr   modeling_outputsr   r   processing_utilsr   utilsr   r   r   utils.genericr   configuration_vipllavar   
get_loggerr   loggerr   r    Moduler"   rA   rC   rt   __all__r   r   r   <module>r      s       "   S & @ @ - 2 
		H	%	": 		%@ 	")) 0	2 	`J `Fg
'D g
T [r   