
    i                        d Z ddlZddlZddlmZ ddlmZ ddlmZ ddl	Z	ddl	m
Z
 ddlmZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*  e jV                  e,      Z-de	j\                  de	j\                  fdZ/de	j\                  de	j\                  fdZ0ee G d de                    Z1ee G d de                    Z2ee G d de                    Z3 G d d e
jh                        Z5 G d! d"e
jh                        Z6	 dHd#e
jh                  d$e	j\                  d%e	j\                  d&e	j\                  d'e	j\                  dz  d(e7d)e7fd*Z8 G d+ d,e
jh                        Z9 G d- d.e
jh                        Z: G d/ d0e      Z; G d1 d2e
jh                        Z<e G d3 d4e             Z= G d5 d6e
jh                        Z> G d7 d8e=      Z? G d9 d:e=      Z@ G d; d<e=      ZA G d= d>e
jh                        ZB G d? d@e=      ZCe G dA dBe=             ZD edCD       G dE dFe=             ZEg dGZFy)IzPyTorch CLIPSeg model.    N)Callable)	dataclass)Any)nn   )initialization)ACT2FN)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringlogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )CLIPSegConfigCLIPSegTextConfigCLIPSegVisionConfiglogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)r   
functionalcross_entropytorcharangelenr!   r   s    }/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/clipseg/modeling_clipseg.pycontrastive_lossr)   +   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r)   t)r+   caption_loss
image_losss      r(   clipseg_lossr0   0   s,    #J/L!*,,.1J:%,,r*   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)CLIPSegOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegTextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of [`CLIPSegVisionModel`].
    text_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegTextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr   c                 B    t        d | j                         D              S )Nc              3   `   K   | ]&  }t        |t              r|j                         n| ( y wN
isinstancer   to_tuple.0vs     r(   	<genexpr>z)CLIPSegOutput.to_tuple.<locals>.<genexpr>V   $     ^1Z;%?QZZ\QF^   ,.tuplevaluesselfs    r(   r?   zCLIPSegOutput.to_tupleU       ^PTP[P[P]^^^r*   )__name__
__module____qualname____doc__r3   r$   FloatTensor__annotations__r4   r5   r6   r7   r8   r   r9   rG   r   r?    r*   r(   r2   r2   6   s    & &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:_%* _r*   r2   c                       e Zd ZU dZdZej                  dz  ed<   dZe	ej                     dz  ed<   dZ
e	ej                     dz  ed<   y)CLIPSegDecoderOutputz|
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    Nr   hidden_states
attentions)rL   rM   rN   rO   r   r$   rP   rQ   rU   rG   rV   rR   r*   r(   rT   rT   Y   sR    
 (,FE$+59M5**+d2926Je''(4/6r*   rT   c                       e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZeed<   dZeed<   d	ee   fd
Zy)CLIPSegImageSegmentationOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
        Binary cross entropy loss for segmentation.
    logits (`torch.FloatTensor` of shape `(batch_size, height, width)`):
        Classification scores for each pixel.
    conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, projection_dim)`):
        Conditional embeddings used for segmentation.
    pooled_output (`torch.FloatTensor` of shape `(batch_size, embed_dim)`):
        Pooled output of the [`CLIPSegVisionModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`CLIPSegVisionModel`].
    decoder_output (`CLIPSegDecoderOutput`):
        The output of the [`CLIPSegDecoder`].
    Nr3   r   conditional_embeddingspooled_outputr9   decoder_outputr   c                 B    t        d | j                         D              S )Nc              3   `   K   | ]&  }t        |t              r|j                         n| ( y wr<   r=   r@   s     r(   rC   z:CLIPSegImageSegmentationOutput.to_tuple.<locals>.<genexpr>   rD   rE   rF   rI   s    r(   r?   z'CLIPSegImageSegmentationOutput.to_tuple   rK   r*   )rL   rM   rN   rO   r3   r$   rP   rQ   r   rY   rZ   r9   r   r[   rT   rG   r   r?   rR   r*   r(   rX   rX   f   s     &*D%

d
")'+FE$+7;E--4;.2M5$$t+26:3:+/N(/_%* _r*   rX   c                        e Zd Zdef fdZdej                  dededej                  fdZd
dej                  dej                  fd	Z
 xZS )CLIPSegVisionEmbeddingsconfigc                    t         |           || _        |j                  | _        |j
                  | _        |j                  | _        t        j                  t        j                  | j                              | _        t        j                  |j                  | j                  | j                  | j                  d      | _        | j
                  | j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebias   r   position_idsr   
persistent)super__init__r`   hidden_size	embed_dim
image_size
patch_sizer   	Parameterr$   randnclass_embeddingConv2dnum_channelspatch_embeddingnum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr%   expandrJ   r`   	__class__s     r(   rn   z CLIPSegVisionEmbeddings.__init__   s	   ++ ++ ++!||EKK,GH!yy++?? 
 !OOt>1D!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr*   
embeddingsheightwidthr   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nrj   g      ?r   rg   bicubicF)sizemodealign_cornersdim)shaper|   weight	unsqueezer$   jit
is_tracingrh   rr   r   reshapepermuter   r"   interpolateviewcat)rJ   r   r   r   ry   r|   rz   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r(   interpolate_pos_encodingz0CLIPSegVisionEmbeddings.interpolate_pos_encoding   sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr*   pixel_valuesc                     |j                   \  }}}}|sJ|| j                  k7  s|| j                  k7  r,t        d| d| d| j                   d| j                   d	      | j                  |      }|j	                  d      j                  dd      }| j                  j                  |dd      }t        j                  ||gd      }	|r|	| j                  |	||      z   }	|	S |	| j                  | j                        z   }	|	S )	NzInput image size (*z) doesn't match model ().rg   r   rj   r   )r   rq   
ValueErrorrx   flatten	transposeru   r~   r$   r   r   r|   rh   )
rJ   r   r   
batch_size_r   r   patch_embedsclass_embedsr   s
             r(   forwardzCLIPSegVisionEmbeddings.forward   s   '3'9'9$
Avu'Vt-F%SWSbSbJb$VHAeW4KDOOK\\]^b^m^m]nnpq  ++L9#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr*   T)rL   rM   rN   r   rn   r$   Tensorintr   rP   r   __classcell__r   s   @r(   r_   r_      se    q2 q,'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 Y^YeYe r*   r_   c            	            e Zd Zdef fdZ	 	 	 d	dej                  dz  dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )
CLIPSegTextEmbeddingsr`   c                 N   t         |           |j                  }t        j                  |j
                  |      | _        t        j                  |j                  |      | _        | j                  dt        j                  |j                        j                  d      d       y )Nrh   ri   Frk   )rm   rn   ro   r   r{   
vocab_sizetoken_embeddingmax_position_embeddingsr|   r}   r$   r%   r~   rJ   r`   rp   r   s      r(   rn   zCLIPSegTextEmbeddings.__init__   s    &&	!||F,=,=yI"$,,v/M/My"Y 	ELL)G)GHOOPWXej 	 	
r*   N	input_idsrh   inputs_embedsr   c                 8   ||j                   d   n|j                   d   }| j                  j                  j                   d   }||kD  rt        d| d|       || j                  d d d |f   }|| j                  |      }| j                  |      }||z   }|S )Nrj   r   zRSequence length must be less than max_position_embeddings (got `sequence length`: z and max_position_embeddings: )r   r|   r   r   rh   r   )rJ   r   rh   r   
seq_lengthmax_position_embeddingposition_embeddingsr   s           r(   r   zCLIPSegTextEmbeddings.forward   s     -6,AY__R(}GZGZ[]G^
!%!8!8!?!?!E!Ea!H..d,<=S<TV 
 ,,Q^<L  00;M"55lC"%88
r*   NNN)rL   rM   rN   r   rn   r$   
LongTensorrP   r   r   r   r   s   @r(   r   r      sk    

0 

 .20426	##d* &&- ((4/	
 
r*   r   modulequerykeyvalueattention_maskscalingdropoutc                    t        j                  ||j                  dd            |z  }|||z   }t        j                  j                  |dt         j                        j                  |j                        }t        j                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nrj   r   )r   dtype)ptrainingr   rg   )r$   matmulr   r   r"   softmaxfloat32tor   r   r   
contiguous)
r   r   r   r   r   r   r   kwargsattn_weightsattn_outputs
             r(   eager_attention_forwardr      s     <<s}}R'<=GL!#n4==((2U]](SVVW\WbWbcL==((6??([L,,|U3K''1-88:K$$r*   c                        e Zd ZdZdeez  f fdZ	 d
dej                  dej                  dz  de	e
   deej                  ej                  dz  f   fd	Z xZS )CLIPSegAttentionz=Multi-headed attention from 'Attention Is All You Need' paperr`   c                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: r         F)rm   rn   r`   ro   rp   num_attention_heads	num_headshead_dimr   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   s     r(   rn   zCLIPSegAttention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar*   NrU   r   r   r   c                    |j                   dd }g |d| j                  }| j                  |      }| j                  |      }| j	                  |      }|j                  |      j                  dd      }|j                  |      j                  dd      }|j                  |      j                  dd      }t        j                  | j                  j                  t              }	 |	| ||||f| j                  | j                  sdn| j                  d|\  }
} |
j                  g |d j!                         }
| j#                  |
      }
|
|fS )z#Input shape: Batch x Time x ChannelNrj   r   rg           )r   r   )r   r   r   r   r   r   r   r   get_interfacer`   _attn_implementationr   r   r   r   r   r   r   )rJ   rU   r   r   input_shapehidden_shapequerieskeysrH   attention_interfacer   r   s               r(   r   zCLIPSegAttention.forward.  sO    $))#2.88b8$--8++m,{{=)]+,,|,66q!<yy&00A6\*44Q:(?(M(MKK,,.E)
 %8	%
 JJ#}}C$,,	%
 	%
!\ *k));;;;FFHmmK0L((r*   r<   )rL   rM   rN   rO   r   r   rn   r$   r   r   r   rG   r   r   r   s   @r(   r   r     su    GB25FF B. /3$)||$) t+$) +,	$)
 
u||U\\D00	1$)r*   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )
CLIPSegMLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y r<   )rm   rn   r`   r	   
hidden_actactivation_fnr   r   ro   intermediate_sizefc1fc2r   s     r(   rn   zCLIPSegMLP.__init__W  sd    #F$5$5699V//1I1IJ99V55v7I7IJr*   rU   r   c                 l    | j                  |      }| j                  |      }| j                  |      }|S r<   )r   r   r   )rJ   rU   s     r(   r   zCLIPSegMLP.forward^  s4    /**=9/r*   )rL   rM   rN   rn   r$   r   r   r   r   s   @r(   r   r   V  s$    KU\\ ell r*   r   c                        e Zd Zdef fdZdej                  dej                  dee   de	ej                  ej                  dz  f   fdZ xZS )	CLIPSegEncoderLayerr`   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y N)epsrm   rn   ro   rp   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r   mlplayer_norm2r   s     r(   rn   zCLIPSegEncoderLayer.__init__g  m    ++)&1<<F<Q<QRf%<<F<Q<QRr*   rU   r   r   r   Nc                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )NrU   r   rR   )r   r   r   r   rJ   rU   r   r   residualr   s         r(   r   zCLIPSegEncoderLayer.forwardo  s     !((7)4>> 
')
 
q
 !=0 ((7/ =0r*   )rL   rM   rN   r   rn   r$   r   r   r   rG   rP   r   r   r   s   @r(   r   r   f  sc    S} S||  +,	
 
u  %,,"55	6r*   r   c                   ~     e Zd ZdZdef fdZdej                  dej                  deej                     fdZ
 xZS )CLIPSegDecoderLayerz
    CLIPSeg decoder layer, which is identical to `CLIPSegEncoderLayer`, except that normalization is applied after
    self-attention/MLP, rather than before.
    r`   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y r   r   r   s     r(   rn   zCLIPSegDecoderLayer.__init__  r   r*   rU   r   r   c                     |} | j                   d||d|\  }}||z   }| j                  |      }|}| j                  |      }||z   }| j                  |      }|S )aI  
        Args:
            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
            attention_mask (`torch.FloatTensor`): attention mask of size
                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
                `(config.encoder_attention_heads,)`.
            output_attentions (`bool`, *optional*):
                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
                returned tensors for more detail.
        r   rR   )r   r   r   r   r   s         r(   r   zCLIPSegDecoderLayer.forward  s      !)4>> 
')
 
q !=0((7 / =0((7r*   )rL   rM   rN   rO   r   rn   r$   r   rG   rP   r   r   r   s   @r(   r  r    sJ    S} S ||   
 
u  	! r*   r  c                   `    e Zd ZU eed<   dZdZdZee	ge
dZ ej                         d        Zy)CLIPSegPreTrainedModelr`   clip)imagetextT)rU   rV   c                 
   | j                   j                  }t        |t              rt	        j
                  |j                  j                  d|dz         t	        j
                  |j                  j                  d|dz         t	        j                  |j                  t        j                  |j                  j                  d         j                  d             nt        |t              r| j                   j                  }t	        j
                  |j                   d|j"                  dz  |z         t	        j
                  |j$                  j                  |j                   j&                  |z         t	        j
                  |j                  j                  |j                   j&                  |z         t	        j                  |j                  t        j                  |j(                        j                  d             nt        |t*              r| j                   j                  }|j"                  dz  d|j                   j,                  z  dz  z  |z  }|j"                  dz  |z  }t	        j
                  |j.                  j                  |       t	        j
                  |j0                  j                  |       t	        j
                  |j2                  j                  |       t	        j
                  |j4                  j                  |       nt        |t6              r| j                   j                  }|j                   j8                  dz  d|j                   j,                  z  dz  z  |z  }d|j                   j8                  z  dz  |z  }t	        j
                  |j:                  j                  |       t	        j
                  |j<                  j                  |       nt        |t>              rt	        j
                  |j@                  j                  |jB                  dz  | j                   j                  z         t	        j
                  |jD                  j                  |jF                  dz  | j                   j                  z         t        |tH        jJ                        r>t	        jL                  |jN                         t	        jP                  |j                         t        |tH        jR                        r-|jN                   t	        jL                  |jN                         y	y	y	)
zInitialize the weightsr   g{Gz?)meanstdrj   ri   r   )r  rg   N)*r`   initializer_factorr>   r   initnormal_r   r   r|   copy_rh   r$   r%   r   r~   r_   ru   rp   rx   initializer_rangerz   r   num_hidden_layersr   r   r   r   r   ro   r   r   CLIPSegModeltext_projectiontext_embed_dimvisual_projectionvision_embed_dimr   r   zeros_rf   ones_r   )rJ   r   factorin_proj_stdout_proj_stdfc_stds         r(   _init_weightsz$CLIPSegPreTrainedModel._init_weights  s    //f34LL//66SftmTLL2299RVWJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 78[[33FLL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9M9M,N,U,UV],^_ 01[[33F!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B
+[[33F!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**<-LL&&--))4/$++2P2PP LL((//++T1DKK4R4RR
 fbll+KK$JJv}}%fbii(V[[-DKK$ .E(r*   N)rL   rM   rN   r   rQ   base_model_prefixinput_modalitiessupports_gradient_checkpointingr   r  r   _can_record_outputsr$   no_gradr  rR   r*   r(   r  r    sJ    (&*#-/BC&
 U]]_)% )%r*   r  c                   f     e Zd ZdZdef fdZ	 d	dej                  dz  dee	   de
ez  fdZ xZS )
CLIPSegEncoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`CLIPSegEncoderLayer`].

    Args:
        config: CLIPSegConfig
    r`   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w )NF)
rm   rn   r`   r   
ModuleListranger  r   layersgradient_checkpointing)rJ   r`   r   r   s      r(   rn   zCLIPSegEncoder.__init__  sP    mm%PVPhPhJi$jQ%8%@$jk&+# %ks   A#Nr   r   r   c                 T    |}| j                   D ]  } |||fi |} t        |      S )a8  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)

        )last_hidden_state)r*  r   )rJ   r   r   r   rU   encoder_layers         r(   r   zCLIPSegEncoder.forward  sH    * &![[ 	M) M	 +
 	
r*   r<   )rL   rM   rN   rO   r   rn   r$   r   r   r   rG   r   r   r   r   s   @r(   r&  r&    sP    ,} , /3
 t+
 +,	

 
	 
r*   r&  c                        e Zd Zdef fdZee	 d	deej                     dej                  de
dz  dee   fd              Z xZS )
CLIPSegDecoderr`   c                 &   t         |   |       |j                  | _        t        j                  |j
                  |j                        | _        t        j                  |j
                  |j                        | _        |j                  r|j                  j                  dz  |j                  j                  dz  f}t        j                  t        j                  |j                  |j                  dd      t        j                         t        j                  |j                  |j                  dz  |d   |d         t        j                         t        j                  |j                  dz  d|d   |d               | _        nPt        j                  |j                  d|j                  j                  |j                  j                        | _        t#        |j$                        }t        j&                  t)        |      D cg c]6  }t        j                  |j                  j*                  |j                        8 c}      | _        t/        j0                  |j                        }|j                  |_        |j2                  |_        |j6                  |_        d	|_        t        j&                  t)        t#        |j$                              D cg c]  }t=        |       c}      | _        | jA                          y c c}w c c}w )
N   r   r   )rd   paddingrg   r   )rd   re   )re   relu)!rm   rn   conditional_layerr   r   projection_dim
reduce_dimfilm_mulfilm_add"use_complex_transposed_convolutionvision_configrr   
Sequentialrv   ReLUConvTranspose2dtransposed_convolutionr&   extract_layersr(  r)  ro   reducescopydeepcopydecoder_num_attention_headsr   decoder_intermediate_sizer   r   r  r*  	post_init)rJ   r`   transposed_kernelsdepthr   decoder_configr   s         r(   rn   zCLIPSegDecoder.__init__$  s[    !'!9!9		&"7"79J9JK		&"7"79J9JK44"("6"6"A"AQ"FH\H\HgHgklHl!m*,--		&++V->->AWXY	""%%%%* 21 5-a0	 	""%%*A;Ma;PYklmYn+D' +-*<*<!!1f&:&:&E&EfNbNbNmNm+D' F))*}}UZ[`UabPQRYYv++779J9JKb
 v';';<%+%6%6"-3-O-O*+1+K+K($*!mmRWX[\b\q\qXrRs$tQ%8%H$tu c %us   ;L	LNrU   rY   output_attentionsr   c                    |d d d   }d }t        t        || j                  | j                              D ]  \  }\  }}	}
| |
|      |z   }n |
|      }|| j                  k(  rJ| j                  |      |j                  ddd      z  | j                  |      z   }|j                  ddd      } |	|fd d d|} |d d dd d d f   j                  ddd      }t        t        j                  |j                  d               }|j                  d   }|j                  ||j                  d   ||      }| j                  |      j                  d      }t        |      S )Nrj   r   r   rg   )r   causal_attention_maskr'   )	enumeratezipr*  rA  r5  r8  r   r9  r   mathsqrtr   r   r?  squeezerT   )rJ   rU   rY   rJ  r   activationsoutputi
activationlayerreducer   r   r   s                 r(   r   zCLIPSegDecoder.forwardP  sp    $DbD).7KVZVbVb8c.d 	^*A*
E6!
+f4
+D***'=>PQSTVWAXX[_[h[h*\   1a06]$d]V\]F	^ 12q!))!Q2499V\\!_-.+11!4
Za$E,,V4<<Q?#622r*   r<   )rL   rM   rN   r   rn   r   r   rG   r$   r   boolr   r   r   r   r   s   @r(   r0  r0  #  sn    *} *X  
 *.	!3U\\*!3 !&!3  $;	!3
 +,!3   !3r*   r0  c                        e Zd Zdef fdZe	 	 	 	 	 	 ddej                  dz  dej                  dz  dej                  dz  dedz  dedz  d	edz  d
e	e
z  fd       Z xZS )CLIPSegTextTransformerr`   c                 
   t         |   |       |j                  }t        |      | _        t        |      | _        t        j                  ||j                        | _
        |j                  | _        | j                          y r   )rm   rn   ro   r   r   r&  encoderr   r   r   final_layer_normeos_token_idrF  r   s      r(   rn   zCLIPSegTextTransformer.__init__w  sf     &&	/7%f- "YF<Q<Q R #//r*   Nr   r   rh   rJ  output_hidden_statesreturn_dictr   c           
      @   ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }|t	        d      |j                         }|j                  d|d         }| j                  ||      }	t        | j                   |	|d       }|j                  dd         | j                  d|	||||dd|}
|
d   }| j                  |      }| j                  d	k(  rm|t        j                  |j                  d   |j                   
      |j#                  t        j$                  |j                         j'                  d      f   }n|t        j                  |j                  d   |j                   
      |j#                  t        j$                  |j                         | j                  k(  j%                         j'                  d      f   }|s
||f|
dd  z   S t)        |||
j*                  |
j,                        S )NzYou have to specify input_idsrj   )r   rh   )r`   r   r   past_key_valuesr   T)r   r   rJ  r_  r`  r   r   rg   r    )r   r!   r   r   r-  pooler_outputrU   rV   rR   )r`   rJ  r_  r`  r   r   r   r   r
   popr\  r]  r^  r$   r%   r   r!   r   r   argmaxr   rU   rV   )rJ   r   r   rh   rJ  r_  r`  r   r   rU   encoder_outputsr-  rZ   s                r(   r   zCLIPSegTextTransformer.forward  s)    2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBY<==nn&NN2{27	),W+;;') 	
 	

;%&$,, 
')/!5#
 
 ,A. 112CD! ..44Q7@Q@X@XY5995F5M5MNUUZ\U]_M ..44Q7@Q@X@XY EII6G6N6NOSWSdSddB!M %}58KKK)/')77&11	
 	
r*   )NNNNNN)rL   rM   rN   r   rn   r   r$   r   rX  rG   r   r   r   r   s   @r(   rZ  rZ  v  s    0   *..2,0)-,0#'K
<<$&K
 t+K
 llT)	K

  $;K
 #TkK
 D[K
 
+	+K
 K
r*   rZ  c                        e Zd ZU eed<   dZddgZdef fdZdej                  fdZ
d Zeee	 	 	 dd
ej                   d	z  dej                   d	z  dej                   d	z  dee   deez  f
d                     Z xZS )CLIPSegTextModelr`   )r
  r   r   c                 d    t         |   |       t        |      | _        | j	                          y r<   )rm   rn   rZ  
text_modelrF  r   s     r(   rn   zCLIPSegTextModel.__init__  s&     08r*   r   c                 B    | j                   j                  j                  S r<   rk  r   r   rI   s    r(   get_input_embeddingsz%CLIPSegTextModel.get_input_embeddings  s    ))999r*   c                 :    || j                   j                  _        y r<   rm  )rJ   r   s     r(   set_input_embeddingsz%CLIPSegTextModel.set_input_embeddings  s    5:""2r*   Nr   r   rh   r   c                 0     | j                   d|||dd|S )a;  
        Examples:

        ```python
        >>> from transformers import AutoTokenizer, CLIPSegTextModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegTextModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```T)r   r   rh   r`  rR   )rk  )rJ   r   r   rh   r   s        r(   r   zCLIPSegTextModel.forward  s2    2 t 
)%	

 
 	
r*   r   )rL   rM   rN   r   rQ   r!  _no_split_modulesrn   r   Modulern  rp  r   r   r   r$   r   r   r   rG   r   r   r   r   s   @r(   ri  ri    s     02GH0 :bii :;   *..2,0	
<<$&
 t+
 llT)	

 +,
 
+	+
    
r*   ri  c                        e Zd Zdef fdZe	 	 	 	 ddej                  dz  dedz  dedz  dedz  dedz  d	e	e
z  fd
       Z xZS )CLIPSegVisionTransformerr`   c                     t         |           || _        |j                  }t	        |      | _        t        j                  ||j                        | _	        t        |      | _        t        j                  ||j                        | _        y r   )rm   rn   r`   ro   r_   r   r   r   r   pre_layrnormr&  r\  post_layernormr   s      r(   rn   z!CLIPSegVisionTransformer.__init__	  sj    &&	1&9LL8M8MN%f- ll9&:O:OPr*   Nr   rJ  r_  r`  r   r   c                    ||n| j                   j                  }||n| j                   j                  }||n| j                   j                  }| j	                  ||      }| j                  |      }| j                  ||||      }|d   }|d d dd d f   }	| j                  |	      }	|s
||	f|dd  z   S t        ||	|j                  |j                        S )N)r   )r   rJ  r_  r`  r   r   rc  )r`   rJ  r_  r`  r   rw  r\  rx  r   rU   rV   )
rJ   r   rJ  r_  r`  r   rU   rg  r-  rZ   s
             r(   r   z CLIPSegVisionTransformer.forward  s     2C1N-TXT_T_TqTq$8$D $++JjJj 	 &1%<k$++BYBYOgh))-8,,'/!5#	 ' 
 ,A.)!Q'2++M:%}58KKK)/')77&11	
 	
r*   )NNNT)rL   rM   rN   r   rn   r   r$   rP   rX  rG   r   r   r   r   s   @r(   ru  ru    s    Q2 Q  *.,0#'04$
''$.$
  $;$
 #Tk	$

 D[$
 #'+$
 
+	+$
 $
r*   ru  c                        e Zd ZU eed<   dZdZdef fdZdej                  fdZ
e ed      e	 	 ddej                  d	z  d
ed	z  dee   deez  fd                     Z xZS )CLIPSegVisionModelr`   r   )r	  c                 d    t         |   |       t        |      | _        | j	                          y r<   )rm   rn   ru  vision_modelrF  r   s     r(   rn   zCLIPSegVisionModel.__init__@  s'     4V<r*   r   c                 B    | j                   j                  j                  S r<   )r}  r   rx   rI   s    r(   rn  z'CLIPSegVisionModel.get_input_embeddingsF  s      ++;;;r*   Ftie_last_hidden_statesNr   r   c                 ,     | j                   d||d|S )a+  
        Examples:

        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, CLIPSegVisionModel

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegVisionModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   rR   )r}  )rJ   r   r   r   s       r(   r   zCLIPSegVisionModel.forwardI  s.    > !t   
%%=
 
 	
r*   )NT)rL   rM   rN   r   rQ   main_input_namer!  rn   r   rs  rn  r   r   r   r$   rP   rX  r   r   rG   r   r   r   r   s   @r(   r{  r{  ;  s    $O!2 <bii <  E2 2604 
''$. 
 #'+ 
 +,	 

 
+	+ 
  3   
r*   r{  c                       e Zd ZU eed<   def fdZeee	 	 dde	j                  de	j                  dz  de	j                  dz  dee   deez  f
d	                     Ze ed
      e	 dde	j                   dedee   deez  fd                     Zee	 	 	 	 	 	 dde	j(                  dz  de	j                   dz  de	j                  dz  de	j(                  dz  dedz  dedee   deez  fd              Z xZS )r  r`   c                    t         |   |       t        |j                  t              s"t        dt        |j                         d      t        |j                  t              s"t        dt        |j                         d      |j                  }|j                  }|j                  |_	        |j                  |_	        |j                  | _
        |j                  | _        |j                  | _        t        |      | _        t!        |      | _        t%        j&                  | j                  | j                  d      | _        t%        j&                  | j                  | j                  d      | _        t%        j,                  t/        j0                  | j2                  j4                              | _        | j9                          y )NzNconfig.text_config is expected to be of type CLIPSegTextConfig but is of type .zRconfig.vision_config is expected to be of type CLIPSegVisionConfig but is of type F)rf   )rm   rn   r>   text_configr   	TypeErrortyper;  r   r   r6  ro   r  r  rZ  rk  ru  r}  r   r   r  r  rs   r$   tensorr`   logit_scale_init_valuelogit_scalerF  )rJ   r`   r  r;  r   s       r(   rn   zCLIPSegModel.__init__s  ss    &,,.?@++,-Q0 
 &..0CD--./q2 
 ((,,+1+F+F(-3-H-H*$33)55 - 9 90=4]C!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<T[[5W5W(XY 	r*   Nr   r   rh   r   r   c                 v     | j                   d|||d|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoTokenizer, CLIPSegModel

        >>> tokenizer = AutoTokenizer.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> inputs = tokenizer(["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```r   r   rh   rR   )rk  rd  r  )rJ   r   r   rh   r   text_outputsrZ   s          r(   get_text_featureszCLIPSegModel.get_text_features  sS    0 4C4?? 4
)%4
 	4
 %22%)%9%9-%H"r*   Fr  r   r   c                 t     | j                   d||d|}|j                  }| j                  |      |_        |S )a  
        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```r  rR   )r}  rd  r  )rJ   r   r   r   vision_outputsrZ   s         r(   get_image_featureszCLIPSegModel.get_image_features  sR    8 6GT5F5F 6
%%=6
 6

 '44'+'='=m'L$r*   return_lossc           	          | j                   d	||d|} | j                  d	|||d|}	|j                  }
|	j                  }|
|
j                  ddd      z  }
||j                  ddd      z  }| j                  j                         }t        j                  ||
j                               |z  }|j                         }d}|rt        |      }t        |||||
|	|      S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegModel
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegModel.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(
        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
        ... )

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```r  r  rg   rj   T)r   r   keepdimN)r3   r4   r5   r6   r7   r8   r9   rR   )r  r  rd  normr  expr$   r   r-   r0   r2   )rJ   r   r   r   rh   r  r   r   r  r  r7   r6   r  r5   r4   r3   s                   r(   r   zCLIPSegModel.forward  s   L 100 
%%=
 
 .t-- 
)%
 	
 &33"00 $l&7&7!T&7&RR!K$4$4qb$$4$OO &&**,,,{LNN4DES*,,.0D-+#%* .
 	
r*   )NNr   )NNNNNT)rL   rM   rN   r   rQ   rn   r   r   r   r$   r   r   r   rG   r   r  rP   rX  r  r   r   r2   r   r   r   s   @r(   r  r  o  s   "} "H   /3,0	<< t+ llT)	
 +, 
+	+    @  E2 *.!''! #'! +,	!
 
+	+!  3  !F  .215.204#')-J
##d*J
 ''$.J
 t+	J

 &&-J
 D[J
 #'J
 +,J
 
	J
  J
r*   r  zn
    CLIPSeg model with a Transformer-based decoder on top for zero-shot and one-shot image segmentation.
    )custom_introc                       e Zd ZU eed<   def fdZ	 	 	 	 	 ddedz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  f
d	Z	e
e	 	 	 	 	 	 	 	 ddej                  dz  d
ej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dej                  dz  dedee   deez  fd              Z xZS )CLIPSegForImageSegmentationr`   c                     t         |   |       || _        t        |      | _        |j
                  | _        t        |      | _        | j                          y r<   )	rm   rn   r`   r  r  r@  r0  decoderrF  r   s     r(   rn   z$CLIPSegForImageSegmentation.__init__7  sI      (	$33%f- 	r*   Nr   r   r   rh   conditional_pixel_valuesc                    |`t        |      |k7  rt        d      t        j                         5  | j                  j                  |||      j                  }d d d        |S |]t        |      |k7  rt        d      t        j                         5  | j                  j                  |      j                  }d d d        |S t        d      # 1 sw Y   S xY w# 1 sw Y   S xY w)Nz@Make sure to pass as many prompt texts as there are query images)r   rh   zAMake sure to pass as many prompt images as there are query imagesz[Invalid conditional, should be either provided as `input_ids` or `conditional_pixel_values`)r&   r   r$   r$  r  r  rd  r  )rJ   r   r   r   rh   r  rY   s          r(   get_conditional_embeddingsz6CLIPSegForImageSegmentation.get_conditional_embeddingsD  s      9~+ !cdd  )-)D)Dn< *E *- '  &% &1+,
: !dee n)-)E)EF^)_)m)m&n &%	 m   &%n &%s   )C&CCC$r   rY   labelsr   r   r   c	                 ^   t        j                         5  d|	d<    | j                  j                  d||d|	}
|
j                  }|
j
                  }| j                  D cg c]
  }||dz       }}t        |
j                  |
j                  |
j
                  |
j                        }
ddd       |$| j                  |j                  d   ||||      }n[|j                  d   |j                  d   k7  rt        d	      |j                  d   | j                  j                  k7  rt        d
       | j                  |fi |	}|j                   }d}|8|j#                  |j$                        }t'        j(                         } |||      }t+        |||
|      S c c}w # 1 sw Y   xY w)a~  
        conditional_pixel_values (`torch.FloatTensor`, *optional*):
            The pixel values of the conditional images.
        conditional_embeddings (`torch.FloatTensor` of shape `(batch_size, config.projection_dim)`, *optional*):
            The conditional embeddings for the query images. If provided, the model will use this instead of computing
            the embeddings from the conditional_pixel_values.
        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).

        Examples:

        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, CLIPSegForImageSegmentation
        >>> from transformers.image_utils import load_image

        >>> processor = AutoProcessor.from_pretrained("CIDAS/clipseg-rd64-refined")
        >>> model = CLIPSegForImageSegmentation.from_pretrained("CIDAS/clipseg-rd64-refined")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> texts = ["a cat", "a remote", "a blanket"]
        >>> inputs = processor(text=texts, images=[image] * len(texts), padding=True, return_tensors="pt")

        >>> with torch.inference_mode():
        ...     outputs = model(**inputs)

        >>> logits = outputs.logits
        >>> print(logits.shape)
        torch.Size([3, 352, 352])
        ```Tr_  r  r   rc  Nr   )r   r   r   rh   r  zWMake sure to pass as many conditional embeddings as there are query images in the batchzcMake sure that the feature dimension of the conditional embeddings matches `config.projection_dim`.)r3   r   rY   rZ   r9   r[   rR   )r$   r$  r  r  rd  rU   r@  r   r-  rV   r  r   r   r`   r6  r  r   r   r!   r   BCEWithLogitsLossrX   )rJ   r   r   r  rY   r   rh   r  r   r   r  rZ   rU   rT  rR  decoder_outputsr   r3   loss_fns                      r(   r   z#CLIPSegForImageSegmentation.forwarda  s   b ]]_ 	-1F)*9TYY99 ))A N
 +88M*88M9=9L9LMA=Q/MKM 8"0"B"B,::,::)44	N	, ")%)%D%D'--a0#-))A &E &" &++A.,2D2DQ2GG m  &++A.$++2L2LL 0  '$,,"
 

 !''YYv}}-F**,G66*D-#9' .*
 	
[ N	 	s   AF# F/9F#F##F,)NNNNN)NNNNNNNT)rL   rM   rN   r   rQ   rn   r   r$   r   r  r   r   rP   r   rX  r   r   rG   r2   r   r   r   s   @r(   r  r  /  s    }  "&)-.2,08<&$J& <<$&& t+	&
 llT)& #(,,"5&:  /315=A;?.204*.)-n
$$t+n
 ''$.n
 #("3"3d":	n

 !& 1 1D 8n
 t+n
 &&-n
   4'n
 #'n
 +,n
 
	n
  n
r*   r  )r  r  ri  r{  r  )r   )GrO   rB  rO  collections.abcr   dataclassesr   typingr   r$   r    r   r  rR  r	   masking_utilsr
   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_clipsegr   r   r   
get_loggerrL   loggerr   r)   r0   r2   rT   rX   rs  r_   r   floatr   r   r   r   r  r  r&  r0  rZ  ri  ru  r{  r  r  __all__rR   r*   r(   <module>r     s      $ !    & ! / 9 K F & X X I 5 X X 
		H	%
`U\\ `ell `
-U\\ -ell - _K _  _@ 7; 7  7 _[ _  _6Pbii Ph%BII %` %II%<<% 
% <<	%
 LL4'% % %.;)ryy ;)~  4 B/")) /d 4%_ 4% 4%p.
RYY .
bP3+ P3fZ
3 Z
z1
- 1
h1
ryy 1
h1
/ 1
h |
) |
 |
~ 
]
"8 ]

]
@r*   