
    i^                       d Z ddlmZ ddlmZ ddlmZ ddlZddlmZm	Z	 ddl
mZ dd	lmZ dd
lmZ ddlmZ ddlmZmZ ddlmZmZ ddlmZ ddlmZmZmZmZm Z m!Z! ddl"m#Z#m$Z$ ddl%m&Z& ddl'm(Z(m)Z)m*Z*  e       rddl+m,Z,  e jZ                  e.      Z/dej                  dej                  fdZ0dej                  dej                  fdZ1ee G d de                    Z2dedefdZ3dedefdZ4d  Z5d! Z6e ed"#       G d$ d%e                    Z7e ed&#       G d' d(e                    Z8 G d) d*e	jr                        Z: G d+ d,e	jr                        Z;	 	 dQd-e	jr                  d.ej                  d/ej                  d0ej                  d1ej                  dz  d2e<dz  d3e<d4ee   fd5Z= G d6 d7e	jr                        Z> G d8 d9e	jr                        Z? G d: d;e      Z@e G d< d=e             ZA G d> d?e	jr                        ZB G d@ dAeA      ZC G dB dCeA      ZD G dD dEeA      ZE G dF dGeA      ZFe G dH dIeA             ZG G dJ dKe	jr                        ZH G dL dMe	jr                        ZI G dN dOeA      ZJg dPZKy)RzPyTorch OWLv2 model.    )Callable)	dataclass)AnyN)Tensornn   )initialization)ACT2FN)create_causal_mask)GradientCheckpointingLayer)BaseModelOutputBaseModelOutputWithPooling)ALL_ATTENTION_FUNCTIONSPreTrainedModel)Unpack)ModelOutputTransformersKwargsauto_docstringis_vision_availablelogging	torch_int)can_return_tuplemerge_with_config_defaults)capture_outputs   )Owlv2ConfigOwlv2TextConfigOwlv2VisionConfig)center_to_corners_formatlogitsreturnc                     t         j                  j                  | t        j                  t        |       | j                              S )Ndevice)r   
functionalcross_entropytorcharangelenr$   )r    s    y/var/www/vps2.regionflexible.com/Desarrollo/venv/lib/python3.12/site-packages/transformers/models/owlv2/modeling_owlv2.pycontrastive_lossr+   6   s/    ==&&vu||CKPVP]P]/^__    
similarityc                 Z    t        |       }t        | j                               }||z   dz  S )Ng       @)r+   t)r-   caption_loss
image_losss      r*   
owlv2_lossr2   ;   s,    #J/L!*,,.1J:%,,r,   c                      e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZeed<   dZeed	<   d
ee   fdZy)Owlv2Outputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
        Contrastive loss for image-text similarity.
    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
        similarity scores.
    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
        similarity scores.
    text_embeds (`torch.FloatTensor` of shape `(batch_size * num_max_text_queries, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
        The image embeddings obtained by applying the projection layer to the pooled output of
        [`Owlv2VisionModel`].
    text_model_output (tuple[`BaseModelOutputWithPooling`]):
        The output of the [`Owlv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Owlv2VisionModel`].
    Nlosslogits_per_imagelogits_per_texttext_embedsimage_embedstext_model_outputvision_model_outputr!   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) yw)r:   r;   Ngetattrto_tuple.0kselfs     r*   	<genexpr>z'Owlv2Output.to_tuple.<locals>.<genexpr>a   =      
  LLDGRYZ^`aRbRkRkRmm
   -0tuplekeysrE   s   `r*   rA   zOwlv2Output.to_tuple`   #     
YY[
 
 	
r,   )__name__
__module____qualname____doc__r5   r'   FloatTensor__annotations__r6   r7   r8   r9   r:   r   r;   rJ   r   rA    r,   r*   r4   r4   A   s    ( &*D%

d
")15e''$.504OU&&-4,0K""T)0-1L%##d*148186:3:
%* 
r,   r4   r/   c                    | j                         r>| j                  t        j                  t        j                  fv r| S | j                         S | j                  t        j                  t        j                  fv r| S | j                         S N)	is_floating_pointdtyper'   float32float64floatint32int64int)r/   s    r*   _upcastr_   h   s`    GGu}}==qL1779LGGU[[99qFquuwFr,   boxesc                 f    t        |       } | dddf   | dddf   z
  | dddf   | dddf   z
  z  S )a  
    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.

    Args:
        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
            < x2` and `0 <= y1 < y2`.

    Returns:
        `torch.FloatTensor`: a tensor containing the area for each box.
    N   r   r   r   )r_   )r`   s    r*   box_arearc   q   sB     ENE!Q$K%1+%%1+ad*CDDr,   c                 ^   t        |       }t        |      }t        j                  | d d d d df   |d d d df         }t        j                  | d d d dd f   |d d dd f         }||z
  j	                  d      }|d d d d df   |d d d d df   z  }|d d d f   |z   |z
  }||z  }	|	|fS )Nrb   r   minr   )rc   r'   maxrf   clamp)
boxes1boxes2area1area2left_topright_bottomwidth_heightinterunionious
             r*   box_iours      s    VEVEyy4!,fQUm<H99VAtQRK0&AB-@L 8+22q29LAq!LAq$99E!T'NU"U*E
%-C:r,   c                    | ddddf   | ddddf   k\  j                         st        d|        |ddddf   |ddddf   k\  j                         st        d|       t        | |      \  }}t        j                  | dddddf   |ddddf         }t        j
                  | dddddf   |ddddf         }||z
  j                  d      }|dddddf   |dddddf   z  }|||z
  |z  z
  S )z
    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.

    Returns:
        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
    Nrb   z<boxes1 must be in [x0, y0, x1, y1] (corner) format, but got z<boxes2 must be in [x0, y0, x1, y1] (corner) format, but got r   re   r   )all
ValueErrorrs   r'   rf   rg   rh   )ri   rj   rr   rq   top_leftbottom_rightro   areas           r*   generalized_box_iourz      s*    1ab5MVArrE]*//1WX^W_`aa1ab5MVArrE]*//1WX^W_`aa(JCyy4!,fQUm<H99VAtQRK0&AB-@L 8+22q29L1a <1a#88D$,$&&&r,   z5
    Output type of [`Owlv2ForObjectDetection`].
    )custom_introc                   l   e Zd ZU dZdZej                  dz  ed<   dZe	dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed	<   dZej                  dz  ed
<   dZeed<   dZeed<   dee   fdZy)Owlv2ObjectDetectionOutputa  
    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
        scale-invariant IoU loss.
    loss_dict (`Dict`, *optional*):
        A dictionary containing the individual losses. Useful for logging.
    logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
        Classification logits (including no-object) for all queries.
    objectness_logits (`torch.FloatTensor` of shape `(batch_size, num_patches, 1)`):
        The objectness logits of all image patches. OWL-ViT represents images as a set of image patches where the
        total number of patches is (image_size / patch_size)**2.
    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
        possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to retrieve the
        unnormalized bounding boxes.
    text_embeds (`torch.FloatTensor` of shape `(batch_size, num_max_text_queries, output_dim`):
        The text embeddings obtained by applying the projection layer to the pooled output of [`Owlv2TextModel`].
    image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
        Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes image
        embeddings for each patch.
    class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total
        number of patches is (image_size / patch_size)**2.
    text_model_output (tuple[`BaseModelOutputWithPooling`]):
        The output of the [`Owlv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Owlv2VisionModel`].
    Nr5   	loss_dictr    objectness_logits
pred_boxesr8   r9   class_embedsr:   r;   r!   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) ywr>   r?   rB   s     r*   rF   z6Owlv2ObjectDetectionOutput.to_tuple.<locals>.<genexpr>   rG   rH   rI   rL   s   `r*   rA   z#Owlv2ObjectDetectionOutput.to_tuple   rM   r,   )rN   rO   rP   rQ   r5   r'   rR   rS   r~   dictr    r   r   r8   r9   r   r:   r   r;   rJ   r   rA   rT   r,   r*   r}   r}      s    > &*D%

d
")!Itd{!'+FE$+26u((4/6+/J!!D(/,0K""T)0-1L%##d*1-1L%##d*148186:3:
%* 
r,   r}   zL
    Output type of [`Owlv2ForObjectDetection.image_guided_detection`].
    c                   0   e Zd ZU dZdZej                  dz  ed<   dZej                  dz  ed<   dZ	ej                  dz  ed<   dZ
ej                  dz  ed<   dZej                  dz  ed<   dZej                  dz  ed<   dZeed	<   dZeed
<   dee   fdZy)%Owlv2ImageGuidedObjectDetectionOutputa  
    logits (`torch.FloatTensor` of shape `(batch_size, num_patches, num_queries)`):
        Classification logits (including no-object) for all queries.
    image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
        Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes
        image embeddings for each patch.
    query_image_embeds (`torch.FloatTensor` of shape `(batch_size, patch_size, patch_size, output_dim`):
        Pooled output of [`Owlv2VisionModel`]. OWLv2 represents images as a set of image patches and computes
        image embeddings for each patch.
    target_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual target image in the batch
        (disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to
        retrieve the unnormalized bounding boxes.
    query_pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_patches, 4)`):
        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
        values are normalized in [0, 1], relative to the size of each individual query image in the batch
        (disregarding possible padding). You can use [`~Owlv2ImageProcessor.post_process_object_detection`] to
        retrieve the unnormalized bounding boxes.
    class_embeds (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_size)`):
        Class embeddings of all image patches. OWLv2 represents images as a set of image patches where the total
        number of patches is (image_size / patch_size)**2.
    text_model_output (tuple[`BaseModelOutputWithPooling`]):
        The output of the [`Owlv2TextModel`].
    vision_model_output (`BaseModelOutputWithPooling`):
        The output of the [`Owlv2VisionModel`].
    Nr    r9   query_image_embedstarget_pred_boxesquery_pred_boxesr   r:   r;   r!   c                 H     t         fd j                         D              S )Nc              3   d   K   | ]'  }|d vr|   nt        |      j                          ) ywr>   r?   rB   s     r*   rF   zAOwlv2ImageGuidedObjectDetectionOutput.to_tuple.<locals>.<genexpr>  rG   rH   rI   rL   s   `r*   rA   z.Owlv2ImageGuidedObjectDetectionOutput.to_tuple  rM   r,   )rN   rO   rP   rQ   r    r'   rR   rS   r9   r   r   r   r   r:   r   r;   rJ   r   rA   rT   r,   r*   r   r      s    8 (,FE$+-1L%##d*137))D0726u((4/615e''$.5-1L%##d*148186:3:
%* 
r,   r   c                        e Zd Zdef fdZdej                  dededej                  fdZddej                  d	e
dej                  fd
Z xZS )Owlv2VisionEmbeddingsconfigc                    t         |           |j                  | _        || _        |j                  | _        t        j                  t        j                  |j                              | _
        t        j                  |j                  | j
                  |j                  |j                  d      | _        |j                  |j                  z  dz  | _        | j                  dz   | _        t        j"                  | j                   | j
                        | _        | j'                  dt        j(                  | j                         j+                  d      d       y )NF)in_channelsout_channelskernel_sizestridebiasrb   r   position_idsr   
persistent)super__init__
patch_sizer   hidden_size	embed_dimr   	Parameterr'   randnclass_embeddingConv2dnum_channelspatch_embedding
image_sizenum_patchesnum_positions	Embeddingposition_embeddingregister_bufferr(   expandrE   r   	__class__s     r*   r   zOwlv2VisionEmbeddings.__init__  s    ++++!||EKK8J8J,KL!yy++))$$ 
 #--1B1BBqH!--1"$,,t/A/A4>>"R^U\\$:L:L-M-T-TU\-]jopr,   
embeddingsheightwidthr!   c                    |j                   d   dz
  }| j                  j                  j                  d      }|j                   d   dz
  }t        j
                  j                         s%||k(  r ||k(  r| j                  | j                        S |ddddf   }|ddddf   }|j                   d   }	|| j                  z  }
|| j                  z  }t        |dz        }|j                  d|||	      }|j                  dddd      }t        j                  j                  ||
|fdd	
      }|j                  dddd      j                  dd|	      }t	        j                   ||fd      S )a   
        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
        images. This method is also adapted to support torch.jit tracing.

        Adapted from:
        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
        r   r   Nr   g      ?r   rb   bicubicF)sizemodealign_cornersdim)shaper   weight	unsqueezer'   jit
is_tracingr   r   r   reshapepermuter   r%   interpolateviewcat)rE   r   r   r   r   r   r   class_pos_embedpatch_pos_embedr   
new_height	new_widthsqrt_num_positionss                r*   interpolate_pos_encodingz.Owlv2VisionEmbeddings.interpolate_pos_encoding.  sv    !&&q)A-!44;;EEaH*003a7 yy##%+*F6UZ?**4+<+<==,QU3,QU3r"t.
T__,	&}c'9:)11!5GI[]`a)11!Q1=--33i(	 4 
 *11!Q1=BB1b#Nyy/?;CCr,   pixel_valuesr   c                 h   |j                   \  }}}}| j                  |      }|j                  d      j                  dd      }| j                  j                  |dd      }t        j                  ||gd      }	|r|	| j                  |	||      z   }	|	S |	| j                  | j                        z   }	|	S )Nrb   r   r   r   )r   r   flatten	transposer   r   r'   r   r   r   r   )
rE   r   r   
batch_size_r   r   patch_embedsr   r   s
             r*   forwardzOwlv2VisionEmbeddings.forwardW  s    '3'9'9$
Avu++L9#++A.88A>++22:q"EYYl;C
##d&C&CJPVX]&^^J  $d&=&=d>O>O&PPJr,   F)rN   rO   rP   r   r   r'   r   r^   r   rR   boolr   __classcell__r   s   @r*   r   r     sm    q0 q*'D5<< 'D 'DUX 'D]b]i]i 'DRE$5$5 QU bgbnbn r,   r   c            	            e Zd Zdef fdZ	 	 	 d	dej                  dz  dej                  dz  dej                  dz  dej                  fdZ	 xZ
S )
Owlv2TextEmbeddingsr   c                 ^   t         |           t        j                  |j                  |j
                        | _        t        j                  |j                  |j
                        | _        | j                  dt        j                  |j                        j                  d      d       y )Nr   r   Fr   )r   r   r   r   
vocab_sizer   token_embeddingmax_position_embeddingsr   r   r'   r(   r   r   s     r*   r   zOwlv2TextEmbeddings.__init__g  s    !||F,=,=v?Q?QR"$,,v/M/MvOaOa"b 	ELL)G)GHOOPWXej 	 	
r,   N	input_idsr   inputs_embedsr!   c                     ||j                   d   n|j                   d   }|| j                  d d d |f   }|| j                  |      }| j                  |      }||z   }|S )Nr   )r   r   r   r   )rE   r   r   r   
seq_lengthposition_embeddingsr   s          r*   r   zOwlv2TextEmbeddings.forwardq  s{     -6,AY__R(}GZGZ[]G^
,,Q^<L  00;M"55lC"%88
r,   NNN)rN   rO   rP   r   r   r'   
LongTensorrR   r   r   r   r   s   @r*   r   r   f  sj    
 
 .20426	##d* &&- ((4/	
 
r,   r   modulequerykeyvalueattention_maskscalingdropoutkwargsc                    ||j                  d      dz  }t        j                  ||j                  dd            |z  }|||z   }t        j
                  j                  |d      }t        j
                  j                  ||| j                        }t        j                  ||      }	|	j                  dd      j                         }	|	|fS )Nr         rb   r   r   )ptrainingr   )
r   r'   matmulr   r   r%   softmaxr   r   
contiguous)
r   r   r   r   r   r   r   r   attn_weightsattn_outputs
             r*   eager_attention_forwardr     s     **R.D( <<s}}Q':;gEL!#n4==((2(>L==((6??([L,,|U3K''1-88:K$$r,   c                        e Zd ZdZ fdZ	 d	dej                  dej                  dz  dee   de	ej                  ej                  dz  f   fdZ
 xZS )
Owlv2Attentionz=Multi-headed attention from 'Attention Is All You Need' paperc                    t         |           || _        |j                  | _        |j
                  | _        | j                  | j                  z  | _        | j                  | j                  z  | j                  k7  r&t        d| j                   d| j                   d      | j                  dz  | _	        |j                  | _        d| _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        t        j                  | j                  | j                        | _        y )Nz;embed_dim must be divisible by num_heads (got `embed_dim`: z and `num_heads`: z).r   F)r   r   r   r   r   num_attention_heads	num_headshead_dimrv   scaleattention_dropoutr   	is_causalr   Lineark_projv_projq_projout_projr   s     r*   r   zOwlv2Attention.__init__  s   ++33$..8==4>>)T^^;MdnnM] ^NN#2'  ]]D(
//ii?ii?ii?		$..$..Ar,   Nhidden_statesr   r   r!   c                    |j                   d d }g |d| j                  } | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      } | j                  |      j                  | j	                  dd      }t        j                  | j                  j                  t              }	 |	| ||||f| j                  | j                  sdn| j                  d|\  }
} |
j                  g |d j!                         }
| j#                  |
      }
|
|fS )Nr   r   rb           )r   r   )r   r   r   r   r   r   r   r   get_interfacer   _attn_implementationr   r   r   r   r   r   r  )rE   r  r   r   input_shapehidden_shapequery_states
key_statesvalue_statesattention_interfacer   r   s               r*   r   zOwlv2Attention.forward  sQ    $))#2.88b8$--86t{{=166EOOPQSTU4T[[/44lCMMaQRS
6t{{=166EOOPQSTU(?(M(MKK,,.E)
 %8	%
 JJ#}}C$,,	%
 	%
!\ *k));;;;FFHmmK0L((r,   rV   )rN   rO   rP   rQ   r   r'   r   r   r   rJ   r   r   r   s   @r*   r   r     sf    GB. /3)||) t+) +,	)
 
u||U\\D00	1)r,   r   c                   V     e Zd Z fdZdej
                  dej
                  fdZ xZS )Owlv2MLPc                    t         |           || _        t        |j                     | _        t        j                  |j                  |j                        | _
        t        j                  |j                  |j                        | _        y rV   )r   r   r   r
   
hidden_actactivation_fnr   r   r   intermediate_sizefc1fc2r   s     r*   r   zOwlv2MLP.__init__  sd    #F$5$5699V//1I1IJ99V55v7I7IJr,   r  r!   c                 l    | j                  |      }| j                  |      }| j                  |      }|S rV   )r  r  r  )rE   r  s     r*   r   zOwlv2MLP.forward  s4    /**=9/r,   )rN   rO   rP   r   r'   r   r   r   r   s   @r*   r  r    s$    KU\\ ell r,   r  c                        e Zd Zdeez  f fdZdej                  dej                  dee	   dej                  fdZ xZS )Owlv2EncoderLayerr   c                 D   t         |           |j                  | _        t	        |      | _        t        j                  | j                  |j                        | _	        t        |      | _        t        j                  | j                  |j                        | _        y Neps)r   r   r   r   r   	self_attnr   	LayerNormlayer_norm_epslayer_norm1r  mlplayer_norm2r   s     r*   r   zOwlv2EncoderLayer.__init__  sm    ++'/<<F<Q<QRF#<<F<Q<QRr,   r  r   r   r!   c                     |}| j                  |      } | j                  d||d|\  }}||z   }|}| j                  |      }| j                  |      }||z   }|S )N)r  r   rT   )r  r  r!  r   )rE   r  r   r   residualr   s         r*   r   zOwlv2EncoderLayer.forward  s     !((7)4>> 
')
 
q
 !=0 ((7/ =0r,   )rN   rO   rP   r   r   r   r'   r   r   r   rR   r   r   r   s   @r*   r  r    sV    S0?B S||  +,	
 
		r,   r  c                       e Zd ZU eed<   dZdZdZdZdZ	dZ
dZdgZeedZddgZ ej$                         d	ej(                  fd
       Zy)Owlv2PreTrainedModelr   owlv2)imagetextTr  )r  
attentionsz&.*text_model\.embeddings\.position_idsz(.*vision_model\.embeddings\.position_idsr   c                 $   | j                   j                  }t        |t              rt	        j
                  |j                  j                  d|dz         t	        j
                  |j                  j                  d|dz         t	        j                  |j                  t        j                  |j                  j                  d         j                  d             nt        |t              rt	        j
                  |j                   d|j"                  dz  |z         t	        j
                  |j$                  j                  |j                   j&                  |z         t	        j
                  |j                  j                  |j                   j&                  |z         t	        j                  |j                  t        j                  |j                  j                  d         j                  d             nt        |t(              r|j"                  dz  d|j                   j*                  z  dz  z  |z  }|j"                  dz  |z  }t	        j
                  |j,                  j                  |       t	        j
                  |j.                  j                  |       t	        j
                  |j0                  j                  |       t	        j
                  |j2                  j                  |       nt        |t4              r|j                   j6                  dz  d|j                   j*                  z  dz  z  |z  }d|j                   j6                  z  dz  |z  }t	        j
                  |j8                  j                  |       t	        j
                  |j:                  j                  |       nt        |t<              rt	        j
                  |j>                  j                  |j@                  dz  |z         t	        j
                  |jB                  j                  |jD                  dz  |z         t	        jF                  |jH                  | j                   jJ                         nTt        |tL              rDt	        j                  |jN                  |jQ                  |jR                  |jT                               t        |tV        jX                        r>t	        jZ                  |j\                         t	        j^                  |j                         t        |tV        j`                        rOt	        j
                  |j                  d|       |j\                   t	        jZ                  |j\                         y	y	y	)
zInitialize the weightsr  g{Gz?)meanstdr   r   r   )r,  rb   N)1r   initializer_factor
isinstancer   initnormal_r   r   r   copy_r   r'   r(   r   r   r   r   r   r   initializer_ranger   num_hidden_layersr   r   r   r  r  r   r  r  
Owlv2Modeltext_projectiontext_embed_dimvisual_projectionvision_embed_dim	constant_logit_scalelogit_scale_init_valueOwlv2ForObjectDetectionbox_biascompute_box_biasnum_patches_heightnum_patches_widthr   r  zeros_r   ones_r   )rE   r   factorin_proj_stdout_proj_stdfc_stds         r*   _init_weightsz"Owlv2PreTrainedModel._init_weights#  s    //f12LL//66SftmTLL2299RVWJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh 56LL//cv?O?OQU?UX^?^_LL//66FMM<[<[^d<deLL2299v}}?^?^ag?ghJJv**ELL9L9L9R9RSU9V,W,^,^_f,gh/!++T1q6==;Z;Z7Z_c6cdgmmK",,d2f<LLL--;?LL--;?LL--;?LL//\B)!==44d:FMMDcDc@chl?lmpvvK&--333<vEFLL**7LL**<
+LL&&--))4/&8 LL((//++T1F: NN6--t{{/Q/QR 78JJv(?(?@Y@Y[a[s[s(tufbll+KK$JJv}}%fbii(LLSf={{&FKK( ' )r,   N)rN   rO   rP   r   rS   base_model_prefixinput_modalitiessupports_gradient_checkpointing_supports_sdpa_supports_flash_attn_supports_flex_attn_supports_attention_backend_no_split_modulesr  r   _can_record_outputs"_keys_to_ignore_on_load_unexpectedr'   no_gradr   ModulerG  rT   r,   r*   r%  r%    s     (&*#N"&,-*$
 	23*&
 U]]_*)BII *) *)r,   r%  c                   `     e Zd ZdZdef fdZ	 d	dej                  dz  dee	   de
fdZ xZS )
Owlv2Encoderz
    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
    [`Owlv2EncoderLayer`].

    Args:
        config: Owlv2Config
    r   c                     t         |           || _        t        j                  t        |j                        D cg c]  }t        |       c}      | _        d| _	        y c c}w NF)
r   r   r   r   
ModuleListranger3  r  layersgradient_checkpointing)rE   r   r   r   s      r*   r   zOwlv2Encoder.__init__[  sO    mmfNfNfHg$h1%6v%>$hi&+# %is   A#Nr   r   r!   c                 T    |}| j                   D ]  } |||fi |} t        |      S )a7  
        Args:
            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
                than the model's internal embedding lookup matrix.
            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:

                - 1 for tokens that are **not masked**,
                - 0 for tokens that are **masked**.

                [What are attention masks?](../glossary#attention-mask)
        )last_hidden_state)rZ  r   )rE   r   r   r   r  encoder_layers         r*   r   zOwlv2Encoder.forwarda  sH    ( &![[ 	M) M	 +
 	
r,   rV   )rN   rO   rP   rQ   r   r   r'   r   r   r   r   r   r   r   s   @r*   rU  rU  R  sK    ,{ , /3
 t+
 +,	

 

r,   rU  c                        e Zd Zdef fdZe ed      e	 	 	 ddej                  dz  dej                  dz  dej                  dz  d	e
e   d
eez  f
d                     Z xZS )Owlv2TextTransformerr   c                     t         |   |       |j                  }t        |      | _        t        |      | _        t        j                  ||j                        | _
        | j                          y r  )r   r   r   r   r   rU  encoderr   r  r  final_layer_norm	post_init)rE   r   r   r   s      r*   r   zOwlv2TextTransformer.__init__  sX     &&	-f5#F+ "YF<Q<Q R 	r,   Ftie_last_hidden_statesNr   r   r   r   r!   c                 :   |j                         }|j                  d|d         }| j                  ||      }t        | j                  ||d      }|j                  dd        | j                  d||dd|}|j                  }| j                  |      }|t        j                  |j                  d   |j                  	      |j                  t        j                        j                  d
      j                  |j                        f   }	t!        ||	      S )a|  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)
        r   )r   r   N)r   r   r   past_key_valuesr   T)r   r   r   r   r#   r   r]  pooler_outputrT   )r   r   r   r   r   poprb  r]  rc  r'   r(   r   r$   tor^   argmaxr   )
rE   r   r   r   r   r  r  encoder_outputsr]  pooled_outputs
             r*   r   zOwlv2TextTransformer.forward  s       nn&NN2{27	),W+;;') 	
 	

;%+74<< ,
'),
 	,
 ,== 112CD *LL*003<M<T<TULL#**r*2556G6N6NOQ

 */'
 	
r,   r   )rN   rO   rP   r   r   r   r   r   r'   r   r   r   rJ   r   r   r   r   s   @r*   r`  r`    s    	 	  E2 *..2,0	-
<<$&-
 t+-
 llT)	-

 +,-
 
+	+-
  3  -
r,   r`  c                        e Zd ZU eed<   dZdef fdZdej                  fdZ	d Z
e	 	 ddej                  dz  d	ej                  dz  d
ee   deez  fd       Z xZS )Owlv2TextModelr   )r(  c                 d    t         |   |       t        |      | _        | j	                          y rV   )r   r   r`  
text_modelrd  r   s     r*   r   zOwlv2TextModel.__init__  s&     .v6r,   r!   c                 B    | j                   j                  j                  S rV   rs  r   r   rL   s    r*   get_input_embeddingsz#Owlv2TextModel.get_input_embeddings  s    ))999r,   c                 :    || j                   j                  _        y rV   ru  )rE   r   s     r*   set_input_embeddingsz#Owlv2TextModel.set_input_embeddings  s    5:""2r,   Nr   r   r   c                 ,     | j                   d||d|S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        Examples:
        ```python
        >>> from transformers import AutoProcessor, Owlv2TextModel

        >>> model = Owlv2TextModel.from_pretrained("google/owlv2-base-patch16")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
        ... )
        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled (EOS token) states
        ```r   r   rT   )rs  )rE   r   r   r   s       r*   r   zOwlv2TextModel.forward  s,    6 t 
)
 
 	
r,   NN)rN   rO   rP   r   rS   rI  r   r   rS  rv  rx  r   r'   r   r   r   rJ   r   r   r   r   s   @r*   rq  rq    s      :bii :;  *..2
<<$&
 t+
 +,	

 
+	+
 
r,   rq  c                        e Zd Zdef fdZe ed      e	 ddej                  de
dz  dee   d	eez  fd
                     Z xZS )Owlv2VisionTransformerr   c                 D   t         |   |       t        |      | _        t	        j
                  |j                  |j                        | _        t        |      | _
        t	        j
                  |j                  |j                        | _        | j                          y r  )r   r   r   r   r   r  r   r  pre_layernormrU  rb  post_layernormrd  r   s     r*   r   zOwlv2VisionTransformer.__init__  sr     /7\\&*<*<&BWBWX#F+ ll6+=+=6CXCXY 	r,   Fre  r   r   Nr   r!   c                 T   | j                   j                  j                  j                  }|j	                  |      }| j                  ||      }| j                  |      } | j                  dd|i|}|j                  }|d d dd d f   }| j                  |      }t        ||      S )N)r   r   r   ri  rT   )
r   r   r   rX   rl  r  rb  r]  r  r   )	rE   r   r   r   expected_input_dtyper  rn  r]  ro  s	            r*   r   zOwlv2VisionTransformer.forward  s      $>>EEKK#';<Ogh**=9+74<< ,
',
,

 ,==)!Q'2++M:)/'
 	
r,   r   )rN   rO   rP   r   r   r   r   r   r'   rR   r   r   r   rJ   r   r   r   r   s   @r*   r}  r}    sz    	0 	  E2 16
''
 #'+
 +,	

 
+	+
  3  
r,   r}  c            
            e Zd ZU eed<   dZdZdef fdZdej                  fdZ
e	 	 ddej                  dz  ded	ee   defd
       Z xZS )Owlv2VisionModelr   r   )r'  c                 d    t         |   |       t        |      | _        | j	                          y rV   )r   r   r}  vision_modelrd  r   s     r*   r   zOwlv2VisionModel.__init__'  s'     26:r,   r!   c                 B    | j                   j                  j                  S rV   )r  r   r   rL   s    r*   rv  z%Owlv2VisionModel.get_input_embeddings-  s      ++;;;r,   Nr   r   c                 ,     | j                   d||d|S )a#  
        Examples:
        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Owlv2VisionModel

        >>> model = Owlv2VisionModel.from_pretrained("google/owlv2-base-patch16")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))

        >>> inputs = processor(images=image, return_tensors="pt")

        >>> outputs = model(**inputs)
        >>> last_hidden_state = outputs.last_hidden_state
        >>> pooled_output = outputs.pooler_output  # pooled CLS states
        ```r   r   rT   )r  )rE   r   r   r   s       r*   r   zOwlv2VisionModel.forward0  s.    8 !t   
%%=
 
 	
r,   rW  )rN   rO   rP   r   rS   main_input_namerI  r   r   rS  rv  r   r'   rR   r   r   r   r   r   r   r   s   @r*   r  r  "  s    $O!0 <bii <  26).
''$.
 #'
 +,	

 
$
 
r,   r  c                       e Zd ZU eed<   def fdZee	 ddej                  dej                  dz  de
e   deez  fd              Zee	 dd	ej                  d
ede
e   deez  fd              Zee	 	 	 	 	 	 ddej"                  dz  d	ej$                  dz  dej                  dz  dedz  d
ededz  de
e   deez  fd              Z xZS )r4  r   c                 L   t         |   |       |j                  }|j                  }|j                  | _        |j
                  | _        |j
                  | _        t        |      | _	        t        |      | _        t        j                  | j                  | j                  d      | _        t        j                  | j                  | j                  d      | _        t        j                   t#        j$                  |j&                              | _        | j+                          y )NF)r   )r   r   text_configvision_configprojection_dimr   r6  r8  r`  rs  r}  r  r   r   r7  r5  r   r'   tensorr;  r:  rd  )rE   r   r  r  r   s       r*   r   zOwlv2Model.__init__X  s     ((,,$33)55 - 9 9.{;2=A!#4+@+@$BUBU\a!b!yy)<)<d>Q>QX]^<<V5R5R(ST 	r,   Nr   r   r   r!   c                 t     | j                   d||d|}|j                  }| j                  |      |_        |S )a  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids)

        Examples:
        ```python
        >>> import torch
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> inputs = processor(
        ...     text=[["a photo of a cat", "a photo of a dog"], ["photo of a astranaut"]], return_tensors="pt"
        ... )
        >>> with torch.inference_mode():
        ...     text_features = model.get_text_features(**inputs)
        ```rz  rT   )rs  rj  r5  )rE   r   r   r   text_outputsro  s         r*   get_text_featureszOwlv2Model.get_text_featuresl  sP    6 4C4?? 4
)4
 4

 %22%)%9%9-%H"r,   r   r   c                 p     | j                   d||d|}| j                  |j                        |_        |S )a  
        Examples:
        ```python
        >>> import torch
        >>> from transformers.image_utils import load_image
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> image = load_image(url)

        >>> inputs = processor(images=image, return_tensors="pt")
        >>> with torch.inference_mode():
        ...     image_features = model.get_image_features(**inputs)
        ```r  rT   )r  r7  rj  )rE   r   r   r   vision_outputss        r*   get_image_featureszOwlv2Model.get_image_features  sM    2 6GT5F5F 6
%%=6
 6

 (,'='=n>Z>Z'[$r,   return_lossreturn_base_image_embedsc           	      |    | j                   d	||d|} | j                  d	||d|}	|	j                  }
| j                  |
      }
|j                  }| j	                  |      }|t
        j                  j                  |ddd      z  }|
t
        j                  j                  |
ddd      z  }| j                  j                         j                  |j                        }t        j                  ||j                               |z  }|j                         }d}|rt        |      }|}
t        ||||
||	|      S )
a  
        return_loss (`bool`, *optional*):
            Whether or not to return the contrastive loss.
        return_base_image_embeds (`bool`, *optional*):
            Whether or not to return the base image embeddings.

        Examples:
        ```python
        >>> from PIL import Image
        >>> import httpx
        >>> from io import BytesIO
        >>> from transformers import AutoProcessor, Owlv2Model

        >>> model = Owlv2Model.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(text=[["a photo of a cat", "a photo of a dog"]], images=image, return_tensors="pt")
        >>> outputs = model(**inputs)
        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
        ```r  rz  rb   r   T)ordr   keepdimN)r5   r6   r7   r8   r9   r:   r;   rT   )r  rs  rj  r5  r7  r'   linalgnormr:  exprl  r$   r   r/   r2   r4   )rE   r   r   r   r  r   r  r   r  r  r8   r9   text_embeds_normr:  r7   r6   r5   s                    r*   r   zOwlv2Model.forward  sd   F 6GT5F5F 6
%%=6
 6
 4C4?? 4
)4
 4
 #00**;7%33--l; $ell&7&7!QS]a&7&bb&):):;ASU_c):)dd &&**,//0C0CD,,'79IJ[X*,,.o.D&-+#%* .
 	
r,   rV   r   )NNNNFN)rN   rO   rP   r   rS   r   r   r   r'   r   r   r   rJ   r   r  r   r  r   rR   r4   r   r   r   s   @r*   r4  r4  S  s    { (  /3!<<! t+! +,	!
 
+	+!  !F  */ll #' +,	
 
+	+  @  .215.2#').04K
##d*K
 ''$.K
 t+	K

 D[K
 #'K
 #'+K
 +,K
 
	K
  K
r,   r4  c                   b     e Zd Zddedef fdZdej                  dej                  fdZ	 xZ
S )Owlv2BoxPredictionHeadr   out_dimc                 "   t         |           |j                  j                  }t	        j
                  ||      | _        t	        j
                  ||      | _        t	        j                         | _	        t	        j
                  ||      | _
        y rV   )r   r   r  r   r   r   dense0dense1GELUgeludense2)rE   r   r  r   r   s       r*   r   zOwlv2BoxPredictionHead.__init__  sb    $$00iiu-iiu-GGI	iiw/r,   image_featuresr!   c                     | j                  |      }| j                  |      }| j                  |      }| j                  |      }| j                  |      }|S rV   )r  r  r  r  )rE   r  outputs      r*   r   zOwlv2BoxPredictionHead.forward  sM    ^,6"V$6"V$r,   )   )rN   rO   rP   r   r^   r   r'   r   rR   r   r   r   s   @r*   r  r    s3    0{ 0S 0ell u7H7H r,   r  c            	            e Zd Zdef fdZdej                  dej                  dz  dej                  dz  deej                     fdZ	 xZ
S )	Owlv2ClassPredictionHeadr   c                    t         |           |j                  j                  }|j                  j                  | _        t        j                  | j
                  |      | _        t        j                  | j
                  d      | _	        t        j                  | j
                  d      | _
        t        j                         | _        y )Nr   )r   r   r  r   r  	query_dimr   r   r  logit_shiftr:  ELUelu)rE   r   r  r   s      r*   r   z!Owlv2ClassPredictionHead.__init__  s    $$00--99ii899T^^Q799T^^Q7668r,   r9   query_embedsN
query_maskr!   c                 0   | j                  |      }|S|j                  }|j                  d d \  }}t        j                  ||| j
                  f      j                  |      }||fS |t        j                  j                  |dd      dz   z  }|t        j                  j                  |dd      dz   z  }t        j                  d||      }| j                  |      }	| j                  |      }
| j                  |
      dz   }
||	z   |
z  }||j                  dkD  rt        j                  |d	      }t        j                  |d
k(  t        j                   |j"                        j$                  |      }|j                  t        j&                        }||fS )Nrb   r   T)r   r  gư>z...pd,...qd->...pqr   r   r   r   )r  r$   r   r'   zerosr  rl  r  r  einsumr  r:  r  ndimr   wherefinforX   rf   rY   )rE   r9   r  r  image_class_embedsr$   r   r   pred_logitsr  r:  s              r*   r   z Owlv2ClassPredictionHead.forward$  s    "[[6'..F&8&>&>r&B#J++z;&OPSSTZ[K!344 05<<3D3DEW]_im3D3nqu3uv#u||'8'82W['8'\_c'cd ll#79K\Z &&|4&&|4hh{+a/"[0K?!""__ZR@
++jAou{{;CTCT7U7Y7Y[fgK%..7K/00r,   )rN   rO   rP   r   r   r'   rR   r   rJ   r   r   r   s   @r*   r  r    s_    	{ 	!1''!1 ''$.!1 LL4'	!1
 
u  	!!1r,   r  c                       e Zd ZU eed<   def fdZedededej                  fd       Z
dej                  dej                  fdZdededej                  fd	Z	 dd
ej                  dej                  dedej                  fdZ	 	 dd
ej                  dej                  dz  dej                  dz  deej                     fdZ	 ddej                  dej                  dej                  dedee   deej                     fdZ	 ddej                  dedee   deej                     fdZ	 ddej                  dej                  dedej                  fdZee	 	 d dej                  dej                  dz  dedee   def
d              Zee	 	 d dej                  dej                  dej                  dz  dedee   defd              Z xZS )!r<  r   c                    t         |   |       t        |      | _        t	        |      | _        t        |      | _        t        |d      | _        t        j                  |j                  j                  |j                  j                        | _        t        j                         | _        || _        | j"                  j                  j$                  | j"                  j                  j&                  z  | _        | j"                  j                  j$                  | j"                  j                  j&                  z  | _        | j-                  d| j/                  | j(                  | j*                        d       | j1                          y )Nr   )r  r  r=  Fr   )r   r   r4  r&  r  
class_headr  box_headobjectness_headr   r  r  r   r  
layer_normSigmoidsigmoidr   r   r   r?  r@  r   r>  rd  r   s     r*   r   z Owlv2ForObjectDetection.__init__K  s    '
26:.v65faH,,v';';'G'GVMaMaMpMpqzz|"&++";";"F"F$++JcJcJnJn"n!%!:!:!E!EIbIbImIm!m--d.E.EtG]G]^kp 	 	

 	r,   r?  r@  r!   c                 j   t        j                  d|dz   t         j                        }t        j                  d| dz   t         j                        }t        j                  ||d      \  }}t        j                  ||fd      }|dxx   |z  cc<   |dxx   | z  cc<   |j                  dd	      }|S )
Nr   )rX   xy)indexingr   r   .r   .r   rb   )r'   r(   rY   meshgridstackr   )r?  r@  x_coordinatesy_coordinatesxxyybox_coordinatess          r*   !normalize_grid_corner_coordinatesz9Owlv2ForObjectDetection.normalize_grid_corner_coordinates_  s     Q(9A(=U]]SQ(:Q(>emmT}tLB  ++r2hB7#44#55 *..r15r,   r  c                 R    |j                         }| j                  |      }|d   }|S )a#  Predicts the probability that each image feature token is an object.

        Args:
            image_features (`torch.FloatTensor` of shape `(batch_size, num_patches, hidden_dim)`)):
                Features extracted from the image.
        Returns:
            Objectness scores.
        r  )detachr  )rE   r  r   s      r*   objectness_predictorz,Owlv2ForObjectDetection.objectness_predictorq  s4     (..0 00@-f5  r,   c                    | j                  ||      }t        j                  |dd      }t        j                  |dz         t        j                  | dz         z
  }t        j
                  |d      }|dxx   |z  cc<   |dxx   |z  cc<   t        j                  |dz         t        j                  | dz         z
  }t        j                  ||gd      }|S )Nr  g      ?g-C6?r  r  r   r   )r  r'   cliploglog1p	full_liker   )rE   r?  r@  r  box_coord_biasbox_sizebox_size_biasr=  s           r*   r>  z(Owlv2ForObjectDetection.compute_box_bias  s    @@ASUfg**_c3? ?T#9:U[[/IY\`I`=aa ??>37--..		(T/2U[[(TAQ5RR 99nm<"Er,   image_featsfeature_mapr   c                     | j                  |      }|r$|j                  \  }}}}| j                  ||      }n| j                  }|j	                  |j
                        }||z  }| j                  |      }|S )a  
        Args:
            image_feats:
                Features extracted from the image, returned by the `image_text_embedder` method.
            feature_map:
                A spatial re-arrangement of image_features, also returned by the `image_text_embedder` method.
            interpolate_pos_encoding:
                Whether to interpolate the pre-trained position encodings.
        Returns:
            pred_boxes:
                List of predicted boxes (cxcywh normalized to 0, 1) nested within a dictionary.
        )r  r   r>  r=  rl  r$   r  )	rE   r  r  r   r   r   r?  r@  r=  s	            r*   box_predictorz%Owlv2ForObjectDetection.box_predictor  s|    & ]];/
 $:E:K:K7A!#4a,,-?ARSH}}H;;{112h
\\*-
r,   Nr  r  c                 6    | j                  |||      \  }}||fS )a8  
        Args:
            image_feats:
                Features extracted from the `image_text_embedder`.
            query_embeds:
                Text query embeddings.
            query_mask:
                Must be provided with query_embeddings. A mask indicating which query embeddings are valid.
        )r  )rE   r  r  r  r  r  s         r*   class_predictorz'Owlv2ForObjectDetection.class_predictor  s)     -1OOKWa,b)(/00r,   r   r   r   r   c                     | j                   d||||d|}|rX|j                  \  }}}}	|| j                  j                  j                  z  }
|	| j                  j                  j                  z  }n| j
                  }
| j                  }|j                  d   }| j                   j                  j                  |      }t        j                  |d d d dd d f   |d d d df   j                        }|d d dd d d f   |z  }| j                  |      }|j                  d   |
||j                  d   f}|j                  |      }|d   }|||fS )N)r   r   r   r   r   r   r   rT   )r&  r   r   r  r   r?  r@  r;   r  r  r'   broadcast_tor  r   )rE   r   r   r   r   r   outputsr   r   r   r?  r@  r]  r9   class_token_outnew_sizer8   s                    r*   image_text_embedderz+Owlv2ForObjectDetection.image_text_embedder  sy    $** 
%)%=	

 
 $"."4"4Aq&%!'4;;+D+D+O+O!O %)B)B)M)M M!%!8!8 $ 6 6 $77:zz..==>OP  ,,\!RaR(-C\RSUXVXUXRXEYE_E_` $Aqr1H-?|4 q!r"	
 $++H5bk\733r,   c                     | j                   j                  d||d|}|rX|j                  \  }}}}|| j                  j                  j
                  z  }|| j                  j                  j
                  z  }	n| j                  }| j                  }	|d   }
| j                   j                  j                  |
      }t        j                  |d d d dd d f   |d d d df   j                        }|d d dd d d f   |z  }| j                  |      }|j                  d   ||	|j                  d   f}|j                  |      }||fS )Nr  r   r   r   rT   )r&  r  r   r   r  r   r?  r@  r  r'   r  r  r   )rE   r   r   r   r  r   r   r   r?  r@  r]  r9   r  r  s                 r*   image_embedderz&Owlv2ForObjectDetection.image_embedder  sg    6MTZZ5L5L 6
%@X6
\b6
 $"."4"4Aq&%!'4;;+D+D+O+O!O %)B)B)M)M M!%!8!8 $ 6 6 +1-zz..==>OP  ,,\!RaR(-C\RSUXVXUXRXEYE_E_` $Aqr1H-?|4 q!r"	
 $++H5n--r,   query_image_featuresquery_feature_mapc                 j   | j                  |      \  }}| j                  |||      }t        |      }g }g }	|j                  }
t	        |j
                  d         D ]  }t        j                  g dg|
      }||   }t        ||      \  }}t        j                  |d   dk(        rt        ||      }t        j                  |      dz  }|d   |k\  j                         }|j                         s||   |j                  d         }t        j                  ||   d      }t        j                   d||      }|t        j"                  |         }|j%                  ||   |          |	j%                  |       " |r+t        j&                  |      }t        j&                  |	      }nd	\  }}|||fS )
Nr   )r   r   r   r   r#   r  g?r   )axiszd,id->ir{  )r  r  r   r$   rY  r   r'   r  rs   ru   rz   rg   nonzeronumelsqueezer+  r  argminappendr  )rE   r  r  r   r   r   r   pred_boxes_as_cornersbest_class_embedsbest_box_indicespred_boxes_deviceieach_query_boxeach_query_pred_boxesiousiou_thresholdselected_indsselected_embeddingsmean_embedsmean_simbest_box_indr  box_indicess                          r*   embed_image_queryz)Owlv2ForObjectDetection.embed_image_query#  s    ../CD<''(<>OQij
 8 D 188+11!45 	6A"\\<.ARSN$9!$<!n.CDGD! yyaC(*>;PQ "IIdOc1M!!W5>>@M""$&21om6K6KA6N&O##jjaqA <<	;@ST,U\\(-CD!((a)FG ''5'	6*  ;;'89L++&67K(2%L+[*44r,   query_pixel_valuesc           
         | j                  ||      d   } | j                   d||d|\  }}|j                  \  }}	}
}t        j                  |||	|
z  |f      }|j                  \  }}	}
}t        j                  |||	|
z  |f      }| j	                  |||      \  }}}| j                  ||      \  }}| j                  |||      }t        ||||||d|      S )a  
        query_pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
            Pixel values of query image(s) to be detected. Pass in one query image per target image.

        Examples:
        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> import torch
        >>> from transformers import AutoProcessor, Owlv2ForObjectDetection

        >>> processor = AutoProcessor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> query_url = "http://images.cocodataset.org/val2017/000000001675.jpg"
        >>> with httpx.stream("GET", query_url) as response:
        ...     query_image = Image.open(BytesIO(response.read()))
        >>> inputs = processor(images=image, query_images=query_image, return_tensors="pt")

        >>> # forward pass
        >>> with torch.no_grad():
        ...     outputs = model.image_guided_detection(**inputs)

        >>> target_sizes = torch.Tensor([image.size[::-1]])

        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> results = processor.post_process_image_guided_detection(
        ...     outputs=outputs, threshold=0.9, nms_threshold=0.3, target_sizes=target_sizes
        ... )
        >>> i = 0  # Retrieve predictions for the first image
        >>> boxes, scores = results[i]["boxes"], results[i]["scores"]
        >>> for box, score in zip(boxes, scores):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(f"Detected similar object with confidence {round(score.item(), 3)} at location {box}")
        Detected similar object with confidence 0.938 at location [327.31, 54.94, 547.39, 268.06]
        Detected similar object with confidence 0.959 at location [5.78, 360.65, 619.12, 366.39]
        Detected similar object with confidence 0.902 at location [2.85, 360.01, 627.63, 380.8]
        Detected similar object with confidence 0.985 at location [176.98, -29.45, 672.69, 182.83]
        Detected similar object with confidence 1.0 at location [6.53, 14.35, 624.87, 470.82]
        Detected similar object with confidence 0.998 at location [579.98, 29.14, 615.49, 489.05]
        Detected similar object with confidence 0.985 at location [206.15, 10.53, 247.74, 466.01]
        Detected similar object with confidence 0.947 at location [18.62, 429.72, 646.5, 457.72]
        Detected similar object with confidence 0.996 at location [523.88, 20.69, 586.84, 483.18]
        Detected similar object with confidence 0.998 at location [3.39, 360.59, 617.29, 499.21]
        Detected similar object with confidence 0.969 at location [4.47, 449.05, 614.5, 474.76]
        Detected similar object with confidence 0.966 at location [31.44, 463.65, 654.66, 471.07]
        Detected similar object with confidence 0.924 at location [30.93, 468.07, 635.35, 475.39]
        ```r  r   )r  r  N)r9   r   r   r   r    r   r:   r;   rT   )r  r   r'   r   r  r  r  r   )rE   r   r  r   r   r  r  r  r   r?  r@  
hidden_dimr  query_image_featsr  r  r   r  r   r   s                       r*   image_guided_detectionz.Owlv2ForObjectDetection.image_guided_detectionO  sH   | !//+F^ 0 

 ':d&9&9 '
%%='
 '
#^ ITHYHYE
&(9:mmK*>PSd>dfp1qrHYH_H_E
&(9:!MM
,>AR,RT^_
 <@;Q;Q02J<
8&(8
 '+&:&:{am&:&n#l !..{KIab4$0/-%" .	
 		
r,   c           
          | j                   d||||d|\  }}}|j                  }	|j                  }
|j                  \  }}}}t	        j
                  ||||z  |f      }|j                  d   |z  }|j                  |||j                  d         }|j                  |||j                  d         }|d   dkD  }| j                  |||      \  }}| j                  |      }| j                  |||      }t        |||||||	|
      S )a	  
        input_ids (`torch.LongTensor` of shape `(batch_size * num_max_text_queries, sequence_length)`, *optional*):
            Indices of input sequence tokens in the vocabulary. Indices can be obtained using [`AutoTokenizer`]. See
            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details. [What are input
            IDs?](../glossary#input-ids).

        Examples:
        ```python
        >>> import httpx
        >>> from io import BytesIO
        >>> from PIL import Image
        >>> import torch

        >>> from transformers import Owlv2Processor, Owlv2ForObjectDetection

        >>> processor = Owlv2Processor.from_pretrained("google/owlv2-base-patch16-ensemble")
        >>> model = Owlv2ForObjectDetection.from_pretrained("google/owlv2-base-patch16-ensemble")

        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
        >>> with httpx.stream("GET", url) as response:
        ...     image = Image.open(BytesIO(response.read()))
        >>> text_labels = [["a photo of a cat", "a photo of a dog"]]
        >>> inputs = processor(text=text_labels, images=image, return_tensors="pt")
        >>> outputs = model(**inputs)

        >>> # Target image sizes (height, width) to rescale box predictions [batch_size, 2]
        >>> target_sizes = torch.tensor([(image.height, image.width)])
        >>> # Convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
        >>> results = processor.post_process_grounded_object_detection(
        ...     outputs=outputs, target_sizes=target_sizes, threshold=0.1, text_labels=text_labels
        ... )
        >>> # Retrieve predictions for the first image for the corresponding text queries
        >>> result = results[0]
        >>> boxes, scores, text_labels = result["boxes"], result["scores"], result["text_labels"]
        >>> for box, score, text_label in zip(boxes, scores, text_labels):
        ...     box = [round(i, 2) for i in box.tolist()]
        ...     print(f"Detected {text_label} with confidence {round(score.item(), 3)} at location {box}")
        Detected a photo of a cat with confidence 0.614 at location [341.67, 23.39, 642.32, 371.35]
        Detected a photo of a cat with confidence 0.665 at location [6.75, 51.96, 326.62, 473.13]
        ```)r   r   r   r   r   r   r  )r9   r8   r   r    r   r   r:   r;   rT   )
r  r:   r;   r   r'   r   r  r  r  r}   )rE   r   r   r   r   r   r  r  r  r  r  r   r?  r@  r  r  max_text_queriesr  r  r   r   r   s                         r*   r   zOwlv2ForObjectDetection.forward  sY   f .FT-E-E .
%)%=	.

 .
*k7 00 44HSHYHYE
&(9:mmK*>PSd>dfp1qr %??1-;#++J8H,J\J\]_J`a %%j2BIOOTVDWX	v&*
 '+&:&:;V`&a#l !55kB ''[BZ[
)$$!/%* .	
 		
r,   r   r{  rW  )rN   rO   rP   r   rS   r   staticmethodr^   r'   r   r  rR   r  r>  r   r  rJ   r  r   r   r  r  r  r   r   r   r
  r}   r   r   r   s   @r*   r<  r<  H  s   { ( c VY ^c^j^j   !53D3D !IZIZ !3 3 SXS_S_ . */	&& && #'	
 
		J 26*.	1&&1 ''$.1 LL4'	1
 
u  	!12 */-4<<-4 ''-4 	-4
 #'-4 +,-4 
u  	!-4f */'.'''. #''. +,	'.
 
u  	!'.\ */	*5#//*5 !,,*5 #'	*5
 
		*5X  8<).	`
''`
 "--4`
 #'	`

 +,`
 
/`
  `
D 
 /3).Z
<<Z
 ''Z
 t+	Z

 #'Z
 +,Z
 
$Z
  Z
r,   r<  )r4  r%  rq  r  r<  )Nr  )LrQ   collections.abcr   dataclassesr   typingr   r'   r   r    r	   r/  activationsr
   masking_utilsr   modeling_layersr   modeling_outputsr   r   modeling_utilsr   r   processing_utilsr   utilsr   r   r   r   r   r   utils.genericr   r   utils.output_capturingr   configuration_owlv2r   r   r   transformers.image_transformsr   
get_loggerrN   loggerr+   r2   r4   r_   rc   rs   rz   r}   r   rS  r   r   r[   r   r   r  r  r%  rU  r`  rq  r}  r  r4  r  r  r<  __all__rT   r,   r*   <module>r      s    $ !    & ! / 9 K F &  J 5 P P F 
		H	%`U\\ `ell `
-5<< -ELL - !
+ !
  !
JGv G& GEF Ev E""'0 
/
 /
 /
d *
K *
 *
\JBII J\")) L !%II%<<% 
% <<	%
 LL4'% T\% % '(%:6)RYY 6)tryy  2 B >)? >) >)D-
299 -
b<
/ <
@/
) /
f(
1 (
X.
+ .
b k
% k
 k
^RYY (-1ryy -1`G
2 G
T rr,   